# Task 2: Recommendation Engine

### Setting up the Notebook

In [1]:
import numpy as np
import pandas as pd

from src.utils import read_csv
from src.data_preprocessor import DataPreprocessor

from src.recommendation_utils import get_recommendation_weights

In [2]:
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

### Load the Data

In [3]:
trainX, trainY = read_csv('data/train.csv', ylabel='price')
data_preprocessor = DataPreprocessor()
trainX, trainY = data_preprocessor.fit_transform_for_recommendations(trainX, trainY)

df = pd.concat([trainX, trainY], axis=1)

## Recommendation Setup

### Features for Property Similarity

| Feature | Description | Smallest Value | Largest Value | Continuous |
| ------- | ----------- | - | - | ---------- |
| subzone | Boolean variable to represent if the properties have the same subzone | 0 : Different subzones | 1 : Same subzone | No |
| planning_area | Boolean variable to represent if the properties have the same planning area | 0 : Different planning areas | 0.5 : Same planning area | No |
| price | Similarity score between prices (Gaussian kernel on top of L1 distance) | 0 | 1 : Same price | Yes |
| property_type | Boolean variable to represent if the properties have the same property type | 0 : Different property type | 1 : Same property type | No |
| num_beds | Similarity score between number of beds (Gaussian kernel on top of L1 distance) | -1 : Lesser number of bedrooms | 1 : Same number of bedrooms | Yes |
| num_beds | Boolean variable to represent if the properties have the same number of bedrooms | 0 : Different number of bedrooms | 1 : Same number of bedrooms | No |
| price_per_size_sqft | Similarity score between the price per size sqft (Gaussian kernel on top of L1 distance) | 0 | 1 : Same property size | Yes |
| floor_level | Boolean variable to represent if the properties are at the same floor level | 0 : Different floor levels, or floor level of input row is NaN | 1 : Same floor level | No |
| furnishing | Boolean variable to represent if the properties have the same level of furnishing | 0 : Different levels of furnishing, or furnishing of input row is unspecified | 1 : Same level of furnishing | No |
| tenure_left | Similarity score between the number of years left in the tenure lease (Gaussian kernel on top of L1 distance) | 0 | 1 : Same numbers of years left | Yes |

### User Input and Preferences

<img src="images/99co_recommendation.jpg" style="width: 700px;"/>

[99.co](99.co) shows four possible priorities that the user can provide to sort 'similar listings'. We adapt the same in our setup. While the `get_top_recommendations` is capable of handling highly nuanced weightage of each feature, we provide xxx pre-defined settings below. Uncomment any one to continue, or make no changes to see the universal recommendations.

In [15]:
feature_list = ['subzone', 'planning_area', 'price',
                'property_type', 'num_beds', 'num_baths', 'price_per_sqft',
                'floor_level', 'furnishing', 'tenure_left']
feature_weightage = {k:1 for k in feature_list}

########## Prioritize 'nearby' property ##########
feature_weightage['subzone'] = 10
feature_weightage['planning_area'] = 10

########## Prioritize units with similar 'price' ##########
# feature_weightage['price'] = 10

########## Prioritize units with similar 'price_per_sqft' ##########
# feature_weightage['price_per_sqft'] = 10
# feature_weightage['price'] = 5

########## Prioritize units with similar 'property_type' ##########
# feature_weightage['property_type'] = 10
# feature_weightage['num_beds'] = 5
# feature_weightage['num_baths'] = 5


## Computing the Top Recommendations

In [5]:
np.random.seed(hash("99.co")%100)

In [6]:
def get_top_recommendations(row, df, feature_weightage, k=3):

    ######## Remove input from dataframe (To stop recommending the input itself)
    df = df.drop(row.name)
    df_weights = np.maximum(get_recommendation_weights(row, df, feature_weightage), 0)

    df_properties = pd.concat([df, pd.DataFrame(df_weights, columns=['recommendation_score'])], axis=1).sort_values('recommendation_score', ascending=True)
    df_property_groups = df_properties.groupby(['property_name']).agg({'recommendation_score': ['max', 'idxmax']})

    df_weights = df_property_groups[('recommendation_score', 'max')]
    df_index = df_property_groups[('recommendation_score', 'idxmax')]
    ######## Filter Out Top Properties (More Than Requested)
    top_property_indices = df_weights.argsort()[-5*k:]

    # ######## Introduce Randomness in Choice From The Top Recommendations
    prob = df_weights[top_property_indices]
    prob = prob/prob.sum()
    rec_indices = df_index[np.random.choice(df_property_groups.index[top_property_indices], size=k, replace=False, p=prob)]

    return np.array(sorted(rec_indices, key=lambda k: (df_properties['recommendation_score'][k], k), reverse=True))

## Testing the Recommendation Engine

### Pick a Sample Listing as Input

In [13]:
# Pick a row id of choice
row_id = 10
row_id = 20
#row_id = 30
#row_id = 40
#row_id = 50

# Get the row from the dataframe (an valid row ids will throw an error)
row = df.iloc[row_id]

# Just for printing it nicely, we create a new dataframe from this single row
pd.DataFrame([row])

Unnamed: 0,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,total_num_units,lat,lng,subzone,planning_area,tenure_duration,is_freehold,tenure_left,price_per_sqft,price
20,1 silat avenue,avenue south residence,condo,99-year leasehold,2023.0,4.0,4.0,1496,,unspecified,1074.0,1.304855,103.773776,dover,queenstown,99.0,False,99.0,1824.86631,2730000.0


### Compute and Display the recommendations

In [16]:
k = 10

recommendation_list = get_top_recommendations(row, df, feature_weightage, k=k)

df.iloc[recommendation_list]

Unnamed: 0,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,total_num_units,lat,lng,subzone,planning_area,tenure_duration,is_freehold,tenure_left,price_per_sqft,price
841,13 silat avenue,avenue south residence,condo,99-year leasehold,2023.0,4.0,4.0,1496,,unspecified,1074.0,1.304855,103.773776,dover,queenstown,99.0,False,99.0,1857.152406,2778300.0
2513,buona vista / west coast / clementi (d5),dover ville,hdb,99-year leasehold,2003.0,3.0,2.0,936,,unspecified,,1.30779,103.783725,dover,queenstown,99.0,False,80.0,794.230769,743400.0
776,5 holland village way,one holland village residences / one holland v...,condo,99-year leasehold,2025.0,4.0,4.0,2088,,unspecified,296.0,1.311628,103.793885,holland drive,queenstown,99.0,False,99.0,1961.206897,4095000.0
14190,360 pasir panjang road,gold coast condominium,condo,freehold,1994.0,4.0,3.0,1895,,unspecified,67.0,1.290663,103.774857,pasir panjang 1,queenstown,10000.0,True,9972.0,1872.823219,3549000.0
9375,33 rochester drive,the rochester residences,apartment,99-year leasehold,2011.0,4.0,3.0,2271,penthouse,unspecified,334.0,1.305172,103.788336,one north,queenstown,99.0,False,88.0,1954.821664,4439400.0
4471,west coast park,hong leong garden,semi-detached house,956-year leasehold,,4.0,3.0,4198,,unspecified,,1.288358,103.773271,port,queenstown,956.0,False,956.0,1625.774178,6825000.0
7670,tanglin / holland (d10),holland road shopping centre,semi-detached house,,,5.0,6.0,5700,,unspecified,,1.310413,103.795475,holland drive,queenstown,,False,,1492.105263,8505000.0
15685,20 margaret drive,margaret ville,condo,99-year leasehold,2021.0,2.0,2.0,700,,unspecified,309.0,1.296092,103.810835,margaret drive,queenstown,99.0,False,98.0,2399.714286,1679800.0
8426,463 pasir panjang road,village @ pasir panjang,condo,freehold,2016.0,3.0,2.0,1012,low,unspecified,148.0,1.291129,103.770138,pasir panjang 1,queenstown,10000.0,True,9994.0,1826.086957,1848000.0
905,41 pasir panjang hill,horizon residences,condo,freehold,2014.0,3.0,2.0,1356,low,unspecified,72.0,1.281934,103.784142,pasir panjang 1,queenstown,10000.0,True,9992.0,2094.616519,2840300.0
