# Task 2: Recommendation Engine

### Setting up the Notebook

In [1]:
import numpy as np
import pandas as pd

from src.utils import read_csv
from src.data_preprocessor import DataPreprocessor
from src.preprocessor_utils import remove_columns, convert_to_lowercase

from src.recommendation_utils import get_recommendation_weights

In [2]:
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

### Load the Data

In [3]:
trainX, trainY = read_csv('data/train.csv', ylabel='price')
data_preprocessor = DataPreprocessor()
trainX, trainY = data_preprocessor.fit_transform_for_recommendations(trainX, trainY)

df = pd.concat([trainX, trainY], axis=1)

## Recommendation Setup

### Features for Property Similarity

| Feature | Description | Smallest Value | Largest Value | Continuous |
| ------- | ----------- | - | - | ---------- |
| subzone | Boolean variable to represent if the properties have the same subzone | 0 : Different subzones | 1 : Same subzone | No |
| planning_area | Boolean variable to represent if the properties have the same planning area | 0 : Different planning areas | 0.5 : Same planning area | No |
| price | Similarity score between prices (Gaussian kernel on top of L1 distance) | 0 | 1 : Same price | Yes |
| property_type | Boolean variable to represent if the properties have the same property type | 0 : Different property type | 1 : Same property type | No |
| num_beds | Boolean variable to represent if the properties have the same number of bedrooms | 0 : Different number of bedrooms | 1 : Same number of bedrooms | No |
| size_sqft | Similarity score between property size (Gaussian kernel on top of L1 distance) | 0 | 1 : Same property size | Yes |
| floor_level | Boolean variable to represent if the properties are at the same floor level | 0 : Different floor levels, or floor level of input row is NaN | 1 : Same floor level | No |
| furnishing | Boolean variable to represent if the properties have the same level of furnishing | 0 : Different levels of furnishing, or furnishing of input row is unspecified | 1 : Same level of furnishing | No |
| tenure_left | Similarity score between the number of years left in the tenure lease (Gaussian kernel on top of L1 distance) | 0 | 1 : Same numbers of years left | Yes |

### User Input and Preferences

<img src="images/99co_recommendation.jpg" style="width: 700px;"/>

[99.co](99.co) shows four possible priorities that the user can provide to sort 'similar listings'. We adapt the same in our setup. While the `get_top_recommendations` is capable of handling highly nuanced weightage of each feature, we provide xxx pre-defined settings below. Uncomment any one to continue, or make no changes to see the universal recommendations.

In [4]:
feature_list = ['subzone', 'planning_area', 'price',
                'property_type', 'num_beds', 'num_baths', 'price_per_sqft',
                'floor_level', 'furnishing', 'tenure_left']
feature_weightage = {k:1 for k in feature_list}

########## Prioritize 'nearby' property ##########
# feature_weightage['subzone'] = 10
# feature_weightage['planning_area'] = 10

########## Prioritize units with similar 'price' ##########
# feature_weightage['price'] = 10

########## Prioritize units with similar 'size_sqft' ##########
feature_weightage['price_per_sqft'] = 1

########## Prioritize units with similar 'property_type' ##########
feature_weightage['property_type'] = 1

feature_weightage['num_beds'] = 5


## Computing the Top Recommendations

In [5]:
np.random.seed(hash("99.co")%100)

In [6]:
def get_top_recommendations(row, df, feature_weightage, k=3):

    ######## Remove input from dataframe (To stop recommending the input itself)
    df = df.drop(row.name)
    
    ######## Get Recommendation Weights For Each Property
    df_weights = np.maximum(get_recommendation_weights(row, df, feature_weightage), 0)

    ######## Filter Out Top Properties (More Than Requested)
    top_property_indices = df_weights.argsort()[-5*k:]
    
    ######## Introduce Randomness in Choice From The Top Recommendations
    prob = df_weights[top_property_indices]
    prob = prob/prob.sum()
    rec_indices = np.sort(np.random.choice(df.index[top_property_indices], size=k, replace=False, p=prob))
    return np.array(sorted(rec_indices, key=lambda k: (df_weights[k], k), reverse=True))

In [7]:
def get_top_property_recommendations(row, df, feature_weightage, k=3):

    ######## Remove input from dataframe (To stop recommending the input itself)
    df = df.drop(row.name)
    df_weights = np.maximum(get_recommendation_weights(row, df, feature_weightage), 0)

    df_properties = pd.concat([df, pd.DataFrame(df_weights, columns=['recommendation_score'])], axis=1).sort_values('recommendation_score', ascending=True)
    df_property_groups = df_properties.groupby(['property_name', 'property_type']).agg({'recommendation_score': ['max', 'idxmax']})

    df_weights = df_property_groups[('recommendation_score', 'max')]
    df_index = df_property_groups[('recommendation_score', 'idxmax')]
    ######## Filter Out Top Properties (More Than Requested)
    top_property_indices = df_weights.argsort()[-5*k:]

    # ######## Introduce Randomness in Choice From The Top Recommendations
    prob = df_weights[top_property_indices]
    prob = prob/prob.sum()
    rec_indices = df_index[np.random.choice(df_property_groups.index[top_property_indices], size=k, replace=False, p=prob)]

    return np.array(sorted(rec_indices, key=lambda k: (df_properties['recommendation_score'][k], k), reverse=True))

## Testing the Recommendation Engine

### Pick a Sample Listing as Input

In [8]:
# Pick a row id of choice
row_id = 10
row_id = 20
row_id = 30
row_id = 40
#row_id = 50

# Get the row from the dataframe (an valid row ids will throw an error)
row = df.iloc[row_id]

# Just for printing it nicely, we create a new dataframe from this single row
pd.DataFrame([row])

Unnamed: 0,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,total_num_units,lat,lng,subzone,planning_area,tenure_duration,is_freehold,tenure_left,price_per_sqft,price
40,bukit batok / bukit panjang / choa chu kang (d23),segar meadows,hdb,99-year leasehold,2012.0,3.0,2.0,1000,,unspecified,,1.387053,103.772288,fajar,bukit panjang,99.0,False,89.0,577.5,577500.0


### Compute and Display the recommendations

In [9]:
k = 10

recommendation_list = get_top_recommendations(row, df, feature_weightage, k=k)

df.iloc[recommendation_list]

Unnamed: 0,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,total_num_units,lat,lng,subzone,planning_area,tenure_duration,is_freehold,tenure_left,price_per_sqft,price
8211,bukit batok / bukit panjang / choa chu kang (d23),fajar hills,hdb,99-year leasehold,2016.0,3.0,2.0,1216,,unspecified,,1.380554,103.770441,fajar,bukit panjang,99.0,False,93.0,629.358553,765300.0
12373,bukit batok / bukit panjang / choa chu kang (d23),segar gardens,hdb,99-year leasehold,2002.0,3.0,2.0,1184,mid,unspecified,,1.387053,103.772288,fajar,bukit panjang,99.0,False,79.0,478.885135,567000.0
5032,bukit batok / bukit panjang / choa chu kang (d23),segar gardens,hdb,99-year leasehold,2003.0,3.0,2.0,1194,,unspecified,,1.387053,103.772288,fajar,bukit panjang,99.0,False,80.0,466.080402,556500.0
12250,bukit batok / bukit panjang / choa chu kang (d23),segar gardens,hdb,99-year leasehold,2002.0,3.0,2.0,1184,mid,unfurnished,,1.387053,103.772288,fajar,bukit panjang,99.0,False,79.0,443.412162,525000.0
16797,bukit batok / bukit panjang / choa chu kang (d23),422 fajar road,hdb,,1988.0,3.0,3.0,1668,low,unspecified,61.0,1.386072,103.771738,fajar,bukit panjang,,False,,558.992806,932400.0
1046,bukit batok / bukit panjang / choa chu kang (d23),segar gardens,hdb,99-year leasehold,2002.0,4.0,2.0,1378,,unspecified,,1.387053,103.772288,fajar,bukit panjang,99.0,False,79.0,495.283019,682500.0
3850,bukit batok / bukit panjang / choa chu kang (d23),453 fajar road,hdb,,1997.0,4.0,2.0,1528,,unspecified,79.0,1.386072,103.771738,fajar,bukit panjang,,False,,480.366492,734000.0
1340,bukit batok / bukit panjang / choa chu kang (d23),segar vale,hdb,99-year leasehold,2015.0,3.0,2.0,990,,unspecified,,1.388544,103.770148,saujana,bukit panjang,99.0,False,92.0,572.727273,567000.0
16004,bukit batok / bukit panjang / choa chu kang (d23),senja gateway,hdb,99-year leasehold,2015.0,3.0,2.0,1001,,unspecified,,1.386661,103.757903,senja,bukit panjang,99.0,False,92.0,595.804196,596400.0
6814,bukit batok / bukit panjang / choa chu kang (d23),pangshan valley,hdb,99-year leasehold,1998.0,3.0,2.0,1302,,unspecified,,1.384866,103.766629,saujana,bukit panjang,99.0,False,75.0,554.83871,722400.0


In [10]:
recommendation_list = get_top_property_recommendations(row, df, feature_weightage, k=k)

df.iloc[recommendation_list]

Unnamed: 0,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,total_num_units,lat,lng,subzone,planning_area,tenure_duration,is_freehold,tenure_left,price_per_sqft,price
13706,bukit batok / bukit panjang / choa chu kang (d23),419 fajar road,hdb,,1989.0,4.0,2.0,1302,,unspecified,128.0,1.384261,103.769561,fajar,bukit panjang,,False,,580.645161,756000.0
13576,bukit batok / bukit panjang / choa chu kang (d23),segar palmview,hdb,99-year leasehold,2015.0,3.0,2.0,990,,unspecified,,1.3892,103.767235,saujana,bukit panjang,99.0,False,92.0,560.0,554400.0
13680,bukit batok / bukit panjang / choa chu kang (d23),165 gangsa road,hdb,,1998.0,3.0,2.0,1076,,unspecified,143.0,1.378496,103.766708,jelebu,bukit panjang,,False,,584.572491,629000.0
12585,bukit batok / bukit panjang / choa chu kang (d23),629 senja road,hdb,,2004.0,3.0,2.0,1184,high,partial,233.0,1.38653,103.759182,senja,bukit panjang,,False,,576.435811,682500.0
6862,bukit batok / bukit panjang / choa chu kang (d23),606 senja road,hdb,,1999.0,3.0,2.0,1300,,unspecified,164.0,1.387254,103.759783,senja,bukit panjang,,False,,565.384615,735000.0
10468,bukit batok / bukit panjang / choa chu kang (d23),155 gangsa road,hdb,,1998.0,3.0,2.0,1506,,unspecified,96.0,1.378496,103.766708,jelebu,bukit panjang,,False,,584.262948,879900.0
16393,bukit batok / bukit panjang / choa chu kang (d23),656 senja road,hdb,,2001.0,3.0,2.0,1184,,unspecified,76.0,1.387254,103.759783,senja,bukit panjang,,False,,545.439189,645800.0
15126,bukit batok / bukit panjang / choa chu kang (d23),653 senja link,hdb,,2001.0,3.0,2.0,1399,,unspecified,74.0,1.387654,103.762399,senja,bukit panjang,,False,,540.38599,756000.0
4182,bukit batok / bukit panjang / choa chu kang (d23),627 senja road,hdb,,2004.0,3.0,2.0,1184,,fully,234.0,1.38554,103.759515,senja,bukit panjang,,False,,638.513514,756000.0
14173,bukit batok / bukit panjang / choa chu kang (d23),626 senja road,hdb,,2005.0,3.0,2.0,1184,,unspecified,234.0,1.385485,103.759896,senja,bukit panjang,,False,,654.476351,774900.0
