# Task 2: Recommendation Engine

### Setting up the Notebook

In [1]:
import numpy as np
import pandas as pd

from src.utils import read_csv
from src.data_preprocessor import DataPreprocessor
from src.preprocessor_utils import remove_columns, convert_to_lowercase

from src.recommendation_utils import get_recommendation_weights

In [2]:
# Enable IPython's autoreload extension so that edits to imported external
# Python modules are picked up without restarting the kernel; see
# http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

### Load the Data

In [3]:
# Read the training listings; read_csv is asked to treat 'price' as the label
# column and hands back two objects (presumably features and prices -- confirm
# against src.utils.read_csv).
raw_features, raw_prices = read_csv('data/train.csv', ylabel='price')

# Run the recommendation-specific preprocessing on both parts.
data_preprocessor = DataPreprocessor()
trainX, trainY = data_preprocessor.fit_transform_for_recommendations(
    raw_features,
    raw_prices,
)

# Single frame holding the processed features side by side with the label.
df = pd.concat([trainX, trainY], axis='columns')

## Recommendation Setup

### Features for Property Similarity

| Feature | Description | Smallest Value | Largest Value | Continuous |
| ------- | ----------- | - | - | ---------- |
| subzone | Boolean variable to represent if the properties have the same subzone | 0 : Different subzones | 1 : Same subzone | No |
| planning_area | Boolean variable to represent if the properties have the same planning area | 0 : Different planning areas | 0.5 : Same planning area | No |
| price | Similarity score between prices (Gaussian kernel on top of L1 distance) | 0 | 1 : Same price | Yes |
| property_type | Boolean variable to represent if the properties have the same property type | 0 : Different property type | 1 : Same property type | No |
| num_beds | Boolean variable to represent if the properties have the same number of bedrooms | 0 : Different number of bedrooms | 1 : Same number of bedrooms | No |
| size_sqft | Similarity score between property size (Gaussian kernel on top of L1 distance) | 0 | 1 : Same property size | Yes |
| floor_level | Boolean variable to represent if the properties are at the same floor level | 0 : Different floor levels, or floor level of input row is NaN | 1 : Same floor level | No |
| furnishing | Boolean variable to represent if the properties have the same level of furnishing | 0 : Different levels of furnishing, or furnishing of input row is unspecified | 1 : Same level of furnishing | No |
| tenure_left | Similarity score between the number of years left in the tenure lease (Gaussian kernel on top of L1 distance) | 0 | 1 : Same number of years left | Yes |

### User Input and Preferences

<img src="images/99co_recommendation.jpg" style="width: 700px;"/>

[99.co](https://www.99.co) shows four possible priorities that the user can choose from to sort 'similar listings', and we adopt the same idea in our setup. While `get_top_recommendations` can handle a finely tuned weight for every feature, we provide a few pre-defined settings below. Uncomment any one of them to continue, or make no changes to see the universal (equally weighted) recommendations.

In [4]:
# Features that take part in the similarity score between two listings.
feature_list = [
    'subzone',
    'planning_area',
    'price',
    'property_type',
    'num_beds',
    'size_sqft',
    'floor_level',
    'furnishing',
    'tenure_left',
]

# Baseline preference: every feature carries the same weight of 1.
feature_weightage = dict.fromkeys(feature_list, 1)

########## Prioritize 'nearby' property ##########
# feature_weightage['subzone'] = 10
# feature_weightage['planning_area'] = 10

########## Prioritize units with similar 'price' ##########
# feature_weightage['price'] = 10

########## And a few more ##########

## Computing the Top Recommendations

In [5]:
# Fixed seed so that the randomised sampling of recommendations is repeatable
# across kernel restarts. The built-in hash() of a str is salted per
# interpreter process (see PYTHONHASHSEED), so deriving the seed from
# hash("99.co") yielded a different seed on every fresh kernel.
RANDOM_SEED = 99
np.random.seed(RANDOM_SEED)

In [6]:
def get_top_recommendations(row, df, feature_weightage, k=3, pool_multiplier=5):
    """Sample up to ``k`` recommended listings for ``row`` from ``df``.

    The input listing is removed from the candidates, every remaining listing
    is scored with ``get_recommendation_weights``, the ``pool_multiplier * k``
    highest-scoring listings form a pool, and the recommendations are drawn
    from that pool without replacement, with probability proportional to
    their score.

    Parameters
    ----------
    row : pandas.Series
        The input listing; ``row.name`` must be a label present in ``df.index``.
    df : pandas.DataFrame
        All listings, including the input one.
    feature_weightage : dict
        Per-feature weights, forwarded unchanged to
        ``get_recommendation_weights``.
    k : int, default 3
        Number of recommendations requested.
    pool_multiplier : int, default 5
        The candidate pool holds ``pool_multiplier * k`` top-scoring listings.

    Returns
    -------
    numpy.ndarray
        Index *labels* of ``df`` (use ``df.loc[...]`` to look them up). Fewer
        than ``k`` labels are returned when not enough candidates with a
        non-zero score are available.
    """
    if k <= 0:
        return np.array([], dtype=df.index.dtype)

    ######## Remove input from dataframe (To stop recommending the input itself)
    candidates = df.drop(row.name)

    ######## Get Recommendation Weights For Each Property
    # Force a plain ndarray so that all indexing below is positional, no matter
    # whether the helper returns an ndarray or a label-indexed Series.
    weights = np.asarray(
        get_recommendation_weights(row, candidates, feature_weightage),
        dtype=float,
    )

    ######## Filter Out Top Properties (More Than Requested)
    pool_positions = weights.argsort()[-pool_multiplier * k:]
    pool_weights = weights[pool_positions]

    ######## Introduce Randomness in Choice From The Top Recommendations
    total = pool_weights.sum()
    if total == 0:
        # All-zero scores cannot be normalised; fall back to a uniform draw.
        prob = None
        size = min(k, len(pool_positions))
    else:
        prob = pool_weights / total
        # Sampling without replacement cannot return more items than there
        # are entries with non-zero probability.
        size = min(k, int(np.count_nonzero(prob)))

    if size == 0:
        return np.array([], dtype=candidates.index.dtype)

    rec_indices = np.random.choice(
        candidates.index[pool_positions], size=size, replace=False, p=prob
    )
    return np.array(rec_indices)

## Testing the Recommendation Engine

### Pick a Sample Listing as Input

In [7]:
# Pick a row id of choice
row_id = 10
#row_id = 20
#row_id = 30
#row_id = 40
#row_id = 50

# Get the row from the dataframe by position (invalid row ids will throw an error)
row = df.iloc[row_id]

# Just for printing it nicely, we create a new dataframe from this single row
pd.DataFrame([row])

Unnamed: 0,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,total_num_units,lat,lng,subzone,planning_area,tenure_duration,is_freehold,price
10,299 bedok south avenue 3,bedok court,condo,99-year leasehold,1985.0,2.0,,1733,high,unspecified,280.0,1.322153,103.945223,bedok south,bedok,99,False,2205000.0


### Compute and Display the recommendations

In [8]:
# Number of recommendations to display.
k = 3

recommendation_list = get_top_recommendations(row, df, feature_weightage, k=k)

# get_top_recommendations samples from df.index, i.e. it returns index
# *labels*, so they must be looked up with .loc. Positional .iloc only gives
# the same rows while the index happens to be 0..n-1.
df.loc[recommendation_list]

Unnamed: 0,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,total_num_units,lat,lng,subzone,planning_area,tenure_duration,is_freehold,price
12579,18 bedok north drive,bedok residences,condo,99-year leasehold,2015.0,2.0,2.0,883,high,partial,583.0,1.321972,103.946825,bedok south,bedok,99,False,1470000.0
7985,2 bedok rise,the glades,condo,99-year leasehold,2017.0,2.0,2.0,840,,unspecified,726.0,1.326567,103.947897,bedok south,bedok,99,False,1312500.0
14424,24 bedok north drive,bedok residences,condo,99-year leasehold,2015.0,2.0,2.0,764,,fully,583.0,1.321972,103.946825,bedok south,bedok,99,False,1312500.0
