**Alison Glazer**
# Airbnb Pricing - Similar Listing Recommender
Create a tool for Airbnb hosts to find listings that are similar to theirs using unsupervised machine learning methods

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Libraries" data-toc-modified-id="Import-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Libraries</a></span></li><li><span><a href="#Display-Options" data-toc-modified-id="Display-Options-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Display Options</a></span></li><li><span><a href="#Load-the-Data" data-toc-modified-id="Load-the-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load the Data</a></span></li><li><span><a href="#Nearest-Neighbors" data-toc-modified-id="Nearest-Neighbors-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Nearest Neighbors</a></span></li><li><span><a href="#Singular-Value-Decomposition-(Unfinished)" data-toc-modified-id="Singular-Value-Decomposition-(Unfinished)-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Singular Value Decomposition (Unfinished)</a></span></li></ul></div>

## Import Libraries

In [1]:
# Working with Data
import pandas as pd
import numpy as np

# Saving
import pickle

# Machine Learning
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

from sklearn.decomposition import TruncatedSVD
from scipy.linalg import svd

<a id='display'></a>
## Display Options

In [70]:
# Show up to 200 columns when rendering wide DataFrames.
pd.set_option('display.max_columns', 200)

## Load the Data

In [4]:
# Load the pickled listings table.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('data/lax_X_full.pickle', 'rb') as pickle_file:
    X_full = pickle.load(pickle_file)

In [11]:
# Convert price strings (with "$" and "," removed) to floats.
# - regex=False makes "$" and "," literal characters; as a regular expression
#   "$" is an end-of-string anchor, so the intent is spelled out explicitly.
# - The dtype guard keeps the cell idempotent: once the column is numeric,
#   the .str accessor would raise AttributeError on a second run.
if not pd.api.types.is_numeric_dtype(X_full['price']):
    X_full['price'] = (
        X_full['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

In [338]:
# Feature columns fed to the nearest-neighbours model.
# The order matters: the scaler and the model see these columns positionally,
# so any raw input vector must list its values in exactly this order.
cols_nearest_neigh = [
    # Listing attributes
    'accommodates',
    'bathrooms',
    'security_deposit',
    'cleaning_fee',
    'minimum_nights',
    'review_scores_rating',
    'availability_365',
    # Amenity-group columns
    'amen_group_cleaning',
    'amen_group_electronics',
    # Neighbourhood columns
    'neigh_Downtown',
    'neigh_Bel Air/Beverly Crest',
    'neigh_Malibu',
    'neigh_Venice',
    'neigh_West Hollywood',
    'neigh_West Los Angeles',
    'neigh_South LA',
    'neigh_Manhattan Beach',
    'neigh_Hollywood',
    'neigh_Beverly Hills',
    'neigh_Santa Monica',
    'neigh_Marina Del Rey',
    'neigh_Pacific Palisades',
    'neigh_Westwood',
    # Room-type columns
    'room_Entire home/apt',
    'room_Private room',
    'room_Shared room',
    # Property-type columns
    'prop_Hotel',
    'prop_Bed and breakfast',
    'prop_Camper/RV',
    'prop_Guest suite',
    'prop_Hostel',
    'prop_Apartment',
    # Price and size
    'price',
    'bedrooms',
    'beds',
]

# Descriptive columns that are only needed when displaying results.
flask_cols = [
    'listing_url',
    'name',
    'picture_url',
    'neighbourhood',
    'property_type',
    'room_type',
]

# Feature-only view of the listings (just the nearest-neighbour columns).
# NOTE(review): X_test is not referenced again in the visible notebook — confirm it is still needed.
X_test = X_full[cols_nearest_neigh]

In [343]:
# Table used by the Flask app: model features plus the descriptive display columns.
X_flask = X_full[cols_nearest_neigh+flask_cols]

# Save it to disk so it can be loaded later.
with open("data/lax_X_flask.pickle", "wb") as out_file:
    pickle.dump(X_flask, out_file)

In [348]:
# Standardise the feature columns and keep the result as a DataFrame that
# shares X_full's index and the feature column names.
sim_scaler = StandardScaler()
X_full_scaled = pd.DataFrame(
    data=sim_scaler.fit_transform(X_full[cols_nearest_neigh].values),
    index=X_full.index,
    columns=cols_nearest_neigh,
)
# Idea (not applied): weigh price more heavily than the other features, e.g.
# X_full_scaled.price = 10000*X_full_scaled.price
# Idea (not applied): weigh other prominent features more heavily than the rest.

In [307]:
# Persist the fitted scaler so raw inputs can be scaled the same way later
# (e.g. by the Flask app). The fitted object is `sim_scaler`; the previous
# name `scaler` is not defined anywhere in this notebook.
with open("sim_scaler.pkl", "wb") as f:
    pickle.dump(sim_scaler, f)

In [292]:
# Inspect the standardised feature matrix.
X_full_scaled

Unnamed: 0,accommodates,bathrooms,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,availability_365,amen_group_cleaning,amen_group_electronics,neigh_Downtown,neigh_Bel Air/Beverly Crest,neigh_Malibu,neigh_Venice,neigh_West Hollywood,neigh_West Los Angeles,neigh_South LA,neigh_Manhattan Beach,neigh_Hollywood,neigh_Beverly Hills,neigh_Santa Monica,neigh_Marina Del Rey,neigh_Pacific Palisades,neigh_Westwood,room_Entire home/apt,room_Private room,room_Shared room,prop_Hotel,prop_Bed and breakfast,prop_Camper/RV,prop_Guest suite,prop_Hostel,prop_Apartment,price,bedrooms,beds
13,-0.656550,-0.463798,-0.618118,-0.101010,-0.121191,-0.029320,1.537701,-1.051847,1.802394,-0.221384,-0.072347,-0.068823,3.992394,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,0.748819,-0.688120,-0.17472,-0.070355,-0.063457,-0.066194,-0.237691,-0.068305,-0.784805,-0.001158,-1.425592,-0.631075
14,-0.656550,0.420255,-0.618118,-0.425343,-0.252589,0.074931,-0.484994,1.456034,0.737295,-0.221384,-0.072347,-0.068823,-0.250476,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,-1.335437,1.453234,-0.17472,-0.070355,-0.063457,-0.066194,-0.237691,-0.068305,-0.784805,-0.396279,-0.236874,-0.631075
20,-0.656550,-0.463798,-0.618118,-1.093087,-0.252589,0.596187,1.660755,-1.678818,0.737295,-0.221384,-0.072347,-0.068823,-0.250476,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,-1.335437,1.453234,-0.17472,-0.070355,-0.063457,-0.066194,-0.237691,-0.068305,-0.784805,-0.396279,-0.236874,-0.631075
24,-0.656550,-0.463798,0.393115,-1.093087,-0.252589,0.491935,1.091632,-0.424877,1.802394,-0.221384,-0.072347,-0.068823,3.992394,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,0.748819,-0.688120,-0.17472,-0.070355,-0.063457,-0.066194,4.207137,-0.068305,-0.784805,0.241013,-1.425592,-0.631075
27,1.169635,-0.463798,2.415580,0.910145,3.557952,-0.133571,1.299285,-0.424877,1.802394,-0.221384,-0.072347,-0.068823,3.992394,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,0.748819,-0.688120,-0.17472,-0.070355,-0.063457,-0.066194,-0.237691,-0.068305,-0.784805,0.687118,-1.425592,-1.427315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44765,0.256542,-0.463798,0.393115,-0.043775,-0.383987,0.596187,-0.523448,-1.051847,-0.327803,-0.221384,-0.072347,-0.068823,-0.250476,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,0.748819,-0.688120,-0.17472,-0.070355,-0.063457,-0.066194,4.207137,-0.068305,-0.784805,-0.294312,-0.236874,0.165164
44822,0.713088,-0.463798,-0.618118,0.147009,-0.383987,0.596187,0.191802,0.202093,-0.327803,4.517036,-0.072347,-0.068823,-0.250476,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,0.748819,-0.688120,-0.17472,-0.070355,-0.063457,-0.066194,-0.237691,-0.068305,-0.784805,-0.013904,-0.236874,0.165164
44865,-1.113096,-0.463798,-0.618118,-1.093087,-0.383987,0.596187,1.614610,0.202093,-1.392902,-0.221384,-0.072347,-0.068823,-0.250476,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,-1.335437,1.453234,-0.17472,-0.070355,-0.063457,-0.066194,-0.237691,-0.068305,-0.784805,-0.906113,-0.236874,-0.631075
44908,-0.656550,-0.463798,-0.618118,-1.093087,-0.383987,0.283433,0.199493,0.829064,-0.327803,-0.221384,-0.072347,-0.068823,-0.250476,-0.162128,-0.109702,-0.177902,-0.056984,-0.323843,-0.091109,-0.162356,-0.120977,-0.054443,-0.107714,-1.335437,1.453234,-0.17472,-0.070355,-0.063457,-0.066194,-0.237691,-0.068305,1.274203,-0.268820,-0.236874,-0.631075


In [293]:
# Features-by-listings orientation of the scaled matrix (used by the SVD section).
X_full_scaled_transpose = X_full_scaled.T

## Nearest Neighbors

In [294]:
# Fit the nearest-neighbours index on the scaled feature matrix.
# Fit on the plain array (as the scaler is): every later query passes plain
# arrays/lists, and fitting on a DataFrame makes newer scikit-learn versions
# warn about missing feature names on each query.
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(X_full_scaled.values)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [295]:
# Query the index with the scaled row of the listing labelled 13 (as a single-row 2-D array).
neigh.kneighbors(X_full_scaled.loc[13].values.reshape(1, -1))

(array([[0.        , 1.41596515, 1.48003702, 1.67744053, 1.785685  ]]),
 array([[   0,  268, 2895,  682,  553]]))

In [287]:
# Number of feature values in one scaled row.
len(X_full_scaled.loc[13].values)

35

In [286]:
# Save the fitted nearest-neighbours model to disk for later use.
with open("neigh.pkl", "wb") as model_file:
    pickle.dump(neigh, model_file)

In [285]:
# Raw (unscaled) features of the query listing and its nearest neighbours,
# selected by the positions returned by neigh.kneighbors.
# `cols_of_interest` is not defined in this notebook; the feature list is
# `cols_nearest_neigh`.
X_full.iloc[[0,268,2895,682,553]][cols_nearest_neigh]

Unnamed: 0,accommodates,bathrooms,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,availability_365,amen_group_cleaning,amen_group_electronics,neigh_Downtown,neigh_Bel Air/Beverly Crest,neigh_Malibu,neigh_Venice,neigh_West Hollywood,neigh_West Los Angeles,neigh_South LA,neigh_Manhattan Beach,neigh_Hollywood,neigh_Beverly Hills,neigh_Santa Monica,neigh_Marina Del Rey,neigh_Pacific Palisades,neigh_Westwood,room_Entire home/apt,room_Private room,room_Shared room,prop_Hotel,prop_Bed and breakfast,prop_Camper/RV,prop_Guest suite,prop_Hostel,prop_Apartment,price,bedrooms,beds
13,2,1.0,0.0,52.0,3,94.0,348,3,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,121.0,0.0,1.0
1462,2,1.0,0.0,49.0,3,96.0,307,4,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,139.0,1.0,1.0
12083,3,1.0,250.0,55.0,1,98.0,312,3,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,117.0,0.0,0.0
3506,3,1.0,125.0,35.0,1,98.0,297,4,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,105.0,1.0,1.0
2890,2,1.0,250.0,35.0,2,98.0,281,5,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,135.0,0.0,1.0


In [302]:
# Hand-written raw feature vector, laid out in the same order as cols_nearest_neigh.
test_input = (
    # accommodates, bathrooms, security_deposit, cleaning_fee, minimum_nights,
    # review_scores_rating, availability_365, amen_group_cleaning, amen_group_electronics
    [2.0, 1.0, 100.0, 100.0, 2.0, 97.0, 200.0, 4.68, 2.31]
    # neigh_* columns: only neigh_Venice (4th of 14) is set
    + [0.0, 0.0, 0.0, 1.0] + [0.0] * 10
    # room_* columns: room_Entire home/apt is set
    + [1.0, 0.0, 0.0]
    # prop_* columns: none set
    + [0.0] * 6
    # price, bedrooms, beds
    + [143.7, 2.0, 2.0]
)

In [349]:
# Build a function for use in the flask app

def similar_listings2(sim_input):
    """Return display fields of the listings nearest to a raw feature vector.

    Relies on the notebook-level objects ``sim_scaler`` (fitted scaler),
    ``neigh`` (fitted NearestNeighbors model) and ``X_flask`` (listings table).

    Parameters
    ----------
    sim_input : sequence of numbers
        One raw (unscaled) feature vector. It is reshaped to a single row and
        passed to ``sim_scaler.transform``, so it presumably has to follow the
        column order the scaler was fitted on (``cols_nearest_neigh``).

    Returns
    -------
    tuple of 9 lists
        ``(name, neighbourhood, property_type, bathrooms, bedrooms, beds,
        price, listing_url, picture_url)``, each holding one entry per
        neighbour, in the order returned by ``neigh.kneighbors``.

    Side effects
    ------------
    Prints the neighbour positions and the neighbour names.
    """
    selected_fields = [
        'name', 'picture_url', 'neighbourhood', 'property_type',
        'bathrooms', 'bedrooms', 'beds', 'price', 'listing_url'
    ]
    output_order = (
        'name', 'neighbourhood', 'property_type', 'bathrooms', 'bedrooms',
        'beds', 'price', 'listing_url', 'picture_url'
    )

    # Scale the single query row with the same scaler used for the listings.
    scaled_query = sim_scaler.transform(np.array(sim_input).reshape(1, -1))

    # kneighbors returns (distances, positions); only the positions are needed.
    _, neighbor_positions = neigh.kneighbors([scaled_query[0]])
    kneighbors = list(neighbor_positions[0])
    print(kneighbors)

    # Positions index rows of X_flask, hence .iloc.
    sim = X_flask.iloc[kneighbors][selected_fields]
    columns_as_lists = {field: list(sim[field]) for field in output_order}
    print(columns_as_lists['name'])

    return tuple(columns_as_lists[field] for field in output_order)

## Singular Value Decomposition (Unfinished)

In [256]:
from sklearn.decomposition import TruncatedSVD
from scipy.linalg import svd

# SVD of the (features x listings) matrix.
# full_matrices=False computes the thin decomposition: with the default
# full_matrices=True, VT would be a square (listings x listings) matrix, which
# is very large here, while only its first three rows are used below. The
# leading singular vectors and values are the same in both modes.
U, Sigma, VT = svd(X_full_scaled_transpose, full_matrices=False)

# Try with sklearn.
# Note: transform() returns the listings projected onto the 3 components
# (one row per listing), not the V^T factor itself, despite the variable name.
svd2 = TruncatedSVD(3)
svd2.fit(X_full_scaled)
VT2 = svd2.transform(X_full_scaled)

In [257]:
# Transpose so the layout matches the VT produced by scipy.
# Caution: this cell is not idempotent — running it twice flips VT2 back.
VT2 = VT2.transpose()

As a reminder, here's what we're expecting to see:

<img src="http://zwmiller.com/projects/images/svd_breakdown.png">

Great, so now what do we have? In this case **VT** is a matrix where each column represents one of the items (here, the listings) in the new vector space, and each row is one component of that vector space.

In [258]:
# Keep only the first three rows (components) of VT and display them.
VT = VT[:3]
pd.DataFrame(data=VT)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,...,14112,14113,14114,14115,14116,14117,14118,14119,14120,14121,14122,14123,14124,14125,14126,14127,14128,14129,14130,14131,14132,14133,14134,14135,14136,14137,14138,14139,14140,14141,14142,14143,14144,14145,14146,14147,14148,14149,14150,14151,14152,14153,14154,14155,14156,14157,14158,14159,14160,14161,14162,14163,14164,14165,14166,14167,14168,14169,14170,14171,14172,14173,14174,14175,14176,14177,14178,14179,14180,14181,14182,14183,14184,14185,14186,14187,14188,14189,14190,14191,14192,14193,14194,14195,14196,14197,14198,14199,14200,14201,14202,14203,14204,14205,14206,14207,14208,14209,14210,14211
0,0.001754,0.005415,0.00884,0.002123,-0.006164,0.000702,0.000319,0.00376,0.001421,9.2e-05,0.003065,0.004066,0.000501,0.000198,-0.002217,0.002415,0.001865,0.006376,0.000435,0.003794,0.006443,0.002076,0.0002,-0.005838,0.001679,8.1e-05,0.004267,-0.002608,0.004464,-0.009895,0.00389,-0.01194,0.005658,-0.003644,-0.00917,0.004349,-0.004804,-0.002295,0.003714,0.005487,0.004609,-0.017489,-0.005789,-0.002647,0.0085,-0.00989,-0.015562,0.000733,0.003743,0.001251,-0.002109,-0.007467,-0.0026,0.00142,-0.004097,0.007567,0.000148,0.005435,-0.008221,-0.001045,-0.005394,-0.016151,0.00741,0.007205,-0.008269,-0.001468,0.005217,0.006767,0.000479,0.000728,0.008246,0.00493,0.000241,0.001779,0.004391,-0.010662,-0.002914,0.00418,-0.002778,0.003931,-0.019108,0.00927,-0.011829,0.008129,-0.018065,-0.009976,-0.001466,-0.011999,0.00288,0.000114,-0.006614,0.003135,-0.001051,0.005371,-0.018332,0.003256,0.006322,0.007435,0.004493,-0.004835,...,-0.009422,-0.008287,0.004167,-0.027456,-0.007371,0.009276,-0.011504,0.009799,0.001943,0.001136,0.004196,0.009019,-0.00496,0.007639,0.007692,0.00512,-0.009066,-0.001074,0.004752,-0.02982017,-0.002802,-0.011486,0.008722,0.00802,-0.001308,-0.002686,-0.016575,-0.002689,0.001081,0.000738,-0.002003,0.005638,0.011258,0.007712,0.001536,0.011024,0.006621,0.009711,-0.005812,0.006401,0.001896,-0.001368,-0.00745,-0.004989,0.007393,-0.012458,0.00483,-0.014488,0.000546,0.012195,0.002808,0.009671,-0.001252,0.010711,0.011624,0.000244,-0.000467,0.005674,0.009703,-0.001247,0.00247,-0.000119,0.008247,0.009707,0.009446,0.008434,-0.000605,-0.011265,0.009589,0.008258,0.008255,0.009004,-0.013365,0.011688,-0.002013,-0.002977,0.009041,0.011203,-0.028966,0.003687,0.004037,0.001888,0.009899,0.006074,0.008574,0.009715,0.00977,-0.00309,0.008461,0.006623,0.007409,0.0043,0.005649,0.00141,-0.005143,0.000303,-0.002275,0.01027,0.007813,0.007178
1,0.011026,-0.005529,-0.005169,0.014989,0.018352,0.009514,0.006377,0.009051,0.00842,0.006395,0.002774,0.002647,0.010659,0.008487,0.007003,0.011899,-0.003297,-0.002947,-0.000421,0.002938,0.003706,0.00596,-0.002395,-0.002653,-0.020703,0.00611,0.004779,0.012001,0.004268,0.005617,0.000575,0.005507,0.001128,-0.004487,0.009895,0.006022,0.01619,0.015069,0.005633,0.005415,-0.000251,-0.006447,-0.007209,0.009785,-0.003637,0.007945,0.019018,0.010022,-0.003482,0.011061,0.008101,-0.005205,0.016304,0.002733,0.016436,-0.005002,0.008009,0.008743,0.012442,0.004064,0.019929,0.008455,-0.000905,0.003667,0.008234,0.013517,0.002036,-0.002083,0.008149,0.006374,-0.004015,-0.000291,0.005629,0.008523,0.008186,0.017393,0.010667,0.004979,0.005135,0.000996,-0.002211,-0.004297,0.008325,-0.004907,0.002497,0.006836,0.007767,0.002337,0.0124,0.005247,0.020058,0.008774,0.004839,-0.000589,0.011836,0.012542,1.2e-05,-0.000591,-0.027489,0.018383,...,-0.003795,0.010647,0.005815,-0.021893,0.000924,-0.006712,-0.004285,-0.005746,0.006153,0.00414,-0.020452,-0.010144,0.004167,-0.013264,-0.013068,-0.014925,0.007802,-0.002987,0.011715,-0.02854313,-0.0005,-0.003846,-0.00707,-0.006971,-0.026228,-0.022978,-0.008834,0.005447,0.005597,0.005429,0.006595,-0.001513,-0.005041,0.001514,-0.016464,-0.011224,-0.008002,-0.006287,0.007766,-0.007548,0.006048,0.001342,0.002934,-0.003673,-0.009672,-0.012378,-0.005787,-0.010544,0.009417,-0.002638,0.004117,-0.018864,-0.001587,-0.005466,-0.003539,0.005855,0.006926,-0.000656,-0.008467,0.013915,0.003356,0.005346,-0.003861,-0.006284,-0.004246,-0.00135,0.006104,-0.009233,-0.011362,-0.007731,0.006976,-0.00272,-0.003584,-0.008338,0.004058,0.007633,-0.009416,-0.006928,-0.030963,0.007749,0.002844,0.004148,-0.021515,-0.006321,-0.016229,-0.018861,-0.018852,0.004335,-0.011122,-0.003226,-0.007922,-0.00196,-0.002081,0.009676,0.000431,0.004814,0.002847,-0.006862,-0.002165,0.00027
2,-0.003426,-0.008293,-0.008431,-0.007356,-0.019808,-0.003179,-0.001764,-0.026122,-0.011105,-0.004664,-0.016708,-0.01628,-0.001623,-0.003953,-0.006542,-0.005919,-0.013559,-0.01619,0.002703,0.003182,-0.021725,-0.000544,-0.015535,0.001483,-0.004431,-0.001492,0.008821,-0.017251,-0.016956,-0.009545,-0.0229,-0.010931,-0.00826,-0.011261,-0.011953,0.001459,-0.02564,-0.011743,0.00144,-0.007326,-0.012984,-0.008853,0.004117,-0.00666,-0.006749,0.001554,-0.030199,-0.003532,-0.018068,0.006708,-0.00139,0.002555,-0.01214,0.001555,-0.010549,-0.007972,0.005475,-0.022802,-0.021585,0.005512,-0.024269,-0.024369,-0.018853,-0.008967,-0.012572,-0.018431,-0.019649,-0.006775,-0.002851,-0.000987,-0.008297,-0.020855,-0.004285,0.003188,0.004528,-0.015376,-0.00295,0.008357,-0.005795,-0.020413,-0.018659,0.009642,-0.006956,-0.00541,-0.020234,6.3e-05,-0.003084,0.004207,0.000205,-0.002616,-0.020562,0.0059,-0.005215,-0.019449,-0.030134,-0.007532,-0.004915,-0.013344,0.000594,-0.023071,...,0.008981,-0.012829,0.002047,-0.003056,-0.000873,-0.002641,-0.002378,-0.003928,0.008862,0.008074,0.001535,-0.003237,0.004369,0.00137,0.001329,0.001553,-0.008115,0.011001,0.014219,5.577549e-07,-6.9e-05,0.004275,-0.001794,-0.003253,0.004782,0.002728,-0.002231,0.00124,0.002572,0.004301,0.007148,-0.006397,0.003231,0.002552,-0.007776,0.009755,-0.002393,0.001945,0.003604,-0.009648,0.004856,0.001794,0.008389,0.008848,-0.006484,0.000598,-0.005928,-0.005085,0.011223,0.003516,0.007662,0.003147,0.004727,0.000736,-0.005731,0.009871,0.003429,0.02286,0.000709,-0.007026,0.008463,-0.003388,0.00988,-0.002177,-0.009528,0.002693,0.011958,0.010139,-0.000533,-0.006273,0.008723,0.000239,0.001761,0.016149,0.011475,-0.002434,-0.001758,0.002585,0.002125,0.009724,0.005407,0.002678,0.022601,-0.003218,0.015655,0.003197,0.003323,0.006385,-0.003056,-0.001997,0.007272,-0.002891,0.005752,0.010537,0.008198,0.000849,0.008177,-0.006881,-0.001478,0.003989


If I transpose this, the rows are items, and the columns are the components of the "hidden" vector space created by the truncated SVD.

In [259]:
# Transposed view: one row per column of VT, one column per kept component.
pd.DataFrame(VT.T)

Unnamed: 0,0,1,2
0,0.001754,0.011026,-0.003426
1,0.005415,-0.005529,-0.008293
2,0.008840,-0.005169,-0.008431
3,0.002123,0.014989,-0.007356
4,-0.006164,0.018352,-0.019808
...,...,...,...
14207,0.000303,0.004814,0.000849
14208,-0.002275,0.002847,0.008177
14209,0.010270,-0.006862,-0.006881
14210,0.007813,-0.002165,-0.001478


**U** is a matrix where each row is a "user" (here, one of the listing features, since the decomposed matrix is features × listings) and each column is a coordinate in the hidden vector space created by the SVD.

In [260]:
# Keep only the first three columns (components) of U and display them.
U = U[:, 0:3]
pd.DataFrame(data=U)

Unnamed: 0,0,1,2
0,-0.382704,-0.193633,0.118014
1,-0.243297,-0.389244,-0.030637
2,-0.211466,0.132872,-0.275425
3,-0.366317,0.10614,-0.113216
4,-0.06723,0.214326,-0.434427
5,-0.007838,0.100197,-0.26463
6,-0.063504,-0.023919,-0.246726
7,-0.180397,0.068298,-0.173619
8,-0.156437,0.218678,-0.25669
9,-0.030431,0.07465,0.174739


**Sigma** is just the singular values of the decomposition. In this case, we're not particularly interested in **Sigma**.

In [261]:
# Keep the first three singular values and show them as a diagonal matrix.
Sigma = Sigma[0:3]
pd.DataFrame(data=np.diag(Sigma))

Unnamed: 0,0,1,2
0,258.340727,0.0,0.0
1,0.0,166.976909,0.0
2,0.0,0.0,148.265397


In [262]:
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# %matplotlib inline
# plt.style.use('seaborn')

# fig = plt.figure(figsize=(20,16))
# ax = fig.gca(projection='3d')
# ax.scatter(U[:,0],U[:,1],U[:,2],c='k',s=150);
# ax.set_xlabel("D1", fontsize=20, labelpad=20)
# ax.set_ylabel("D2", fontsize=20, labelpad=20)
# ax.set_zlabel("D3", fontsize=20, labelpad=20);

# lbls = X_full_scaled_transpose.index
# offset = 0.01
# for i, txt in enumerate(lbls):
#     if i not in [6,7]:
#         ax.text(U[i,0]+offset,U[i,1],U[i,2],txt, fontsize=20)
#     else:
#         ax.text(U[i,0]+offset,U[i,1],U[i,2]+5*offset,txt, fontsize=20)


In [263]:
# fig = plt.figure(figsize=(20,16))
# ax = fig.gca(projection='3d')
# ax.scatter(VT.T[:,0],VT.T[:,1],VT.T[:,2],c='b',s=150, label="Items");
# ax.set_xlabel("D1", fontsize=20, labelpad=20)
# ax.set_ylabel("D2", fontsize=20, labelpad=20)
# ax.set_zlabel("D3", fontsize=20, labelpad=20);

# lbls = X_full_scaled_transpose.columns
# item_offset = 0.01
# for i, txt in enumerate(lbls):
#     if i not in [6,7]:
#         ax.text(VT.T[i,0],VT.T[i,1]+item_offset,VT.T[i,2],txt, fontsize=20)
#     else:
#         ax.text(VT.T[i,0],VT.T[i,1]+item_offset,VT.T[i,2]+5*item_offset,txt, fontsize=20)

# ax.view_init(30,15)
# plt.legend(loc="upper left", fontsize=30);

So, if we look in our new hidden vector space and take the dot products of items (which equal cosine similarity only when the item vectors are normalized to unit length), we expect that items 2 & 3 are the most similar.

In [264]:
# Shape of the transposed matrix: (number of features, number of listings).
X_full_scaled_transpose.shape

(35, 14212)

In [265]:
# num_items = X_full_scaled_transpose.shape[1]
# num_users = X_full_scaled_transpose.shape[0]
# compare_item = 2
# for item in range(num_items):
#     if item != compare_item:
#         print("Item %s & %s: "%(compare_item,item), np.dot(VT.T[compare_item],VT.T[item]))

If we compare users, we expect that users 6 & 8 should be the most similar.

In [266]:
# compare_user = 6
# for user in range(num_users):
#     #if user != compare_user:
#         print("User %s & %s: "%(compare_user,user), np.dot(U[compare_user],U[user]))

Let's make a function that returns recommendations for a given item input (this user likes item 0... so she'll probably also like items X, Y, Z).

In [267]:
def get_recommends(itemID, VT, num_recom=2):
    """Rank all other items by dot product with ``itemID`` and return the top ones.

    Parameters
    ----------
    itemID : int
        Positional index of the reference item (a column of ``VT``).
    VT : 2-D array
        Matrix whose columns are the item vectors (components x items).
    num_recom : int, default 2
        Maximum number of item indices to return.

    Returns
    -------
    list of int
        Positional indices of the other items, ordered by decreasing dot
        product with the reference item; ties keep ascending index order.
    """
    item_vectors = VT.T
    scored = [
        (np.dot(item_vectors[itemID], item_vectors[idx]), idx)
        for idx in range(item_vectors.shape[0])
        if idx != itemID
    ]
    # Sort on the score only so the (stable) sort keeps index order for ties.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return [idx for _, idx in ranked][:num_recom]

# Look up the two listings recommended for the item at position 2.
# get_recommends returns positional indices, hence .iloc.
X_full.iloc[get_recommends(2,VT,num_recom=2)]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,...,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,est_bookings,est_booked_nights_per_year,est_booked_nights,occupancy_rate,occupancy_rate2,transit_bin,house_rules_bin,host_response_time_num,cancellation_policy_num,amenities_num,prop_Apartment,prop_Bed and 
breakfast,prop_Bungalow,prop_Camper/RV,prop_Condominium,prop_Cottage,prop_Guest suite,prop_Guesthouse,prop_Hostel,prop_Hotel,prop_House,prop_Loft,prop_Serviced apartment,prop_Townhouse,prop_Villa,room_type_num,room_Entire home/apt,room_Private room,room_Shared room,bed_Airbed,bed_Couch,bed_Futon,bed_Pull-out Sofa,neighbourhood_compressed,neigh_Alhambra,neigh_Altadena,neigh_Arcadia,neigh_Bel Air/Beverly Crest,neigh_Beverly Hills,neigh_Brentwood,neigh_Burbank,neigh_Culver City,neigh_Del Rey,neigh_Downtown,neigh_Eagle Rock,neigh_East Hollywood,neigh_Echo Park,neigh_Encino,neigh_Glendale,neigh_Hawthorne,neigh_Highland Park,neigh_Hollywood,neigh_Hollywood Hills,neigh_Inglewood,neigh_Long Beach,neigh_Malibu,neigh_Manhattan Beach,neigh_Mar Vista,neigh_Marina Del Rey,neigh_Mid-City,neigh_Mid-Wilshire,neigh_Monterey Park,neigh_North Hollywood,neigh_Pacific Palisades,neigh_Palms,neigh_Pasadena,neigh_Redondo Beach,neigh_Santa Monica,neigh_Sherman Oaks,neigh_Silver Lake,neigh_South LA,neigh_South Robertson,neigh_Studio City,neigh_Temple City,neigh_Topanga,neigh_Torrance,neigh_Valley Glen,neigh_Van Nuys,neigh_Venice,neigh_West Adams,neigh_West Hollywood,neigh_West Los Angeles,neigh_Westchester/Playa Del Rey,neigh_Westlake,neigh_Westside,neigh_Westwood,neigh_Woodland Hills/Warner Center,amen_group_kitchen,amen_group_cleaning,amen_group_safety,amen_group_household,amen_group_bedroom,amen_group_electronics,amen_group_extra_spaces,host_since_num
16158,18714110,https://www.airbnb.com/rooms/18714110,20190914032935,2019-09-14,"""Cape Cod"" in the Trees",Our home is a 10 minute walk to the Pacific Oc...,Three bedrooms in the house are available with...,Our home is a 10 minute walk to the Pacific Oc...,none,Manhattan Beach is a seaside gem,Enjoy your stay!,Cars are King in LA but more mass transit opti...,"Guests have access to kitchen, backyard deck, ...",We are hospitable but respect our guests priva...,,,,https://a0.muscache.com/im/pictures/394f3aa6-0...,,593447,https://www.airbnb.com/users/show/593447,Arthur,2011-05-16,"Manhattan Beach, California, United States",,within an hour,100.0,,0,https://a0.muscache.com/im/users/593447/profil...,https://a0.muscache.com/im/users/593447/profil...,Manhattan Beach,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,t,"Manhattan Beach, CA, United States",Manhattan Beach,Manhattan Beach,,Manhattan Beach,CA,90266,Los Angeles,"Manhattan Beach, CA",US,United States,33.89409,-118.40745,t,House,Private room,1,1.0,1.0,1.0,Real Bed,"[TV, Cable TV, Wifi, Air conditioning, Kitchen...",,125.0,,,0.0,25.0,1,$0.00,30,90,30,30,90,90,30.0,90.0,7 months ago,t,3,33,63,338,2019-09-14,2,2,2019-02-02,2019-05-28,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,,1,f,strict_14_with_grace_period,0,...,2,0,2,0,0.27,4,194.4,120,1.034483,0.573451,1,0,1,4.0,28,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,1,0,0,0,0,0,Manhattan Beach,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,5,8,1,3,0,3122.0
26099,26060979,https://www.airbnb.com/rooms/26060979,20190914032935,2019-09-14,Relaxing Quiet Home,This room is a fresh addition to my home. It ...,,This room is a fresh addition to my home. It ...,none,,,,"Living room, kitchen and patio",,"No smoking, drinking or extra over night visit...",,,https://a0.muscache.com/im/pictures/31ee7369-9...,,32089619,https://www.airbnb.com/users/show/32089619,Tessie,2015-04-27,"Manhattan Beach, California, United States",I am very friendly and active. I love to prac...,within an hour,100.0,,1,https://a0.muscache.com/im/pictures/user/947f9...,https://a0.muscache.com/im/pictures/user/947f9...,Manhattan Beach,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,"Manhattan Beach, CA, United States",Manhattan Beach,Manhattan Beach,,Manhattan Beach,CA,90266,Los Angeles,"Manhattan Beach, CA",US,United States,33.87885,-118.38116,f,House,Private room,2,1.0,1.0,1.0,Real Bed,"[TV, Cable TV, Wifi, Air conditioning, Kitchen...",,75.0,,,0.0,35.0,1,$15.00,30,1125,30,30,1125,1125,30.0,1125.0,6 weeks ago,t,12,42,72,347,2019-09-14,5,5,2018-11-30,2019-08-03,92.0,10.0,10.0,10.0,8.0,10.0,10.0,f,,,1,f,flexible,0,...,2,0,2,0,0.52,10,374.4,300,1.214575,1.075862,0,1,1,1.0,23,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,1,0,0,0,0,0,Manhattan Beach,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,4,7,3,3,0,1680.0


In [268]:
# Five recommended positions for the item at position 2100.
get_recommends(2100,VT,num_recom=5)

[12419, 12944, 1272, 10714, 12283]

In [229]:
# Draw one random listing, restricted to the model's feature columns.
# `cols_of_interest` is not defined in this notebook; the feature list is
# `cols_nearest_neigh`.
X_full[cols_nearest_neigh].sample()

Unnamed: 0,accommodates,bathrooms,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,availability_365,amen_group_cleaning,amen_group_electronics,neigh_Downtown,neigh_Bel Air/Beverly Crest,neigh_Malibu,neigh_Venice,neigh_West Hollywood,neigh_West Los Angeles,neigh_South LA,neigh_Manhattan Beach,neigh_Hollywood,neigh_Beverly Hills,neigh_Santa Monica,neigh_Marina Del Rey,neigh_Pacific Palisades,neigh_Westwood,room_Entire home/apt,room_Private room,room_Shared room,prop_Hotel,prop_Bed and breakfast,prop_Camper/RV,prop_Guest suite,prop_Hostel,prop_Apartment,price,bedrooms,beds
8541,2,1.0,0.0,10.0,1,91.0,75,5,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,45.0,1.0,1.0


In [232]:
# Convert the index label 8541 into its position within X_full.
X_full.index.get_loc(8541)

1933

In [269]:
# Raw features of the query listing (position 2100, first row) followed by its
# five SVD-based recommendations.
# `cols_of_interest` is not defined in this notebook; the feature list is
# `cols_nearest_neigh`.
X_full.iloc[[2100,12419, 12944, 1272, 10714, 12283]][cols_nearest_neigh]

Unnamed: 0,accommodates,bathrooms,security_deposit,cleaning_fee,minimum_nights,review_scores_rating,availability_365,amen_group_cleaning,amen_group_electronics,neigh_Downtown,neigh_Bel Air/Beverly Crest,neigh_Malibu,neigh_Venice,neigh_West Hollywood,neigh_West Los Angeles,neigh_South LA,neigh_Manhattan Beach,neigh_Hollywood,neigh_Beverly Hills,neigh_Santa Monica,neigh_Marina Del Rey,neigh_Pacific Palisades,neigh_Westwood,room_Entire home/apt,room_Private room,room_Shared room,prop_Hotel,prop_Bed and breakfast,prop_Camper/RV,prop_Guest suite,prop_Hostel,prop_Apartment,price,bedrooms,beds
9222,4,1.0,350.0,90.0,1,100.0,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,120.0,2.0,2.0
40102,12,2.5,1000.0,450.0,1,100.0,359,7,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,474.0,5.0,5.0
41152,16,2.5,350.0,350.0,1,60.0,334,6,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,499.0,6.0,8.0
6072,12,2.0,500.0,300.0,31,89.0,288,6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,490.0,4.0,7.0
36156,12,4.5,1500.0,299.0,1,99.0,49,7,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,349.0,5.0,5.0
39788,16,3.0,200.0,295.0,2,100.0,330,5,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,385.0,6.0,8.0


The NearestNeighbors approach works well, so we will proceed with that.