# CHATBOT & RECOMMENDATION SYSTEMS
## ***FINAL PROJECT - AirBNB recommendation chatbot***
*LEDUC Bastien, GABISON Yoan*

In [159]:
import numpy as np
import pandas as pd

### Import data

In [160]:
file_path = 'data/New_York_Airbnb_4_dec_2021_cleaned_with_rating_and_images.csv'
data = pd.read_csv(file_path,sep=',',header=0)

In [161]:
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,rating,images
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,150,30,48,2019-11-04,0.33,3,338,0,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...
1,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...",4869,LisaRoxanne,Brooklyn,Bedford-Stuyvesant,40.68494,-73.95765,Entire home/apt,75,1,409,2021-10-22,4.86,1,194,32,,
2,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68535,-73.95512,Private room,60,30,50,2016-06-05,0.52,2,365,0,4.52,https://a0.muscache.com/im/pictures/2090980c-b...
3,5136,"Spacious Brooklyn Duplex, Patio + Garden",7378,Rebecca,Brooklyn,Sunset Park,40.66265,-73.99454,Entire home/apt,275,5,2,2021-08-08,0.02,1,123,1,,
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.76457,-73.98317,Private room,68,2,507,2021-11-08,3.68,1,192,33,4.22,https://a0.muscache.com/im/pictures/12065/f070...


In [162]:
data.shape

(28747, 19)

In [163]:
data.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'rating', 'images'],
      dtype='object')

### Data cleaning

In [164]:
def drop_cols(dataset,cols_to_drop):
    dataset.drop(cols_to_drop,axis=1,inplace=True)    

In [165]:
clean_data=data.copy()
cols_to_drop = ['latitude','longitude','number_of_reviews','last_review','number_of_reviews_ltm','calculated_host_listings_count','reviews_per_month'] #Add the cols you wish to drop
drop_cols(clean_data,cols_to_drop)

In [166]:
col_nan=[col for col in clean_data.columns if clean_data[col].isna().sum()>0]
col_nan

['rating', 'images']

In [167]:
clean_data.dropna(axis=0,subset=col_nan,inplace=True)

In [168]:
clean_data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,availability_365,rating,images
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,Entire home/apt,150,30,338,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...
2,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,Private room,60,30,365,4.52,https://a0.muscache.com/im/pictures/2090980c-b...
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,Private room,68,2,192,4.22,https://a0.muscache.com/im/pictures/12065/f070...
5,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,Private room,75,2,0,4.91,https://a0.muscache.com/im/pictures/103776/b37...
6,5803,"Lovely Room 1, Garden, Best Area, Legal rental",9744,Laurie,Brooklyn,South Slope,Private room,98,4,322,4.7,https://a0.muscache.com/im/pictures/2884180/f1...


In [169]:
clean_data.shape

(20457, 12)

In [170]:
clean_data.isnull().values.any()

False

### One-Hot encode some features

In [171]:
clean_data_to_encode = clean_data.copy()

In [172]:
def add_range_feature(df,col_source,col_dest,range_values):
    for index, row in df.iterrows():
        for rv in range_values:
            b_inf,b_sup = int(rv.split(sep='-')[0]),int(rv.split(sep='-')[1])
            if b_inf <= row[col_source] and row[col_source] <= b_sup:
                df.at[index,col_dest] = rv
                break

#### Add a price range feature

In [173]:
price_range = [f'{i}-{i+49}' for i in range(0,700,50)] + [f'{i}-{i+99}' for i in range(700,1000,100)] + [f'{i}-{i+499}' for i in range(1000,2500,500)] +['2500-10000']
price_range

['0-49',
 '50-99',
 '100-149',
 '150-199',
 '200-249',
 '250-299',
 '300-349',
 '350-399',
 '400-449',
 '450-499',
 '500-549',
 '550-599',
 '600-649',
 '650-699',
 '700-799',
 '800-899',
 '900-999',
 '1000-1499',
 '1500-1999',
 '2000-2499',
 '2500-10000']

In [174]:
# Adding the price range to the dataset
add_range_feature(clean_data_to_encode,col_source='price',col_dest='price_range',range_values=price_range)

#### Add a minimum_nights_range feature

In [175]:
min_nights_range = [f'{i}-{i+9}' for i in range(1,100,10)] + [f'{i}-{i+49}' for i in range(101,400,50)] + ['401-1124']
min_nights_range

['1-10',
 '11-20',
 '21-30',
 '31-40',
 '41-50',
 '51-60',
 '61-70',
 '71-80',
 '81-90',
 '91-100',
 '101-150',
 '151-200',
 '201-250',
 '251-300',
 '301-350',
 '351-400',
 '401-1124']

In [176]:
add_range_feature(clean_data_to_encode,col_source='minimum_nights',col_dest='min_nights_range',range_values=min_nights_range)

#### Add an availability_365_range feature

In [177]:
availability_365_range = [f'{i}-{i+29}' for i in range(1,330,30)] + ['331-365']
availability_365_range

['1-30',
 '31-60',
 '61-90',
 '91-120',
 '121-150',
 '151-180',
 '181-210',
 '211-240',
 '241-270',
 '271-300',
 '301-330',
 '331-365']

In [178]:
add_range_feature(clean_data_to_encode,col_source='availability_365',col_dest='availability_365_range',range_values=availability_365_range)

In [179]:
cols_to_drop = ['minimum_nights','availability_365']
drop_cols(clean_data_to_encode,cols_to_drop)

#### *neighbourhood_group, neighbourhood, room_type, price_range, min_nights_range, availability_365_range*

In [180]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[feature_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return res 

In [181]:
features_to_encode = ['neighbourhood_group','room_type','neighbourhood','price_range','min_nights_range','availability_365_range']
clean_data_encoded = clean_data_to_encode.copy()
for feature in features_to_encode:
    clean_data_encoded = encode_and_bind(clean_data_encoded, feature)

In [182]:
clean_data_encoded.head()

Unnamed: 0,id,name,host_id,host_name,price,rating,images,Bronx,Brooklyn,Manhattan,...,151-180,181-210,211-240,241-270,271-300,301-330,31-60,331-365,61-90,91-120
0,2595,Skylit Midtown Castle,2845,Jennifer,150,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,5121,BlissArtsSpace!,7356,Garon,60,4.52,https://a0.muscache.com/im/pictures/2090980c-b...,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,5178,Large Furnished Room Near B'way,8967,Shunichi,68,4.22,https://a0.muscache.com/im/pictures/12065/f070...,0,0,1,...,0,1,0,0,0,0,0,0,0,0
5,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,75,4.91,https://a0.muscache.com/im/pictures/103776/b37...,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,5803,"Lovely Room 1, Garden, Best Area, Legal rental",9744,Laurie,98,4.7,https://a0.muscache.com/im/pictures/2884180/f1...,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [183]:
for col in clean_data_encoded.columns[7:]:
    clean_data_encoded[col] = clean_data_encoded[col] * clean_data_encoded.rating /5
clean_data_encoded[clean_data_encoded.columns[7:]]

Unnamed: 0,Bronx,Brooklyn,Manhattan,Queens,Staten Island,Entire home/apt,Hotel room,Private room,Shared room,Allerton,...,151-180,181-210,211-240,241-270,271-300,301-330,31-60,331-365,61-90,91-120
0,0.0,0.000,0.940,0.0,0.0,0.94,0.0,0.000,0.0,0.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,0.940,0.0,0.0
2,0.0,0.904,0.000,0.0,0.0,0.00,0.0,0.904,0.0,0.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,0.904,0.0,0.0
4,0.0,0.000,0.844,0.0,0.0,0.00,0.0,0.844,0.0,0.0,...,0.000,0.844,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
5,0.0,0.000,0.982,0.0,0.0,0.00,0.0,0.982,0.0,0.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
6,0.0,0.940,0.000,0.0,0.0,0.00,0.0,0.940,0.0,0.0,...,0.000,0.000,0.0,0.0,0.0,0.94,0.0,0.000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28738,0.0,0.836,0.000,0.0,0.0,0.00,0.0,0.836,0.0,0.0,...,0.836,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
28739,0.0,0.866,0.000,0.0,0.0,0.00,0.0,0.866,0.0,0.0,...,0.866,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
28741,0.0,0.000,0.920,0.0,0.0,0.00,0.0,0.920,0.0,0.0,...,0.920,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
28742,0.0,0.000,0.000,1.0,0.0,0.00,0.0,1.000,0.0,0.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,1.000,0.0,0.0


In [184]:
def cosine_similarity(user: list, room: list) -> float:
    """
    Calculates the cosine similarity between two users.
    :param user:
    :param room:
    :return: the cosine similarity
    """    
    if len(user) != len(room):
        raise ValueError('Vectors must have the same length')
    return np.abs(np.dot(user, room) / (np.linalg.norm(user) * np.linalg.norm(room)))

In [185]:
assert round(cosine_similarity([2/3, 0, 0, 5/3, -7/3, 0, 0], [1/3, 1/3, -2/3, 0, 0, 0, 0]), 3) == 0.092, 'cosine_similarity is incorrect'
assert round(cosine_similarity([2/3, 0, 0, 5/3, -7/3, 0, 0], [0, 0, 0, -5/3, 1/3, 4/3, 0]), 3) == 0.559, 'cosine_similarity is incorrect'

In [186]:
def find_similar_users(user_id: int, n: int = 5) -> list:
    """
    Finds the n most similar users to the user with the given user_id.
    :param user_id: the user_id
    :param normalised_utility_matrix: the normalised utility matrix
    :param n: the number of similar users to find
    :return: the n most similar users to the user with the given user_id
    """
    cols = clean_data_encoded.columns[7:]
    user_vector = list(clean_data_encoded.loc[clean_data_encoded.id==user_id][cols].values[0])
    similarity_scores = []
    for index, row in clean_data_encoded.iterrows():
        room_id = row.id
        room_vector = list(clean_data_encoded.loc[clean_data_encoded.id==room_id][cols].values[0])
        similarity_scores.append((room_id, cosine_similarity(user_vector, room_vector)))
    similarity_scores = [x for x in similarity_scores if not np.isnan(x[1])]
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    return similarity_scores[:n]

In [200]:
s = find_similar_users(2595,20)
s

  return np.abs(np.dot(user, room) / (np.linalg.norm(user) * np.linalg.norm(room)))


In [199]:
cols = clean_data_encoded.columns[7:]
id = [13526398,16378007,24657328,30806012,42038854,42655702,42835727]

sum(clean_data_encoded.loc[clean_data_encoded.id==13526398][cols].values[0]==clean_data_encoded.loc[clean_data_encoded.id==42835727][cols].values[0])/clean_data_encoded[clean_data_encoded.columns[7:]].shape[1]

0.978494623655914