# **CHATBOT & RECOMMENDATION SYSTEMS**
## ***FINAL PROJECT - AirBNB recommendation chatbot***
*LEDUC Bastien, GABISON Yoan*

In [29]:
import pandas as pd

### **Import data**

In [30]:
# Relative path: it is resolved against the directory the notebook is run from
file_path = '../data/New_York_Airbnb_4_dec_2021_cleaned_with_rating_and_images.csv'
# Comma-separated file whose first line holds the column names
df = pd.read_csv(file_path, sep=',', header=0)

In [31]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,rating,images
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,150,30,48,2019-11-04,0.33,3,338,0,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...
1,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...",4869,LisaRoxanne,Brooklyn,Bedford-Stuyvesant,40.68494,-73.95765,Entire home/apt,75,1,409,2021-10-22,4.86,1,194,32,,
2,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68535,-73.95512,Private room,60,30,50,2016-06-05,0.52,2,365,0,4.52,https://a0.muscache.com/im/pictures/2090980c-b...
3,5136,"Spacious Brooklyn Duplex, Patio + Garden",7378,Rebecca,Brooklyn,Sunset Park,40.66265,-73.99454,Entire home/apt,275,5,2,2021-08-08,0.02,1,123,1,,
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.76457,-73.98317,Private room,68,2,507,2021-11-08,3.68,1,192,33,4.22,https://a0.muscache.com/im/pictures/12065/f070...


In [32]:
df.shape

(28747, 19)

In [33]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'rating', 'images'],
      dtype='object')

### **Data cleaning**

In [34]:
# Will drop columns of the dataset
def drop_cols(dataset: pd.DataFrame, columns_to_drop: list) -> None:
    """
    Remove the given columns from a dataframe, in place.
    :param dataset: the dataframe to modify
    :param columns_to_drop: names of the columns to remove
    :return: nothing, the dataframe passed in is modified directly
    """
    dataset.drop(columns=columns_to_drop, inplace=True)

In [35]:
# Remove the columns that are not relevant to the recommendation system.
# Extend this list with any other column you wish to drop.
cols_to_drop: list = ['host_id', 'host_name', 'latitude', 'longitude', 'number_of_reviews', 'last_review',
                'number_of_reviews_ltm', 'calculated_host_listings_count', 'reviews_per_month']
# drop_cols modifies df in place: running this cell a second time on the same df
# raises a KeyError because the columns are already gone. Reload the data first.
drop_cols(dataset=df, columns_to_drop=cols_to_drop)

In [36]:
# Names of the columns that hold at least one missing value
col_nan: list = [column for column in df.columns if df[column].isna().any()]
col_nan

['rating', 'images']

In [37]:
# Drop, in place, every row that has a missing value in one of the columns listed in col_nan
df.dropna(axis=0, subset=col_nan, inplace=True)

In [38]:
df.head()

Unnamed: 0,id,name,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,availability_365,rating,images
0,2595,Skylit Midtown Castle,Manhattan,Midtown,Entire home/apt,150,30,338,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...
2,5121,BlissArtsSpace!,Brooklyn,Bedford-Stuyvesant,Private room,60,30,365,4.52,https://a0.muscache.com/im/pictures/2090980c-b...
4,5178,Large Furnished Room Near B'way,Manhattan,Midtown,Private room,68,2,192,4.22,https://a0.muscache.com/im/pictures/12065/f070...
5,5203,Cozy Clean Guest Room - Family Apt,Manhattan,Upper West Side,Private room,75,2,0,4.91,https://a0.muscache.com/im/pictures/103776/b37...
6,5803,"Lovely Room 1, Garden, Best Area, Legal rental",Brooklyn,South Slope,Private room,98,4,322,4.7,https://a0.muscache.com/im/pictures/2884180/f1...


In [39]:
df.shape

(20457, 10)

In [40]:
# The cleaned dataset has no more null values
df.isnull().values.any()

False

### **Adding categorical features from numerical features**

To use numerical features in our recommendation system, we need to turn them into categorical features. To do so, we create categorical columns that contain ranges based on the numerical values (e.g., a room priced at $120 falls in the $100-149 price category).

In [41]:
# clean_data_to_encode: pd.DataFrame = clean_data.copy()

In [42]:
def add_range_feature(dataset: pd.DataFrame, col_source: str, col_dest: str, range_values: list) -> None:
    """
    Creates a categorical column from a numerical one, in place.

    Each row receives the label of the first range in ``range_values`` whose bounds
    (both inclusive) contain the row's ``col_source`` value. A row whose value falls
    in none of the ranges receives no label.

    :param dataset: the dataframe to transform (modified in place)
    :param col_source: the numerical column to transform
    :param col_dest: the categorical column to fill
    :param range_values: range labels of the form 'low-high' with integer bounds
    :return: nothing but the transformed dataframe
    """
    # Parse every 'low-high' label once instead of once per row.
    bounds = []
    for label in range_values:
        parts = label.split(sep='-')
        bounds.append((int(parts[0]), int(parts[1]), label))

    for index, value in dataset[col_source].items():
        for b_inf, b_sup, label in bounds:
            if b_inf <= value <= b_sup:
                # Write into the dataframe that was passed in, not into a global one.
                dataset.at[index, col_dest] = label
                break

#### Add a price_range feature

In [43]:
# Price buckets as 'low-high' labels: 50-wide from 0 to 699, 100-wide from 700 to 999,
# 500-wide from 1000 to 2499, then one last '2500-10000' bucket.
price_range = [f'{i}-{i + 49}' for i in range(0, 700, 50)] + [f'{i}-{i + 99}' for i in range(700, 1000, 100)] + [
    f'{i}-{i + 499}' for i in range(1000, 2500, 500)] + ['2500-10000']
price_range

['0-49',
 '50-99',
 '100-149',
 '150-199',
 '200-249',
 '250-299',
 '300-349',
 '350-399',
 '400-449',
 '450-499',
 '500-549',
 '550-599',
 '600-649',
 '650-699',
 '700-799',
 '800-899',
 '900-999',
 '1000-1499',
 '1500-1999',
 '2000-2499',
 '2500-10000']

In [44]:
# Adding the price_range to the dataset
add_range_feature(df, col_source='price', col_dest='price_range', range_values=price_range)

#### Add a minimum_nights_range feature

In [45]:
# Minimum-nights buckets as 'low-high' labels: 10-wide from 1 to 100, 50-wide from 101 to 400,
# then one last '401-1124' bucket.
# NOTE(review): 1124 is presumably the largest minimum_nights in the data - confirm.
min_nights_range = [f'{i}-{i + 9}' for i in range(1, 100, 10)] + [f'{i}-{i + 49}' for i in range(101, 400, 50)] + [
    '401-1124']
min_nights_range

['1-10',
 '11-20',
 '21-30',
 '31-40',
 '41-50',
 '51-60',
 '61-70',
 '71-80',
 '81-90',
 '91-100',
 '101-150',
 '151-200',
 '201-250',
 '251-300',
 '301-350',
 '351-400',
 '401-1124']

In [46]:
# Adding the minimum_nights_range to the dataset
add_range_feature(df, col_source='minimum_nights', col_dest='min_nights_range',
                  range_values=min_nights_range)

#### Add an availability_365_range feature

In [47]:
# Availability buckets as 'low-high' labels: 30-wide from 1 to 330, then one last '331-365' bucket.
# NOTE(review): the first bucket starts at 1, so an availability_365 of 0 matches no bucket
# and such a listing gets no availability label - confirm this is intended.
availability_365_range = [f'{i}-{i + 29}' for i in range(1, 330, 30)] + ['331-365']
availability_365_range

['1-30',
 '31-60',
 '61-90',
 '91-120',
 '121-150',
 '151-180',
 '181-210',
 '211-240',
 '241-270',
 '271-300',
 '301-330',
 '331-365']

In [48]:
# Adding the availability_365_range to the dataset
add_range_feature(df, col_source='availability_365', col_dest='availability_365_range',
                  range_values=availability_365_range)

In [49]:
# We keep the real price of the room and the minimum nights for filtering but we remove the availability_365 we categorized.
cols_to_drop = ['availability_365']
drop_cols(df, cols_to_drop)

In [50]:
df.head()

Unnamed: 0,id,name,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,rating,images,price_range,min_nights_range,availability_365_range
0,2595,Skylit Midtown Castle,Manhattan,Midtown,Entire home/apt,150,30,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...,150-199,21-30,331-365
2,5121,BlissArtsSpace!,Brooklyn,Bedford-Stuyvesant,Private room,60,30,4.52,https://a0.muscache.com/im/pictures/2090980c-b...,50-99,21-30,331-365
4,5178,Large Furnished Room Near B'way,Manhattan,Midtown,Private room,68,2,4.22,https://a0.muscache.com/im/pictures/12065/f070...,50-99,1-10,181-210
5,5203,Cozy Clean Guest Room - Family Apt,Manhattan,Upper West Side,Private room,75,2,4.91,https://a0.muscache.com/im/pictures/103776/b37...,50-99,1-10,
6,5803,"Lovely Room 1, Garden, Best Area, Legal rental",Brooklyn,South Slope,Private room,98,4,4.7,https://a0.muscache.com/im/pictures/2884180/f1...,50-99,1-10,301-330


### **One-hot encode categorical features**
#### *neighbourhood_group, neighbourhood, room_type, price_range, min_nights_range, availability_365_range*

In [51]:
def encode_and_bind(original_dataframe: pd.DataFrame, feature_to_encode: str) -> pd.DataFrame:
    """
    Replace one categorical column by its one-hot indicator columns.
    :param original_dataframe: the dataframe holding the column to encode (it is not modified)
    :param feature_to_encode: name of the categorical column to encode
    :return: a new dataframe with one indicator column per category value appended
             and the encoded column removed
    """
    indicator_columns = pd.get_dummies(original_dataframe[feature_to_encode])
    combined = pd.concat([original_dataframe, indicator_columns], axis=1)
    return combined.drop(columns=[feature_to_encode])

In [52]:
# Categorical columns to replace by one-hot indicator columns, processed in this order
features_to_encode = ['neighbourhood_group', 'room_type', 'neighbourhood', 'price_range', 'min_nights_range',
                      'availability_365_range']
# Each pass rebinds df to the dataframe returned by encode_and_bind. The indicator columns
# are named after the raw category values, without any prefix (e.g. 'Manhattan', '331-365').
for feature in features_to_encode:
    df = encode_and_bind(df, feature)

In [53]:
# We obtain here the room profiles we are going to use for our recommendation and for creating our user profile.
df.head()

Unnamed: 0,id,name,price,minimum_nights,rating,images,Bronx,Brooklyn,Manhattan,Queens,...,151-180,181-210,211-240,241-270,271-300,301-330,31-60,331-365,61-90,91-120
0,2595,Skylit Midtown Castle,150,30,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,5121,BlissArtsSpace!,60,30,4.52,https://a0.muscache.com/im/pictures/2090980c-b...,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,5178,Large Furnished Room Near B'way,68,2,4.22,https://a0.muscache.com/im/pictures/12065/f070...,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
5,5203,Cozy Clean Guest Room - Family Apt,75,2,4.91,https://a0.muscache.com/im/pictures/103776/b37...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,5803,"Lovely Room 1, Garden, Best Area, Legal rental",98,4,4.7,https://a0.muscache.com/im/pictures/2884180/f1...,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [54]:
# The first six columns (id, name, price, minimum_nights, rating, images) are plain listing
# attributes; every column from position 6 onward is a one-hot indicator column.
room_user_vector_features = df.columns[6:]
df[room_user_vector_features]

Unnamed: 0,Bronx,Brooklyn,Manhattan,Queens,Staten Island,Entire home/apt,Hotel room,Private room,Shared room,Allerton,...,151-180,181-210,211-240,241-270,271-300,301-330,31-60,331-365,61-90,91-120
0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28738,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
28739,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
28741,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
28742,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


### **Normalizing the columns**

In [55]:
# We normalize the columns by multiplying the encoded value 1 by the average rating of the room and then divide it by 5 (maximum rating).
def normalize_df(df: pd.DataFrame, cols_to_norm: list) -> pd.DataFrame:
    """
    Weight the given columns by each row's rating expressed as a fraction of 5.
    :param df: dataframe holding a 'rating' column and the columns to weight (it is not modified)
    :param cols_to_norm: names of the columns to weight
    :return: a copy of the dataframe in which each listed column is multiplied by rating / 5
    """
    weighted = df.copy()
    for name in cols_to_norm:
        weighted[name] = weighted[name].mul(weighted["rating"]).div(5)
    return weighted

In [56]:
df = normalize_df(df, room_user_vector_features)
df

Unnamed: 0,id,name,price,minimum_nights,rating,images,Bronx,Brooklyn,Manhattan,Queens,...,151-180,181-210,211-240,241-270,271-300,301-330,31-60,331-365,61-90,91-120
0,2595,Skylit Midtown Castle,150,30,4.70,https://a0.muscache.com/im/pictures/f0813a11-4...,0.0,0.000,0.940,0.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,0.940,0.0,0.0
2,5121,BlissArtsSpace!,60,30,4.52,https://a0.muscache.com/im/pictures/2090980c-b...,0.0,0.904,0.000,0.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,0.904,0.0,0.0
4,5178,Large Furnished Room Near B'way,68,2,4.22,https://a0.muscache.com/im/pictures/12065/f070...,0.0,0.000,0.844,0.0,...,0.000,0.844,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
5,5203,Cozy Clean Guest Room - Family Apt,75,2,4.91,https://a0.muscache.com/im/pictures/103776/b37...,0.0,0.000,0.982,0.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
6,5803,"Lovely Room 1, Garden, Best Area, Legal rental",98,4,4.70,https://a0.muscache.com/im/pictures/2884180/f1...,0.0,0.940,0.000,0.0,...,0.000,0.000,0.0,0.0,0.0,0.94,0.0,0.000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28738,53560361,*NEW* Private Room for travelers in NYC!,58,1,4.18,https://a0.muscache.com/im/pictures/miso/Hosti...,0.0,0.836,0.000,0.0,...,0.836,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
28739,53560411,Beautiful 1-Bedroom in a Prime Location in NYC,60,1,4.33,https://a0.muscache.com/im/pictures/miso/Hosti...,0.0,0.866,0.000,0.0,...,0.866,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
28741,53572271,Beautiful Private Room In a shared apartment,71,1,4.60,https://a0.muscache.com/im/pictures/782eae85-b...,0.0,0.000,0.920,0.0,...,0.920,0.000,0.0,0.0,0.0,0.00,0.0,0.000,0.0,0.0
28742,53572597,Astoria Luxury suite 2A,89,1,5.00,https://a0.muscache.com/im/pictures/b5b3b2e7-c...,0.0,0.000,0.000,1.0,...,0.000,0.000,0.0,0.0,0.0,0.00,0.0,1.000,0.0,0.0


In [57]:
df.to_csv('../data/New_York_Airbnb_4_dec_2021_final.csv', index=False)