# Data Preprocessing

In [40]:
# importing libraries
from utils.DataIngestion import get_path, get_config

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import os

In [2]:
# getting the path
# get_path is a project helper imported from utils.DataIngestion; its lookup logic is
# not visible in this notebook. The value is used later as the directory for both
# reading the input pickle and writing the processed outputs.
path = get_path()
print(f"Path: {path}")

Path: D:\Machine Learning\MACHINE LEARNING PROJECTS\airbnb_price_prediction\data


In [3]:
# getting the config and file information
# get_config is a project helper (utils.DataIngestion) that loads the named YAML file;
# the input pickle's file name is stored under file_names -> pickled_output_dataframe
config = get_config("data_preparation_config.yml")
file = config['file_names']['pickled_output_dataframe']
print(f"File: {file}")

Path, path_to_yaml: D:\Machine Learning\MACHINE LEARNING PROJECTS\airbnb_price_prediction\notebooks\data_preparation_config.yml
File: AB_NYC_2019_output_nov8.pkl


In [59]:
# load the prepared dataframe and give it a fresh 0..n-1 row index
# (reset_index keeps the previous index as a regular `index` column)
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source
df = pd.read_pickle(os.path.join(path, file)).reset_index()
print(df.shape)
df.head()

(44977, 10)


Unnamed: 0,index,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,0,Brooklyn,Kensington,Private room,149,1,9,0.21,6,365
1,1,Manhattan,Midtown,Entire home/apt,225,1,45,0.38,2,355
2,2,Manhattan,Harlem,Private room,150,3,0,0.0,1,365
3,3,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,4.64,1,194
4,4,Manhattan,East Harlem,Entire home/apt,80,10,9,0.1,1,0


In [60]:
# count of missing values per column
df.isna().sum()

index                             0
neighbourhood_group               0
neighbourhood                     0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [61]:
# dtypes
# one entry per column; used to tell the object (string) columns from the numeric ones
df.dtypes

index                               int64
neighbourhood_group                object
neighbourhood                      object
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [62]:
# all column names as a plain Python list
df.columns.tolist()

['index',
 'neighbourhood_group',
 'neighbourhood',
 'room_type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [63]:
# target column (dependent) and predictor columns (independent)
dependent_feat = ['price']
independent_feat = [
    'neighbourhood_group',
    'neighbourhood',
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

In [64]:
# split of the predictors into categorical and continuous columns
categorical_feat = [
    'neighbourhood_group',
    'room_type',
    'neighbourhood',
]
cont_feat = [
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

In [65]:
# report the cardinality of each categorical feature
for label, column in (
    ("Neighbourhood Group", "neighbourhood_group"),
    ("Neighbourhoods", "neighbourhood"),
    ("Room Type", "room_type"),
):
    print(f"{label}: {df[column].nunique()} values")

Neighbourhood Group: 5 values
Neighbourhoods: 219 values
Room Type: 3 values


#### Dependent Feature:

In [66]:
# Target column, selected with a list so the result is a single-column DataFrame.
# Uses the dependent_feat list declared earlier in the notebook instead of
# re-typing the 'price' literal, so the target is defined in exactly one place.
y = df[dependent_feat]

#### Encoding the Categorical Features

- First -> encoding with the neighbourhood feature.
- Second -> encoding without the neighbourhood feature, since it has 219 unique values.

In [67]:
# One-hot encode the categorical features twice: once with every categorical column,
# once without the high-cardinality `neighbourhood` column.
# dtype is pinned to uint8 so the indicator columns are 0/1 integers on every pandas
# version (pandas 2.0 changed the get_dummies default dtype from uint8 to bool).
encoded_with = pd.get_dummies(df[categorical_feat], dtype=np.uint8)
encoded_without = pd.get_dummies(df[['neighbourhood_group', 'room_type']], dtype=np.uint8)

In [68]:
# preview of the one-hot encoding that includes the neighbourhood dummies
encoded_with.head()

Unnamed: 0,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,neighbourhood_Allerton,neighbourhood_Arden Heights,...,neighbourhood_Westchester Square,neighbourhood_Westerleigh,neighbourhood_Whitestone,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodside
0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# preview of the one-hot encoding built from neighbourhood_group and room_type only
encoded_without.head()

Unnamed: 0,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,0,1,0,0,0,0,1,0
1,0,0,1,0,0,1,0,0
2,0,0,1,0,0,0,1,0
3,0,1,0,0,0,1,0,0
4,0,0,1,0,0,1,0,0


#### Numerical Features

In [70]:
# list of numerical features
# (these are the columns selected and scaled in the following cells)
cont_feat

['minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [72]:
# pull the continuous columns out of df into their own frame
cont_df = df.loc[:, cont_feat]
print(cont_df.shape)
cont_df.head()

(44977, 5)


Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,1,9,0.21,6,365
1,1,45,0.38,2,355
2,3,0,0.0,1,365
3,1,270,4.64,1,194
4,10,9,0.1,1,0


*Scaling the data:*

In [73]:
# Standardise the continuous features with StandardScaler.
# - The scaler is fitted on df[cont_feat] rather than on cont_df, so re-running this
#   cell never refits on already-scaled values; scaler.mean_ / scaler.scale_ therefore
#   always describe the raw data.
# - fit_transform returns a bare ndarray, so df's index is attached explicitly.
#   Without it the frame gets a default 0..n-1 index and the later column-wise
#   pd.concat only lines up because df's index was reset when it was loaded.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split is made
# downstream, fitting on the training rows only would avoid leakage — confirm.
scaler = StandardScaler()
cont_df = pd.DataFrame(
    scaler.fit_transform(df[cont_feat]),
    columns=cont_feat,
    index=df.index,
)
cont_df.tail()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
44972,-0.452185,-0.530167,-0.688475,-0.14503,-0.766197
44973,-0.219526,-0.530167,-0.688475,-0.14503,-0.558565
44974,0.478453,-0.530167,-0.688475,-0.178543,-0.627776
44975,-0.568515,-0.530167,-0.688475,-0.010976,-0.820027
44976,0.129464,-0.530167,-0.688475,-0.178543,-0.658536


In [74]:
# row/column count of the scaled continuous features
cont_df.shape

(44977, 5)

#### Combining the Numerical and Categorical Features

In [75]:
# Build the two feature matrices by joining the encoded categorical columns with the
# scaled continuous columns.
# pd.concat(axis=1) aligns rows on index labels and silently fills mismatches with NaN,
# so check that the pieces share an index before joining them.
assert encoded_with.index.equals(cont_df.index), "encoded_with and cont_df indexes differ"
assert encoded_without.index.equals(cont_df.index), "encoded_without and cont_df indexes differ"

X_with_neighbourhood = pd.concat([encoded_with, cont_df], axis=1)
X_without_neighbourhood = pd.concat([encoded_without, cont_df], axis=1)

In [76]:
# shape and first rows of the feature matrix that includes the neighbourhood dummies
print(X_with_neighbourhood.shape)
X_with_neighbourhood.head()

(44977, 232)


Unnamed: 0,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,neighbourhood_Allerton,neighbourhood_Arden Heights,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodside,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,-0.568515,-0.332158,-0.558263,-0.010976,1.97147
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,-0.568515,0.459877,-0.452854,-0.14503,1.894569
2,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,-0.335855,-0.530167,-0.688475,-0.178543,1.97147
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,-0.568515,5.410101,2.18859,-0.178543,0.656467
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0.478453,-0.332158,-0.62647,-0.178543,-0.835408


In [77]:
# shape and first rows of the feature matrix without the neighbourhood dummies
print(X_without_neighbourhood.shape)
X_without_neighbourhood.head()

(44977, 13)


Unnamed: 0,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,0,1,0,0,0,0,1,0,-0.568515,-0.332158,-0.558263,-0.010976,1.97147
1,0,0,1,0,0,1,0,0,-0.568515,0.459877,-0.452854,-0.14503,1.894569
2,0,0,1,0,0,0,1,0,-0.335855,-0.530167,-0.688475,-0.178543,1.97147
3,0,1,0,0,0,1,0,0,-0.568515,5.410101,2.18859,-0.178543,0.656467
4,0,0,1,0,0,1,0,0,0.478453,-0.332158,-0.62647,-0.178543,-0.835408


*Final Datasets with Independent and Dependent features:*

In [78]:
# append the target column to each feature matrix (column-wise concatenation)
df_with_neighbourhood = pd.concat(objs=[X_with_neighbourhood, y], axis="columns")
df_without_neighbourhood = pd.concat(objs=[X_without_neighbourhood, y], axis="columns")

In [79]:
# shapes of the final frames (each is its feature matrix plus the target column)
print(df_with_neighbourhood.shape)
print(df_without_neighbourhood.shape)

(44977, 233)
(44977, 14)


In [80]:
# saving the processed dataframes as pickle files in the data directory
print("Saving the Processed Data Files..")
for frame, out_name in (
    (df_with_neighbourhood, "final_with_neighbourhood.pkl"),
    (df_without_neighbourhood, "final_without_neighbourhood.pkl"),
):
    frame.to_pickle(os.path.join(path, out_name))

Saving the Processed Data Files..
