In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Functions.functions import convert_price, convert_rate

In [95]:
AirbnbData = pd.read_csv("Data/airbnb_listings.csv")

In [101]:
AirbnbData['price']

0        $70.00
1        $67.00
2        $45.00
3        $80.00
4       $251.00
         ...   
7561    $200.00
7562     $48.00
7563     $64.00
7564    $240.00
7565    $280.00
Name: price, Length: 7566, dtype: object

In [102]:
s = '$,'
AirbnbData['price'] = AirbnbData['price'].replace('['+s+']','',regex=True).astype(float)

In [103]:
AirbnbData['price']

0        70.0
1        67.0
2        45.0
3        80.0
4       251.0
        ...  
7561    200.0
7562     48.0
7563     64.0
7564    240.0
7565    280.0
Name: price, Length: 7566, dtype: float64

In [3]:
# Effect of Amenities on Price
amenities = ['amenities', 'price']

# Minimum Nights Requirement and Its Effect on Booking Rate
booking_rate = ['minimum_nights', 'number_of_reviews_ltm', 'availability_365']

# Impact of Property Type and Room Type on Price and Review Scores
property = ['property_type', 'room_type', 'price', 'review_scores_rating']

# Effect of Number of Reviews on Future Availability
future_availability = ['number_of_reviews', 'availability_365']

# Impact of Superhost Status on Price and Occupancy Rates
superhost = ['host_is_superhost', 'price', 'availability_365']

# Seasonal Trends in Booking and Reviews
booking_reviews = ['last_review', 'availability_365', 'number_of_reviews']

In [4]:
features = ['id','description','neighborhood_overview','host_location','host_response_time','host_response_rate','host_acceptance_rate',
            'host_is_superhost','host_neighbourhood','host_verifications','host_identity_verified','neighbourhood_cleansed',
            'latitude','longitude','property_type','room_type','accommodates','bathrooms_text','bedrooms','beds','amenities','price',
            'minimum_nights','maximum_nights','minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights',
            'minimum_nights_avg_ntm','maximum_nights_avg_ntm','has_availability','availability_30','availability_60','availability_90',
            'availability_365','number_of_reviews','number_of_reviews_ltm','number_of_reviews_l30d','review_scores_rating','review_scores_accuracy',
            'review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value',
            'instant_bookable','reviews_per_month']

In [5]:
features_g1 = features[:10] #0 to 9
features_g2 = features[10:20] #10 to 19
features_g3 = features[20:30] #20 to 29
features_g4 = features[30:40] #30 to 39
features_g5 = features[40:] #40 to 47

## **Cleaning the variables for the second part**

### First features group

In [6]:
# First features group
AirbnbData[features_g1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     7566 non-null   int64 
 1   description            7411 non-null   object
 2   neighborhood_overview  4194 non-null   object
 3   host_location          6232 non-null   object
 4   host_response_time     3932 non-null   object
 5   host_response_rate     3932 non-null   object
 6   host_acceptance_rate   4191 non-null   object
 7   host_is_superhost      7566 non-null   object
 8   host_neighbourhood     5234 non-null   object
 9   host_verifications     7566 non-null   object
dtypes: int64(1), object(9)
memory usage: 591.2+ KB


In [7]:
AirbnbData[features_g1].isna().sum()

id                          0
description               155
neighborhood_overview    3372
host_location            1334
host_response_time       3634
host_response_rate       3634
host_acceptance_rate     3375
host_is_superhost           0
host_neighbourhood       2332
host_verifications          0
dtype: int64

In [8]:
# Making a copy for working with the first group
features_g1DF = AirbnbData[features_g1].copy()

In [9]:
features_g1DF.fillna({'description':'No comments'}, inplace=True)
features_g1DF.fillna({'neighborhood_overview':'No comments'}, inplace=True)
features_g1DF.fillna({'host_location':'No location'}, inplace=True)

In [10]:
features_g1DF.host_response_time.value_counts()

host_response_time
within an hour        2628
within a few hours     748
within a day           465
a few days or more      91
Name: count, dtype: int64

In [11]:
# Setting the null value with the most common value
features_g1DF.fillna({'host_response_time':'within an hour'}, inplace=True)

In [12]:
features_g1DF['host_response_rate'] = convert_rate(features_g1DF,'host_response_rate')
features_g1DF['host_acceptance_rate'] = convert_rate(features_g1DF,'host_acceptance_rate')

# Replacing the null values for the mean rate
features_g1DF.fillna({'host_response_rate': round(features_g1DF['host_response_rate'].mean(),4)}, inplace=True)
features_g1DF.fillna({'host_acceptance_rate': round(features_g1DF['host_acceptance_rate'].mean(),4)}, inplace=True)

In [13]:
features_g1DF.fillna({'host_neighbourhood':'No location'}, inplace=True)

In [14]:
features_g1DF.isna().sum()

id                       0
description              0
neighborhood_overview    0
host_location            0
host_response_time       0
host_response_rate       0
host_acceptance_rate     0
host_is_superhost        0
host_neighbourhood       0
host_verifications       0
dtype: int64

### Second features group

In [15]:
AirbnbData[features_g2].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   host_identity_verified  7566 non-null   object 
 1   neighbourhood_cleansed  7566 non-null   object 
 2   latitude                7566 non-null   float64
 3   longitude               7566 non-null   float64
 4   property_type           7566 non-null   object 
 5   room_type               7566 non-null   object 
 6   accommodates            7566 non-null   int64  
 7   bathrooms_text          7562 non-null   object 
 8   bedrooms                7361 non-null   float64
 9   beds                    7472 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 591.2+ KB


In [16]:
AirbnbData[features_g2].isna().sum()

host_identity_verified      0
neighbourhood_cleansed      0
latitude                    0
longitude                   0
property_type               0
room_type                   0
accommodates                0
bathrooms_text              4
bedrooms                  205
beds                       94
dtype: int64

In [17]:
features_g2DF = AirbnbData[features_g2].copy()

In [18]:
features_g2DF.bathrooms_text.value_counts()

bathrooms_text
1 bath               2581
1 shared bath        1495
1 private bath        978
2 baths               851
1.5 baths             569
1.5 shared baths      307
2.5 baths             275
3 baths               145
2 shared baths        121
3.5 baths              58
4 baths                37
2.5 shared baths       31
0 shared baths         19
0 baths                18
3 shared baths         16
4.5 baths              14
Shared half-bath       10
Half-bath               8
5 baths                 7
Private half-bath       4
3.5 shared baths        4
5.5 baths               3
4 shared baths          2
6 baths                 2
9.5 baths               1
7.5 baths               1
8.5 baths               1
6 shared baths          1
8 baths                 1
7 baths                 1
6.5 baths               1
Name: count, dtype: int64

## **Cleaning the variables for doing the trends**

### Amenities

In [19]:
# Checking null values
AirbnbData.loc[:,amenities].isna().sum()

amenities    0
price        0
dtype: int64

In [20]:
# Assigning the variable for creating the trend
amenitiesDF = AirbnbData.loc[:,amenities]

In [21]:
#Transforming data type prices

amenitiesDF['price'] = convert_price(amenitiesDF,'price')
amenitiesDF['price'].describe().T

count     7566.000000
mean       182.327953
std       1170.750214
min         10.000000
25%         65.000000
50%        105.000000
75%        192.750000
max      99149.000000
Name: price, dtype: float64

In [22]:
amenitiesDF

Unnamed: 0,amenities,price
0,"[""Oven"", ""Hot water kettle"", ""Fire extinguishe...",70.0
1,"[""Oven"", ""Fire extinguisher"", ""Dryer"", ""Hanger...",67.0
2,"[""Oven"", ""Fire extinguisher"", ""Hangers"", ""Carb...",45.0
3,"[""Oven"", ""Hot water kettle"", ""Fire extinguishe...",80.0
4,"[""Fire extinguisher"", ""Dryer"", ""Hangers"", ""Ind...",251.0
...,...,...
7561,"[""TV"", ""Lock on bedroom door"", ""Security camer...",200.0
7562,"[""Essentials"", ""Wifi"", ""Hair dryer"", ""Free par...",48.0
7563,"[""Oven"", ""Fire extinguisher"", ""Dryer"", ""Hanger...",64.0
7564,"[""Hot water kettle"", ""Free washer"", ""HDTV"", ""O...",240.0


### Booking rate

In [23]:
AirbnbData.loc[:,booking_rate].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   minimum_nights         7566 non-null   int64
 1   number_of_reviews_ltm  7566 non-null   int64
 2   availability_365       7566 non-null   int64
dtypes: int64(3)
memory usage: 177.5 KB


In [24]:
AirbnbData.loc[:,booking_rate].isna().sum()

minimum_nights           0
number_of_reviews_ltm    0
availability_365         0
dtype: int64

In [25]:
# Assigning the variable for creating the trend

booking_rateDF = AirbnbData.loc[:,booking_rate]
booking_rateDF.head()

Unnamed: 0,minimum_nights,number_of_reviews_ltm,availability_365
0,3,31,104
1,3,28,110
2,2,44,290
3,1,41,74
4,3,17,31


### Property

In [26]:
AirbnbData.loc[:,property].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   property_type         7566 non-null   object 
 1   room_type             7566 non-null   object 
 2   price                 7566 non-null   object 
 3   review_scores_rating  6209 non-null   float64
dtypes: float64(1), object(3)
memory usage: 236.6+ KB


In [27]:
AirbnbData.loc[:,property].isna().sum()

property_type              0
room_type                  0
price                      0
review_scores_rating    1357
dtype: int64

In [28]:
# Assigning the variable for creating the trend
propertyDF = AirbnbData.loc[:,property]

In [29]:
propertyDF.review_scores_rating.describe()

count    6209.000000
mean        4.603239
std         0.771441
min         0.000000
25%         4.560000
50%         4.810000
75%         5.000000
max         5.000000
Name: review_scores_rating, dtype: float64

In [30]:
# Replacing the null values with the mean
propertyDF.fillna({'review_scores_rating':round(propertyDF['review_scores_rating'].mean(),2)}, inplace=True)

In [31]:
propertyDF['price'] = convert_price(propertyDF,'price')

In [32]:
propertyDF.isna().sum()

property_type           0
room_type               0
price                   0
review_scores_rating    0
dtype: int64

In [33]:
propertyDF.head()

Unnamed: 0,property_type,room_type,price,review_scores_rating
0,Private room in bungalow,Private room,70.0,4.78
1,Private room in bungalow,Private room,67.0,4.79
2,Private room in home,Private room,45.0,4.74
3,Private room in townhouse,Private room,80.0,4.84
4,Entire rental unit,Entire home/apt,251.0,4.63


### Future Availability

In [34]:
AirbnbData.loc[:,future_availability].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   number_of_reviews  7566 non-null   int64
 1   availability_365   7566 non-null   int64
dtypes: int64(2)
memory usage: 118.3 KB


In [35]:
AirbnbData.loc[:,future_availability].isna().sum()

number_of_reviews    0
availability_365     0
dtype: int64

In [36]:
future_availabilityDF = AirbnbData.loc[:,future_availability]

In [37]:
future_availabilityDF.head()

Unnamed: 0,number_of_reviews,availability_365
0,258,104
1,210,110
2,355,290
3,326,74
4,64,31


### Superhost

In [38]:
AirbnbData.loc[:,superhost].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   host_is_superhost  7566 non-null   object
 1   price              7566 non-null   object
 2   availability_365   7566 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 177.5+ KB


In [39]:
AirbnbData.loc[:,superhost].isna().sum()

host_is_superhost    0
price                0
availability_365     0
dtype: int64

In [40]:
superhostDF = AirbnbData.loc[:,superhost]

In [41]:
superhostDF['price'] = convert_price(superhostDF,'price')

In [42]:
superhostDF.head()

Unnamed: 0,host_is_superhost,price,availability_365
0,t,70.0,104
1,t,67.0,110
2,f,45.0,290
3,t,80.0,74
4,f,251.0,31


### Booking review

In [43]:
AirbnbData.loc[:,booking_reviews].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   last_review        6209 non-null   object
 1   availability_365   7566 non-null   int64 
 2   number_of_reviews  7566 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 177.5+ KB


In [44]:
AirbnbData.loc[:,booking_reviews].isna().sum()

last_review          1357
availability_365        0
number_of_reviews       0
dtype: int64

In [45]:
booking_reviewsDF = AirbnbData.loc[:,booking_reviews]

In [46]:
booking_reviewsDF.last_review.value_counts()

last_review
2022-08-28    196
2022-09-11    171
2022-09-10    157
2022-09-04    136
2022-08-29    117
             ... 
2018-08-09      1
2020-08-05      1
2016-12-23      1
2017-04-09      1
2022-05-09      1
Name: count, Length: 1458, dtype: int64

In [47]:
print("Old shape:", booking_reviewsDF.shape)

booking_reviewsDF.dropna(inplace=True)
print("New shape:", booking_reviewsDF.shape)

Old shape: (7566, 3)
New shape: (6209, 3)


In [48]:
booking_reviewsDF.head()

Unnamed: 0,last_review,availability_365,number_of_reviews
0,2022-09-09,104,258
1,2022-09-11,110,210
2,2022-09-01,290,355
3,2022-08-31,74,326
4,2022-06-28,31,64
