# Importing the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 
import statistics
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load Data

In [2]:
abnb_calendar1_df = pd.read_csv('calendar_1.csv.gz')

abnb_calendar2_df = pd.read_csv('calendar.csv.gz')

abnb_calendar_df = pd.concat([abnb_calendar1_df,abnb_calendar2_df],  axis = 0).reset_index(drop=True)
abnb_calendar_df

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,649100413671762836,2022-06-16,f,$114.00,$114.00,1.0,365.0
1,649100413671762836,2022-06-17,f,$114.00,$114.00,1.0,365.0
2,649100413671762836,2022-06-18,f,$114.00,$114.00,1.0,365.0
3,649100413671762836,2022-06-19,f,$114.00,$114.00,1.0,365.0
4,649100413671762836,2022-06-20,f,$114.00,$114.00,1.0,365.0
...,...,...,...,...,...,...,...
3316751,1778252,2023-06-11,t,$100.00,$100.00,21.0,730.0
3316752,1778252,2023-06-12,t,$100.00,$100.00,21.0,730.0
3316753,1778252,2023-06-13,t,$100.00,$100.00,21.0,730.0
3316754,1778252,2023-06-14,t,$100.00,$100.00,21.0,730.0


# Discovery

In [3]:
abnb_calendar_df.isna().sum()

listing_id          0
date                0
available           0
price             183
adjusted_price    183
minimum_nights     14
maximum_nights     14
dtype: int64

In [4]:
abnb_calendar_df.duplicated().sum()
abnb_calendar_df = abnb_calendar_df.drop_duplicates()

In [5]:
abnb_calendar_df.isna().sum()

listing_id          0
date                0
available           0
price             183
adjusted_price    183
minimum_nights     14
maximum_nights     14
dtype: int64

In [6]:
abnb_calendar_df.duplicated().sum()

0

In [7]:
abnb_calendar_df_rented = abnb_calendar_df[abnb_calendar_df['available'] == 'f' ]
abnb_calendar_df_rented.reset_index(drop = True, inplace=True)
abnb_calendar_df_rented

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,649100413671762836,2022-06-16,f,$114.00,$114.00,1.0,365.0
1,649100413671762836,2022-06-17,f,$114.00,$114.00,1.0,365.0
2,649100413671762836,2022-06-18,f,$114.00,$114.00,1.0,365.0
3,649100413671762836,2022-06-19,f,$114.00,$114.00,1.0,365.0
4,649100413671762836,2022-06-20,f,$114.00,$114.00,1.0,365.0
...,...,...,...,...,...,...,...
1589619,1758525,2023-06-11,f,$115.00,$115.00,2.0,30.0
1589620,1758525,2023-06-12,f,$115.00,$115.00,2.0,30.0
1589621,1758525,2023-06-13,f,$115.00,$115.00,2.0,30.0
1589622,1758525,2023-06-14,f,$115.00,$115.00,2.0,30.0


In [8]:
abnb_calendar_df_rented.isna().sum()

listing_id         0
date               0
available          0
price             48
adjusted_price    48
minimum_nights    14
maximum_nights    14
dtype: int64

In [9]:
abnb_calendar_df_rented['price'] = abnb_calendar_df_rented['price'].fillna('$115.00')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnb_calendar_df_rented['price'] = abnb_calendar_df_rented['price'].fillna('$115.00')


In [10]:
abnb_calendar_df_rented['adjusted_price'] = abnb_calendar_df_rented['adjusted_price'].fillna('$115.00')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnb_calendar_df_rented['adjusted_price'] = abnb_calendar_df_rented['adjusted_price'].fillna('$115.00')


In [11]:
abnb_calendar_df_rented = abnb_calendar_df_rented.drop(columns = ['minimum_nights','maximum_nights'])

In [12]:
abnb_calendar_df_rented

Unnamed: 0,listing_id,date,available,price,adjusted_price
0,649100413671762836,2022-06-16,f,$114.00,$114.00
1,649100413671762836,2022-06-17,f,$114.00,$114.00
2,649100413671762836,2022-06-18,f,$114.00,$114.00
3,649100413671762836,2022-06-19,f,$114.00,$114.00
4,649100413671762836,2022-06-20,f,$114.00,$114.00
...,...,...,...,...,...
1589619,1758525,2023-06-11,f,$115.00,$115.00
1589620,1758525,2023-06-12,f,$115.00,$115.00
1589621,1758525,2023-06-13,f,$115.00,$115.00
1589622,1758525,2023-06-14,f,$115.00,$115.00


In [13]:
abnb_calendar_df_rented['price'] = abnb_calendar_df_rented['price'].str.replace('$', '')
abnb_calendar_df_rented['adjusted_price'] = abnb_calendar_df_rented['adjusted_price'].str.replace('$', '')

In [14]:
abnb_calendar_df_rented

Unnamed: 0,listing_id,date,available,price,adjusted_price
0,649100413671762836,2022-06-16,f,114.00,114.00
1,649100413671762836,2022-06-17,f,114.00,114.00
2,649100413671762836,2022-06-18,f,114.00,114.00
3,649100413671762836,2022-06-19,f,114.00,114.00
4,649100413671762836,2022-06-20,f,114.00,114.00
...,...,...,...,...,...
1589619,1758525,2023-06-11,f,115.00,115.00
1589620,1758525,2023-06-12,f,115.00,115.00
1589621,1758525,2023-06-13,f,115.00,115.00
1589622,1758525,2023-06-14,f,115.00,115.00


In [15]:
test = abnb_calendar_df_rented['price'] == abnb_calendar_df_rented['adjusted_price']

In [16]:
test.value_counts()

True     1564366
False      25258
dtype: int64

In [17]:
abnb_calendar_df_rented.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1589624 entries, 0 to 1589623
Data columns (total 5 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   listing_id      1589624 non-null  int64 
 1   date            1589624 non-null  object
 2   available       1589624 non-null  object
 3   price           1589624 non-null  object
 4   adjusted_price  1589624 non-null  object
dtypes: int64(1), object(4)
memory usage: 60.6+ MB


In [18]:
abnb_calendar_df_rented['date']=pd.to_datetime(abnb_calendar_df_rented['date'])

In [19]:
abnb_calendar_df_rented.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1589624 entries, 0 to 1589623
Data columns (total 5 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   listing_id      1589624 non-null  int64         
 1   date            1589624 non-null  datetime64[ns]
 2   available       1589624 non-null  object        
 3   price           1589624 non-null  object        
 4   adjusted_price  1589624 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 60.6+ MB


In [20]:
abnb_calendar_df_rented['year'] = pd.DatetimeIndex(abnb_calendar_df_rented['date']).year

In [21]:
abnb_calendar_df_rented['month_name'] = abnb_calendar_df_rented['date'].dt.month_name()

In [22]:
abnb_calendar_df_rented

Unnamed: 0,listing_id,date,available,price,adjusted_price,year,month_name
0,649100413671762836,2022-06-16,f,114.00,114.00,2022,June
1,649100413671762836,2022-06-17,f,114.00,114.00,2022,June
2,649100413671762836,2022-06-18,f,114.00,114.00,2022,June
3,649100413671762836,2022-06-19,f,114.00,114.00,2022,June
4,649100413671762836,2022-06-20,f,114.00,114.00,2022,June
...,...,...,...,...,...,...,...
1589619,1758525,2023-06-11,f,115.00,115.00,2023,June
1589620,1758525,2023-06-12,f,115.00,115.00,2023,June
1589621,1758525,2023-06-13,f,115.00,115.00,2023,June
1589622,1758525,2023-06-14,f,115.00,115.00,2023,June


In [23]:
from datetime import date, datetime

Y = 2000 # dummy leap year to allow input X-02-29 (leap day)
seasons = [('winter', (date(Y,  1,  1),  date(Y,  3, 20))),
           ('spring', (date(Y,  3, 21),  date(Y,  6, 20))),
           ('summer', (date(Y,  6, 21),  date(Y,  9, 22))),
           ('autumn', (date(Y,  9, 23),  date(Y, 12, 20))),
           ('winter', (date(Y, 12, 21),  date(Y, 12, 31)))]

def get_season(now):
    if isinstance(now, datetime):
        now = now.date()
    now = now.replace(year=Y)
    return next(season for season, (start, end) in seasons
                if start <= now <= end)

In [24]:
abnb_calendar_df_rented['season'] = abnb_calendar_df_rented['date'].apply(get_season)

In [25]:
abnb_calendar_df_rented

Unnamed: 0,listing_id,date,available,price,adjusted_price,year,month_name,season
0,649100413671762836,2022-06-16,f,114.00,114.00,2022,June,spring
1,649100413671762836,2022-06-17,f,114.00,114.00,2022,June,spring
2,649100413671762836,2022-06-18,f,114.00,114.00,2022,June,spring
3,649100413671762836,2022-06-19,f,114.00,114.00,2022,June,spring
4,649100413671762836,2022-06-20,f,114.00,114.00,2022,June,spring
...,...,...,...,...,...,...,...,...
1589619,1758525,2023-06-11,f,115.00,115.00,2023,June,spring
1589620,1758525,2023-06-12,f,115.00,115.00,2023,June,spring
1589621,1758525,2023-06-13,f,115.00,115.00,2023,June,spring
1589622,1758525,2023-06-14,f,115.00,115.00,2023,June,spring


In [27]:
abnb_calendar_df_rented.to_csv("/Users/begumerdem/IH-Labs/WEEK_5_Lab/archive/abnb_calendar_df_rented.csv", index = False)