### A Look at the Data

In order to get a better understanding of the data we will be looking at throughout this lesson, let's take a look at some of the characteristics of the dataset.

First, let's import the necessary libraries and read in the data.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
%matplotlib inline

# Load the three Boston tables (calendar, listings, reviews) from the working directory.
b_cal, b_list, b_rev = (
    pd.read_csv(csv_path)
    for csv_path in ('boston_calendar.csv', 'boston_listings.csv', 'boston_reviews.csv')
)

# Load the same three tables for Seattle.
# NOTE(review): the 'seatle_' prefix is kept exactly as written — presumably it
# matches the file names on disk; confirm before "correcting" the spelling.
s_cal, s_list, s_rev = (
    pd.read_csv(csv_path)
    for csv_path in ('seatle_calendar.csv', 'seatle_listings.csv', 'seatle_reviews.csv')
)

Preview the first rows of the calendar tables, then check how many rows and columns each one has.

In [15]:
# Preview the first two rows of the Boston calendar table.
# NOTE(review): the saved output includes year/month/day/dow, which are only added
# to b_cal by a later cell (execution count 15 here vs. 7 there), so this cell was
# run out of order; confirm the notebook survives Restart & Run All.
b_cal.head(2)

Unnamed: 0,listing_id,date,available,price,year,month,day,dow
0,12147973,2017-09-05,f,,2017,9,5,Tuesday
1,12147973,2017-09-04,f,,2017,9,4,Monday


In [16]:
# Preview the first two rows of the Seattle calendar table.
s_cal.head(2)

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00


In [17]:
# Report the (rows, columns) shape of both calendar tables, one per line.
for label, frame in (("b_cal  :", b_cal), ("s_cal  :", s_cal)):
    print(label, frame.shape)

b_cal  : (1308890, 8)
s_cal  : (1393570, 4)


Check for missing values (NaNs)

In [4]:
# Column dtypes and non-null counts. DataFrame.info() writes its report itself and
# returns None, so it is called directly; wrapping it in print() appended a stray
# "None" line to the output.
b_cal.info()
print('\n')

# Share of rows with a missing price vs. share of rows flagged unavailable ('f').
# The mean of a boolean mask is the proportion of True values.
print("Null prop of price column: ", b_cal['price'].isnull().mean())
print("Proportion of False(unit unavailable at this date ):", (b_cal['available'] == 'f').mean())

# Equal proportions suggest, but do not prove, that prices are missing exactly on
# the unavailable dates. Confirm it by counting missing prices among the available
# ('t') rows: 0 means every available row has a price.
b_cal.loc[b_cal['available'] == 't', 'price'].isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308890 entries, 0 to 1308889
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   listing_id  1308890 non-null  int64 
 1   date        1308890 non-null  object
 2   available   1308890 non-null  object
 3   price       643037 non-null   object
dtypes: int64(1), object(3)
memory usage: 39.9+ MB
None


Null prop of price column:  0.5087157820748879
Proportion of False(unit unavailable at this date ): 0.5087157820748879


0

Convert the `date` column to datetime to ease manipulation, analysis and modeling, and check the period over which the data was collected. I build a dataframe with the separate date components (year, month, day, day of week) and then join it onto the calendar table.

In [7]:
# Parse the date strings into datetime64 so the .dt accessor can be used below.
b_cal['date'] = pd.to_datetime(b_cal['date'])
print(b_cal.date.dtype)
print('\n')

# Calendar components derived from the parsed date; the frame shares b_cal's index,
# so the join below lines up row for row.
b_date_df = pd.DataFrame({
    'year': b_cal['date'].dt.year,
    'month': b_cal['date'].dt.month,
    'day': b_cal['date'].dt.day,
    'dow': b_cal['date'].dt.strftime("%A"),  # full weekday name, e.g. "Tuesday"
})

# Drop any previously derived columns before joining: DataFrame.join raises a
# ValueError on overlapping column names, which made this cell fail when re-run.
# errors='ignore' keeps the first run (columns not present yet) working.
b_cal = b_cal.drop(columns=b_date_df.columns, errors='ignore').join(b_date_df)
b_cal.head()

datetime64[ns]




Unnamed: 0,listing_id,date,available,price,year,month,day,dow
0,12147973,2017-09-05,f,,2017,9,5,Tuesday
1,12147973,2017-09-04,f,,2017,9,4,Monday
2,12147973,2017-09-03,f,,2017,9,3,Sunday
3,12147973,2017-09-02,f,,2017,9,2,Saturday
4,12147973,2017-09-01,f,,2017,9,1,Friday


In [8]:
# Percentage of all Boston calendar rows that fall in each (year, month).
# groupby sorts its keys, so the result is already in chronological order, and
# reset_index yields a fresh 0..n-1 index with year/month back as columns.
b_period = (
    b_cal.groupby(['year', 'month'])
    .size()                      # rows per (year, month)
    .div(b_cal.shape[0])         # -> fraction of all rows
    .mul(100)                    # -> percent
    .round(1)
    .reset_index(name='obs_prop_per_month')
)
b_period

Unnamed: 0,year,month,obs_prop_per_month
0,2016,9,6.8
1,2016,10,8.5
2,2016,11,8.2
3,2016,12,8.5
4,2017,1,8.5
5,2017,2,7.7
6,2017,3,8.5
7,2017,4,8.2
8,2017,5,8.5
9,2017,6,8.2


In [9]:
# Preview the first two rows of the Boston listings table.
b_list.head(2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,9.0,f,,,t,moderate,f,f,1,1.3


In [None]:
# Report the (rows, columns) shape of the Boston listings and reviews tables.
for label, frame in (("b_list :", b_list), ("b_rev  :", b_rev)):
    print(label, frame.shape)