### A Look at the Data

In order to get a better understanding of the data we will be looking at throughout this lesson, let's take a look at some of the characteristics of the dataset.

First, let's read in the data and necessary libraries.

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [80]:
# Load the calendar, listings and reviews tables for each city.
# File names are reproduced exactly as written (including the "seatle" spelling),
# presumably matching the files on disk.
b_cal, b_list, b_rev = map(
    pd.read_csv,
    ('boston_calendar.csv', 'boston_listings.csv', 'boston_reviews.csv'),
)

s_cal, s_list, s_rev = map(
    pd.read_csv,
    ('seatle_calendar.csv', 'seatle_listings.csv', 'seatle_reviews.csv'),
)

## Task 1: Business Understanding

### Step 1: Basic Exploration with minimal cleaning
*To become familiar with the data and gather insights for formulating questions*

> **Boston & Seattle Calendar**

#### Check the sizes of cols and rows 

In [81]:
# Preview two rows of each calendar, each followed by blank lines, then report the shapes.
# A single print with sep='\n' emits the same text as the separate print calls did.
print(b_cal.head(2), '\n', s_cal.head(2), '\n', sep='\n')
print("b_cal  :", b_cal.shape)
print("s_cal  :", s_cal.shape)

   listing_id        date available price
0    12147973  2017-09-05         f   NaN
1    12147973  2017-09-04         f   NaN


   listing_id        date available   price
0      241032  2016-01-04         t  $85.00
1      241032  2016-01-05         t  $85.00


b_cal  : (1308890, 4)
s_cal  : (1393570, 4)


#### Check Nulls

In [82]:
def report_calendar_nulls(cal, label):
    """Print a short null-value summary for one calendar table.

    Parameters
    ----------
    cal : pandas.DataFrame
        Calendar frame; the code reads its ``price`` and ``available`` columns.
    label : str
        Heading printed before the summary.

    Prints the names of all columns containing nulls, the null proportion of
    ``price``, the proportion of rows with ``available == 'f'``, and the number
    of null prices among rows with ``available == 't'``.
    """
    n_rows = cal.shape[0]
    null_counts = cal.isnull().sum()  # computed once and reused
    null_cols = null_counts[null_counts > 0].index

    print(label)
    # Join every affected column name: this prints the same text as before when
    # only one column has nulls, and no longer raises IndexError when none do.
    print("Cols with nulls: ", ", ".join(map(str, null_cols)))
    print("Null prop of price column: ", cal.price.isnull().sum()/n_rows)
    print("Proportion of False(unit unavailable at this date ):", cal.available[cal.available =='f' ].count()/n_rows)
    # If the two proportions match, the null prices coincide with unavailable
    # dates; confirm by counting null prices on available dates.
    print("Nulls when units are available: ", cal[cal['available']== 't']['price'].isnull().sum())


report_calendar_nulls(b_cal, 'Boston Cal: ')
print('\n')
report_calendar_nulls(s_cal, 'Seatle Cal: ')


Boston Cal: 
Cols with nulls:  price
Null prop of price column:  0.5087157820748879
Proportion of False(unit unavailable at this date ): 0.5087157820748879
Nulls when units are available:  0


Seatle Cal: 
Cols with nulls:  price
Null prop of price column:  0.32938998399793334
Proportion of False(unit unavailable at this date ): 0.32938998399793334
Nulls when units are available:  0


#### Convert the date column to datetime to ease manipulation, analysis and modeling. I create a dataframe with separate date parts taken from the date column, to check the period in which the data was collected.

In [83]:
def create_dateparts(df, date_col):
    """Return a copy of ``df`` with ``date_col`` parsed and date-part columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the date column. It is not modified.
    date_col : str
        Name of the column holding the dates (strings or datetimes).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``date_col`` is datetime-typed and the columns
        ``year``, ``month``, ``day`` and ``dow`` (weekday name) are appended.

    Also prints whether the parsed column has a datetime64 dtype.
    """
    # Use the column named by the caller instead of a hard-coded ``df.date``.
    dates = pd.to_datetime(df[date_col])
    # Resolution-agnostic check (ns, us, ...) so the sanity print stays meaningful.
    print(pd.api.types.is_datetime64_any_dtype(dates))

    # ``assign`` builds a new frame, so the input is left untouched, and it
    # overwrites existing date-part columns, so the function can be re-run on
    # its own output.
    return df.assign(**{
        date_col: dates,
        'year': dates.dt.year,
        'month': dates.dt.month,
        'day': dates.dt.day,
        'dow': dates.dt.strftime("%A"),
    })

In [91]:
# Build the Boston calendar with year/month/day/dow columns and preview three rows.
b_cal_1 = create_dateparts(b_cal, 'date')
b_cal_1.head(3)

True


Unnamed: 0,listing_id,date,available,price,year,month,day,dow
0,12147973,2017-09-05,f,,2017,9,5,Tuesday
1,12147973,2017-09-04,f,,2017,9,4,Monday
2,12147973,2017-09-03,f,,2017,9,3,Sunday


In [92]:
# Build the Seattle calendar with year/month/day/dow columns and preview three rows.
s_cal_1 = create_dateparts(s_cal, 'date')
s_cal_1.head(3)

True


Unnamed: 0,listing_id,date,available,price,year,month,day,dow
0,241032,2016-01-04,t,$85.00,2016,1,4,Monday
1,241032,2016-01-05,t,$85.00,2016,1,5,Tuesday
2,241032,2016-01-06,f,,2016,1,6,Wednesday


In [98]:
# Count the distinct listing ids per calendar; dropna=False keeps a missing id
# (if any) counted once, as len(unique()) did.
print("Number of unique Listing IDs in Boston Calendar: ", b_cal_1['listing_id'].nunique(dropna=False))
print("Number of unique Listing IDs in Seatle Calendar: ", s_cal_1['listing_id'].nunique(dropna=False))

Number of unique Listing IDs in Boston Calendar:  3585
Number of unique Listing IDs in Seatle Calendar:  3818


In [93]:
def get_period_df(df):
    """Count the rows recorded for every calendar day, in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``year``, ``month`` and ``day`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (year, month, day) with columns ``year``,
        ``month``, ``day`` and ``count``, sorted by year, month and day, with
        a fresh 0..n-1 index.
    """
    # Grouping on all three keys and taking the size keeps days in calendar
    # order (value_counts ordered them by frequency within a month) and does
    # not depend on how a given pandas version names the value_counts result.
    period = (
        df.groupby(['year', 'month', 'day'], sort=True)
          .size()
          .reset_index(name='count')
    )
    return period

In [96]:
# Per-day row counts for the Boston calendar.
b_period =get_period_df(b_cal_1)
b_period

Unnamed: 0,year,month,day,count
0,2016,9,6,3586
1,2016,9,7,3586
2,2016,9,8,3586
3,2016,9,9,3586
4,2016,9,10,3586
...,...,...,...,...
360,2017,9,1,3586
361,2017,9,2,3586
362,2017,9,3,3586
363,2017,9,4,3586


In [97]:
# Per-day row counts for the Seattle calendar.
s_period =get_period_df(s_cal_1)
s_period

Unnamed: 0,year,month,day,count
0,2016,1,4,3818
1,2016,1,5,3818
2,2016,1,6,3818
3,2016,1,7,3818
4,2016,1,8,3818
...,...,...,...,...
360,2016,12,29,3818
361,2016,12,30,3818
362,2016,12,31,3818
363,2017,1,1,3818


In [130]:
# Distinct per-day row counts for each city; a single value means every day has the same number of rows.
print(b_period['count'].unique())
print(s_period['count'].unique())

[3586]
[3818]


#### The daily counts should equal the number of unique listing ids, because every listing spans the same period day by day. Boston shows 3586 rows per day but only 3585 unique ids, so let's check for anomalies.

In [105]:
def check_anomalies(df, col, expected_rows=365):
    """Print the values of ``col`` whose number of rows differs from ``expected_rows``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col : str
        Name of the id column to count rows for.
    expected_rows : int, default 365
        Number of rows each id is expected to have (one per day of a year).

    Returns
    -------
    None
        The sorted list of offending ids is printed, not returned.

    Notes
    -----
    Rows whose ``col`` value is missing are ignored by the count.
    """
    # ``df[col]`` looks up the column named by the argument; ``df.col`` looked
    # for a column literally called "col" and raised AttributeError.
    # One value_counts pass replaces a full-frame filter per id.
    rows_per_id = df[col].value_counts(sort=False)
    # tolist() yields plain Python scalars so the printed list reads the same
    # regardless of the numpy version's scalar repr.
    list_ids_not_year_long = sorted(rows_per_id.index[rows_per_id != expected_rows].tolist())
    print("Entry Id that doesn't span 1 year: " , list_ids_not_year_long)

In [106]:
# Boston: print the listing ids whose number of calendar rows differs from 365
check_anomalies(b_cal_1, 'listing_id')

Entry Id that doesn't span 1 year:  [12898806]


In [107]:
# Seattle: print the listing ids whose number of calendar rows differs from 365
check_anomalies(s_cal_1, 'listing_id')

Entry Id that doesn't span 1 year:  []


In [123]:
## Inspect the anomalous listing found in the Boston calendar
print("Span of the entries for this listing, should be 365: ", len(b_cal_1[b_cal_1['listing_id']== 12898806]))
## 730 rows = 365 * 2, which suggests every date was recorded twice
one_or_two = (
    b_cal_1.loc[b_cal_1['listing_id']==12898806]
    .groupby(['year', 'month', 'day'])['day']
    .count()
    .unique()[0]  # first distinct rows-per-date value
)
print("Should be 1: ", one_or_two)
## A value of 2 confirms the duplication

Span of the entries for this listing, should be 365:  730
Should be 1:  2


In [None]:
### TODO: manage this duplicate (listing 12898806 has two rows per date in the Boston calendar)

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
### Comments:  
[Boston & Seattle Calendar]
- The datasets hold listing dates, availability and price, tracked over a year for every listing id
- The nulls are not data-entry errors: they all come from the structure of the data (a listing that isn't available on a given date has no price)
<br><br>
- The Boston calendar dataset ranges from `September'16` to `September'17`; the only nulls are in `price`, on dates when the listing is unavailable
- It has `1308890` rows and `4` cols
- I added 4 cols containing date parts (`year`, `month`, `day`, `dow`) that will aid further analysis and modeling
- The Seattle calendar dataset ranges from `January'16` to `January'17`; the only nulls are in `price`, on dates when the listing is unavailable
- It has `1393570` rows and `4` cols
- I added 4 cols containing date parts (`year`, `month`, `day`, `dow`) that will aid further analysis and modeling
<br><br>
- Number of unique listing ids in the Boston calendar: `3585`
- Number of unique listing ids in the Seattle calendar: `3818`
- Listing id `12898806` is duplicated in the Boston calendar (730 rows instead of 365, i.e. two rows per date); this has to be fixed!

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 _______________________________________________________________________________________________________________________

## Step 1: Continued

> **Boston & Seattle Listings**

In [18]:
# Preview the first row of the Boston listings to see which columns are available.
b_list.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,


In [34]:
# Preview the first row of the Seattle listings to see which columns are available.
s_list.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07


 ### Check the sizes of cols and rows 

In [94]:
# Report the dimensions of both listings tables, then the columns found only in Boston.
print("Boston listings size :", b_list.shape)
print("Seatle listings size :", s_list.shape)
set_difference = set(b_list.columns).difference(s_list.columns)
print("Columns in Boston but not in Seatle:  ", set_difference)

Boston listings size : (3585, 95)
Seatle listings size : (3818, 92)
Columns in Boston but not in Seatle:   {'house_rules', 'access', 'interaction'}


In [93]:
# Preview three rows of the house_rules, access and interaction columns of the Boston listings.
b_list[['house_rules', 'access', 'interaction']].head(3)

Unnamed: 0,house_rules,access,interaction
0,Clean up and treat the home the way you'd like...,"You will have access to 2 bedrooms, a living r...",
1,Pet friendly but please confirm with me if the...,Apt has one more bedroom (which I use) and lar...,"If I am at home, I am likely working in my hom..."
2,"I encourage you to use my kitchen, cooking and...","I am living in the apartment during your stay,...","ABOUT ME: I'm a laid-back, friendly, unmarried..."


### Check Nulls

In [115]:
# A column is counted when every one of its values is present (no nulls at all).
print("Number of Non-null cols in Boston listings: ",  b_list.notnull().all().sum())
print("Number of Non-null cols in Seatle listings: ",  s_list.notnull().all().sum())

Number of Non-null cols in Boston listings:  51
Number of Non-null cols in Seatle listings:  47


In [123]:
# Boston listings: fraction of missing values, restricted to columns that have at least one null
print(b_list.isnull().sum().loc[lambda n_null: n_null > 0] / len(b_list))

summary                         0.039888
space                           0.294840
neighborhood_overview           0.394700
notes                           0.550907
transit                         0.359833
access                          0.415342
interaction                     0.433473
house_rules                     0.332497
thumbnail_url                   0.167085
medium_url                      0.167085
xl_picture_url                  0.167085
host_location                   0.003068
host_about                      0.365132
host_response_time              0.131381
host_response_rate              0.131381
host_acceptance_rate            0.131381
host_neighbourhood              0.094561
neighbourhood                   0.151464
neighbourhood_group_cleansed    1.000000
city                            0.000558
zipcode                         0.010600
market                          0.003905
property_type                   0.000837
bathrooms                       0.003905
bedrooms        

In [120]:
# Seattle listings: percentage of missing values, restricted to columns that have at least one null
s_list.isnull().sum().loc[lambda n_null: n_null > 0] / len(s_list) * 100

summary                          4.635935
space                           14.903091
neighborhood_overview           27.029859
notes                           42.063908
transit                         24.463070
thumbnail_url                    8.381351
medium_url                       8.381351
xl_picture_url                   8.381351
host_name                        0.052383
host_since                       0.052383
host_location                    0.209534
host_about                      22.498690
host_response_time              13.698271
host_response_rate              13.698271
host_acceptance_rate            20.246202
host_is_superhost                0.052383
host_thumbnail_url               0.052383
host_picture_url                 0.052383
host_neighbourhood               7.857517
host_listings_count              0.052383
host_total_listings_count        0.052383
host_has_profile_pic             0.052383
host_identity_verified           0.052383
neighbourhood                   10

# Examine columns closely to prep them correctly!

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
### Comments:  
[Boston & Seattle Listings]
- Boston listings size: `3585` rows, `95` cols
- Seattle listings size: `3818` rows, `92` cols
- Boston listings has 3 cols that aren't in Seattle: `house_rules`, `access`, `interaction` (types: `Object/String`)
- Number of non-null cols in Boston listings: `51`, around half
- Number of non-null cols in Seattle listings: `47`, around half
