### A Look at the Data

In order to get a better understanding of the data we will be looking at throughout this lesson, let's take a look at some of the characteristics of the dataset.

First, let's read in the data and necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mypy import print_side_by_side
from mypy import display_side_by_side
%matplotlib inline

In [2]:
b_cal = pd.read_csv('boston_calendar.csv')
b_list = pd.read_csv('boston_listings.csv')
b_rev = pd.read_csv('boston_reviews.csv')

s_cal = pd.read_csv('seatle_calendar.csv')
s_list = pd.read_csv('seatle_listings.csv')
s_rev = pd.read_csv('seatle_reviews.csv')

 _______________________________________________________________________________________________________________________

## Task 1: Busines Understanding

### Step 1: Basic Exploration with minimal cleaning
*To familiarize with the Data and to gather insights to formulate questions*

> **Boston & Seatle Calendar**

In [3]:
display_side_by_side(b_cal.head(), s_cal.head(), titles = ['b_cal', 's_cal'])

Unnamed: 0,listing_id,date,available,price
0,12147973,2017-09-05,f,
1,12147973,2017-09-04,f,
2,12147973,2017-09-03,f,
3,12147973,2017-09-02,f,
4,12147973,2017-09-01,f,

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
2,241032,2016-01-06,f,
3,241032,2016-01-07,f,
4,241032,2016-01-08,f,


#### Check the sizes of cols and rows & check Nulls

In [4]:
print_side_by_side('Boston Cal:', 'Seatle Cal:', b=0)
print_side_by_side('Shape:',b_cal.shape,"Shape:",  s_cal.shape)
print_side_by_side("Cols with nulls: ", b_cal.isnull().sum()[b_cal.isnull().sum()>0].index[0],"Cols with nulls: ", s_cal.isnull().sum()[s_cal.isnull().sum()>0].index[0])
print_side_by_side("Null prop of price column: ", round(b_cal.price.isnull().sum()/b_cal.shape[0], 2),"Null prop of price column: ", round(s_cal.price.isnull().sum()/s_cal.shape[0], 2))
print_side_by_side("Proportion of False(unit unavailable):", round(b_cal.available[b_cal.available =='f' ].count()/b_cal.shape[0],2),"Proportion of False(unit unavailable):", round(s_cal.available[s_cal.available =='f' ].count()/s_cal.shape[0],2))
print_side_by_side("Nulls when units are available: ", b_cal[b_cal['available']== 't']['price'].isnull().sum(),"Nulls when units are available: ", s_cal[s_cal['available']== 't']['price'].isnull().sum() )

Boston Cal:                                              Seatle Cal:
Shape:  (1308890 4)                                      Shape:  (1393570 4)
Cols with nulls:   price                                 Cols with nulls:   price
Null prop of price column:   0.51                        Null prop of price column:   0.33
Proportion of False(unit unavailable):  0.51             Proportion of False(unit unavailable):  0.33
Nulls when units are available:   0                      Nulls when units are available:   0


#### Transfer Date column to datetime to ease manipulation, analysis and modeling. I create a dataframe with seperate date items from the Date column, to check the period in which the data was collected. Transform Price to float.

In [5]:
def create_dateparts(df, date_col): 
    df['date'] = pd.to_datetime(df.date)
    b_date_df = pd.DataFrame()
    b_date_df['year'] = df['date'].dt.year
    b_date_df['month'] = df['date'].dt.month
    b_date_df['day'] =df['date'].dt.day
    b_date_df['dow'] =df['date'].dt.strftime("%A")
    df = df.join(b_date_df)
    return df
def get_period_df(df):
    period =pd.DataFrame(df.groupby(['year','month'], sort = True)['day'].value_counts())
    period = period.rename(columns={'day':'count'}, level=0)
    period = period.reset_index().sort_values(by=['year', 'month', 'day']).reset_index(drop = True)
    return period
def to_float(df, col):
    df[col]= df[col].astype(str).str.replace(",","", regex = False)
    df[col]= df[col].astype(str).str.replace('$','', regex = False)
    df[col] = df[col].astype(float)
    return df

In [6]:
b_cal_1 = to_float(b_cal, 'price')
s_cal_1 = to_float(s_cal, 'price')
b_cal_1 = create_dateparts(b_cal, 'date')
s_cal_1 = create_dateparts(s_cal, 'date')
display_side_by_side(s_cal_1.head(3),b_cal_1.head(3), titles = ['b_cal_1', 's_cal_1'])

Unnamed: 0,listing_id,date,available,price,year,month,day,dow
0,241032,2016-01-04,t,85.0,2016,1,4,Monday
1,241032,2016-01-05,t,85.0,2016,1,5,Tuesday
2,241032,2016-01-06,f,,2016,1,6,Wednesday

Unnamed: 0,listing_id,date,available,price,year,month,day,dow
0,12147973,2017-09-05,f,,2017,9,5,Tuesday
1,12147973,2017-09-04,f,,2017,9,4,Monday
2,12147973,2017-09-03,f,,2017,9,3,Sunday


In [7]:
b_period =get_period_df(b_cal_1)
s_period =get_period_df(s_cal_1)
display_side_by_side(b_period.head(10), s_period.head(10), titles= ['b_cal_1', 's_cal_1'])
print("Number of unique Listing IDs in Boston Calendar: ", len(b_cal_1.listing_id.unique()))
print("Number of unique Listing IDs in Seatle Calendar: ", len(s_cal_1.listing_id.unique()))

Unnamed: 0,year,month,day,count
0,2016,9,6,3586
1,2016,9,7,3586
2,2016,9,8,3586
3,2016,9,9,3586
4,2016,9,10,3586
5,2016,9,11,3586
6,2016,9,12,3586
7,2016,9,13,3586
8,2016,9,14,3586
9,2016,9,15,3586

Unnamed: 0,year,month,day,count
0,2016,1,4,3818
1,2016,1,5,3818
2,2016,1,6,3818
3,2016,1,7,3818
4,2016,1,8,3818
5,2016,1,9,3818
6,2016,1,10,3818
7,2016,1,11,3818
8,2016,1,12,3818
9,2016,1,13,3818


Number of unique Listing IDs in Boston Calendar:  3585
Number of unique Listing IDs in Seatle Calendar:  3818


#### Counts are the equivalent to the numbers of unique ids because all the ids are spanning the same time period by day.  Let's check any anomalies

In [8]:
def check_anomalies(df, col):
    list_ids_not_year_long = []
    for i in sorted(list(df[col].unique())):
        if df[df[col]== i].shape[0] != 365:
            list_ids_not_year_long.append(i)
    print("Entry Ids that don't span 1 year: " , list_ids_not_year_long)

In [9]:
#Boston
check_anomalies(b_cal_1, 'listing_id')

Entry Ids that don't span 1 year:  [12898806]


In [10]:
#Seatle
check_anomalies(s_cal_1, 'listing_id')

Entry Ids that don't span 1 year:  []


In [11]:
## check this entry in Boston Calendar
print("Span of the entries for this listing, should be 365: ", b_cal_1[b_cal_1['listing_id']== 12898806].shape[0])
## 2 years, seems like a duplicate as 730 = 365 * 2
one_or_two = pd.DataFrame(b_cal_1[b_cal_1['listing_id']==12898806].groupby(['year', 'month', 'day'])['day'].count()).day.unique()[0]
print("Should be 1: ", one_or_two)
## It indeed is :)
b_cal_1 = b_cal_1.drop_duplicates()
print("Size of anomaly listing, Should be = 365: ", b_cal_1.drop_duplicates()[b_cal_1.drop_duplicates().listing_id==12898806]['listing_id'].size)
print("After removing duplicates, Span of the entries for this listing, should be 365: ", b_cal_1[b_cal_1['listing_id']== 12898806].shape[0])
print("After removing duplicates, shape is: ", b_cal_1.shape)

Span of the entries for this listing, should be 365:  730
Should be 1:  2
Size of anomaly listing, Should be = 365:  365
After removing duplicates, Span of the entries for this listing, should be 365:  365
After removing duplicates, shape is:  (1308525, 8)


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
### Comments:  
[Boston & Seatle Calendar]
- The datasets have information about listing dates, availability and price tracked over a year for ever listing id
- There are no data entry errors, all nulls are due to the structuring of the Data (the listings that aren't available has no price)
<br><br>
- I added 4 cols that contain dateparts that will aid further analysis and modeling
- The Boston calendar Dataset ranges through `365`days from `6th of September'16` to `5th of September'17`, No nulls with `1308525` rows and  `8` cols
- The Seatle calendar Dataset ranges through `365`days from `4th of January'16` to `2nd of January'17`, No nulls with `1393570` rows and  `8` cols
<br><br>
- Number of unique Listing IDs in Boston Calendar:  `3585`
- Number of unique Listing IDs in Seatle Calendar:  `3818`
- May need to order the table later 

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 _______________________________________________________________________________________________________________________

## Step 1: Continue - 

> **Boston & Seatle Listings**

In [12]:
b_list.head(1)
#s_list.head(10)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,


 ### Check the sizes of cols & rows & check Nulls

In [13]:
print_side_by_side("Boston listings size :", b_list.shape, "Seatle listings size :", s_list.shape)

print_side_by_side("Number of Non-null cols in Boston listings: ",  np.sum(b_list.isnull().sum()==0) ,"Number of Non-null cols in Seatle listings: ",  np.sum(s_list.isnull().sum()==0))
set_difference = set(b_list.columns) - set(s_list.columns)
print("Columns in Boston but not in Seatle:  ", set_difference)

# For Boston Listings The null proportion per column
#b_list.isnull().sum()[b_list.isnull().sum()>0]/b_list.shape[0]*100
#b_list_nan = b_list.isnull().sum()/b_list.shape[0]
#b_list_nan[b_list_nan>.75]
#b_list_nan[b_list_nan>.5]
#--------------------------------------------------------------------
# For Seatle Listings The null proportion per column
#s_list.isnull().sum()[s_list.isnull().sum()>0]/s_list.shape[0]*100
#s_list_nan = s_list.isnull().sum()/s_list.shape[0]
#s_list_nan[s_list_nan>.75]
#s_list_nan[s_list_nan>.5]

Boston listings size :  (3585 95)                        Seatle listings size :  (3818 92)
Number of Non-null cols in Boston listings:   51         Number of Non-null cols in Seatle listings:   47
Columns in Boston but not in Seatle:   {'house_rules', 'access', 'interaction'}


### Basic cleaning

In [295]:
############################# To Do :
#(Depending on the questions I am posing at the end of the exploration):
#---------------------------------------------------------------------------------------------------------------------------
#----------------check correlations before modeling to make sure these modifications are useful----------------------------------------------------

#to_datetime
#-----------
# host_since
# first_review
# last_review
#-------------------------------
#to_parts
#---------
#maximum_nights   ++   *  "divide into ranges, week or less, more then a week"
#-------------------------------
#to_count
#amenities     +++             "provided a count of the amenities"        !!!
#host_verifications    +++      "provided a count of the verifications"   !!!
#-------------------------------
#to_len_text 
#--------------
#name                     "provided length of text ""
#host_about               " "provided length of text ""
#summary                  "provided length of text ""
#description               "provided length of text ""                
#neighborhood_overview     "provided length of text ""
#transit -                 "provided length of text ""
#--------------------------------------------------------------------------------------------------------------------------
#to_float
#---------
#cleaning_fee  ++++                     "Öbject, Transform to numeric""      30% null boston,27% seatle
#host_response_rate   +++++             "Öbject, Transform to numeric", 
#host_acceptance_rate  +++++            "Öbject, Transform to numeric", 
#host_response_rate   +++++             "Öbject, Transform to numeric",  
#host_acceptance_rate  +++++            "Öbject, Transform to numeric", 
#extra_people   ++++                    "Öbject, Transform to numeric", 
#price      +++++                       "Öbject, Transform to numeric", 
#--------------------------------------------------------------------------------------------------------------------------
#to_drop
#--------
#reason> little use 
#------------------
# listing_url, scrape_id, last_scraped, experiences_offered, thumbnail_url,xl_picture_url, medium_url,
# host_id, host_url, host_thumbnail_url, host_picture_url, host_total_listings_count, neighbourhood, 
# neighbourhood_group_cleansed, state, country_code, country, latitude, longitude ,
#has_availability, calendar_last_scraped, host_name, picture_url, space

# reason> only in boston
#------------------
# access , interaction, house_rules

#reason>  Nulls, 0 variability or extreme variability 1000+ unique:
#-----------------------------------------------------------------
#square_feet +++                        "Float,                       90% null boston , 97% seatle ___ drop"
#weekly_price +++++   *               "Öbject, Transform to numeric""  75% Null boston, 47% seatle ___ drop
#monthly_price +++++  *               "Öbject, Transform to numeric"  75% Null boston ,  60% seatle___drop
#security_deposit +++                "Öbject, Transform to numeric""    65% Null boston  51 % seatle___drop
#notes *                             "object, short text"               55% null boston, 42% seatle___drop
#jurisdiction_names
#license
#requires_license
#street                              "object, 1200, 1400 unique values"
#----------------------------------------------------------------------------------------------------------------------------
######################## As is now (check later)
#id
#market  +++                        "object, 5 B, (1) S___ ???
#calendar_updated ++                "object, 38 B, 34 S unique, ???
#property_type    ++++              "object, 14 B, 17 S, unique property type apartment, house, etc. "
#host_location     +++,             "object, 171 B, 121 s unique, "
#host_neighbourhood   +++++         "object, 54 B, 103 S unqiue ?????,
#neighbourhood_cleansed ++++        "object, 25 B, 87 S unique,  "
#city   ++++                        "object, 39 B, 7 S unique values"
#zipcode ++++                       "object, 44 B, 29 S unique values,
#smart_location  ++++               "object, 39 B, 7 S unique values '
#cancellation_policy                "object, 4 B, 3 s unique  '
#host_response_time ++++,           "5 unique, object, ordinal"
#room_type         ++++             "3 unique 'Entire home/apt', 'Private room', 'Shared room'"
#bed_type      ++++                 "5 unique, Real Bed', 'Futon', 'Pull-out Sofa', 'Airbed', 'Couch'
#host_has_profile_pic ++                 "object, t or f"
#host_identity_verified                  "object, t or f"
#host_is_superhost     +++++             "object, t or f" 
#is_location_exact ++++                  "object, t or f"
#instant_bookable                        "object, t or f"
#require_guest_profile_picture           "object, t or f"
#require_guest_phone_verification        "object, t or f"
#accommodates      ++++               "int"
#availability_30                      "int"
#availability_60                      "int"
#availability_90                      "int"
#availability_365                     "int"
#number_of_reviews                    "int"
#guests_included +++++                "int"
#minimum_nights +++++                 "int"
#calculated_host_listings_count       "int"
#host_listings_count   +++++          "Float" 
#bathrooms          ++++              "Float"
#bedrooms         ++++                "Float"
#beds                ++++             "Float"
#review_scores_rating                 "float"  ???
#review_scores_accuracy               "float"
#review_scores_cleanliness            "float"
#review_scores_checkin                "float"
#review_scores_communication          "float"
#review_scores_location               "float"
#review_scores_value                  "float"
#reviews_per_month                    "Float"
#--------------------------------------------------------------------------------------------------------------------------

In [305]:
drop_cols = ['listing_url', 'scrape_id', 'last_scraped', 'experiences_offered', 'thumbnail_url','xl_picture_url', 
'medium_url', 'host_id', 'host_url', 'host_thumbnail_url', 'host_picture_url', 'host_total_listings_count', 
'neighbourhood', 'neighbourhood_group_cleansed','state', 'country_code', 'country', 'latitude', 'longitude', 
'has_availability', 'calendar_last_scraped', 'host_name', 'access', 'interaction','house_rules','square_feet', 
'weekly_price', 'monthly_price', 'security_deposit', 'notes', 'jurisdiction_names', 'license', 'requires_license', 
'street', 'picture_url', 'space']
float_cols = ['cleaning_fee', 'host_response_rate','host_acceptance_rate','host_response_rate','host_acceptance_rate','extra_people','price']
len_text_cols = ['name', 'host_about', 'summary', 'description','neighborhood_overview', 'transit']
count_cols =  ['host_verifications', 'amenities'] 
part_col = ['maximum_nights']
datetime_cols = ['host_since','first_review','last_review']

def to_drop(df, drop_cols):
    """
    INPUT
    df -pandas dataframe
    drop_cols -list of columns to drop
    
    OUTPUT
    df - a dataframe with columns of choice dropped 
    """
    for col in drop_cols:
        if col in list(df.columns):
            df = df.drop(col, axis = 1)
        else:
            continue
    return df
def to_float(df, float_cols):
    """
    INPUT
    df -pandas dataframe
    float_cols -list of columns to transform to float
    
    OUTPUT
    df - a dataframe with columns of choice transformed to float 
    """
    for col in float_cols:
            df[col] = df[col].str.replace('$', "", regex = False)
            df[col] = df[col].str.replace('%', "", regex = False)
            df[col] = df[col].str.replace(',', "", regex = False)
    for col in float_cols:
        df[col] = df[col].astype(float)
    return df
def to_len_text(df, len_text_cols):
    """
    INPUT
    df -pandas dataframe
    len_text_cols- list of columns to return the length of text of their values
    
    OUTPUT
    df - a dataframe with columns of choice transformed to len(values) instead of long text
    """
    df_new = df.copy()
    len_text = []
    new_len_text_cols = [] 

    for col in len_text_cols:
        new_len_text_cols.append("len_"+col)

        for i in df_new[col]:
            #print(col,i)
            try:
                len_text.append(len(i))
            except:
                len_text.append(i)
        #print('\n'*10)   
        df_new = df_new.drop(col, axis = 1)
        len_text_col = pd.Series(len_text)  
        len_text_col = len_text_col.reset_index(drop = True)
        #print(len_text_col)
        df_new['len_'+col]= len_text_col
        len_text = []
        df_new[new_len_text_cols] = df_new[new_len_text_cols].fillna(0)
    return df_new, new_len_text_cols

def to_count(df, count_cols): 
    """
    INPUT
    df -pandas dataframe
    count_cols -list of columns to count the string items within each value
    
    OUTPUT
    df - a dataframe with columns of choice transformed to a count of values  
    """
    def to_apply(val):
        if "{" in val:
            val = val.replace('}', "").replace('{', "").replace("'","" ).replace('"',"" ).replace("''", "").strip().split(',')
        elif "[" in val:
            val = val.replace('[',"" ).replace(']',"" ).replace("'","" ).strip().split(",")
        return len(val)   
    for col in count_cols:
        df['count_'+col]= df[col].apply(to_apply)
    return df
def to_items(df, count_cols): 
    """
    INPUT
    df -pandas dataframe
    count_cols -list of columns to divide the values to clean list of items
    
    OUTPUT
    df - a dataframe with columns of choice cleaned and returns the values as lists
    """
    def to_apply(val):
        if "{" in val:
            val = val.replace('}', "").replace('{', "").replace("'","" ).replace('"',"" ).replace("''", "").lower().split(',')
        elif "[" in val:
            val = val.replace('[',"" ).replace(']',"" ).replace("'","" ).lower().split(",")
        return val  
     
    def to_apply1(val):
        new_val = []
        if val == 'None':
            new_val.append(val)
        for i in list(val):
            if (i != "") and ('translation' not in i.lower()):
                new_val.append(i.strip())
        return new_val
    
    def to_apply2(val):
        if len((val)) == 0:
            return ['none']
        else:
            return list(val)
    
    for col in count_cols:
        df[col]= df[col].apply(to_apply)
        df[col]= df[col].apply(to_apply1)
        df[col]= df[col].apply(to_apply2)
    return df
def to_parts(df, part_col):
    """
    INPUT
    df -pandas dataframe
    part_col -list of columns to divide into "week or less" and "more than a week" depending on values
    
    OUTPUT
    df - a dataframe with columns of choice transformed to ranges of "week or less" and "more than a week"
    """
    def to_apply(val):
        if val <= 7:
            val = 'Week or less'
        else:
            val = 'More than a week'
        return val
    for part in part_col:
        df[part]= df[part].apply(to_apply)
    return df
def to_datetime(df, datetime_cols):
    """
    INPUT
    df -pandas dataframe
    datetime_cols -list of columns to divide transform to datetime
    
    OUTPUT
    df - a dataframe with columns of choice transformed to datetime
    """
    for col in datetime_cols:
        df[col] = pd.to_datetime(df[col])
    return df
def applier(df1,df2,drop = True, float_=True, len_text= True, count= True, items = True,parts = True, datetime= True):
    """
    INPUT
    df1,df2 - 2 pandas dataframes
    drop,float_,len_text, count, parts, date_time - Boolean values that corresponds to previosuly defined functions
    OUTPUT
    df - a dataframe tthat has undergone previously defined functions according to the boolean prameters passed
    """
    while drop:
        df1 = to_drop(df1, drop_cols)
        df2 =to_drop(df2, drop_cols)
        break
    while float_:
        df1 =to_float(df1, float_cols)
        df2 =to_float(df2, float_cols)
        break
    while len_text:
        df1, nltc = to_len_text(df1, len_text_cols)
        df2, nltc = to_len_text(df2, len_text_cols)
        break
    while count:
        df1 = to_count(df1, count_cols)
        df2 = to_count(df2, count_cols)
        break
    while items:
        df1 = to_items(df1, count_cols)
        df2 = to_items(df2, count_cols)
        break
    while parts:
        df1 = to_parts(df1, part_col)
        df2 = to_parts(df2, part_col)
        break
    while datetime:
        df1 = to_datetime(df1,datetime_cols)
        df2 = to_datetime(df2,datetime_cols)    
        break
    return df1, df2

##### *As for amenities*

### Step 1: Basic Exploration with minimal cleaning
*To familiarize with the Data and to gather insights to formulate questions*

In [306]:
'nonpef'.strip()

'nonpef'

In [307]:
b_list_1, s_list_1 = applier(b_list, s_list) 

In [308]:
for i in ['lol']:
    print(i)

lol


In [309]:
for i in s_list_1.host_verifications:
    if i == 'none':
        print(i)

In [314]:
def items_counter(df, col):
    all_strings= {}
    
    def to_apply(val): 
        for i in list(val):
                if i in list(all_strings.keys()):
                    print(all_strings)
                    all_strings[i]+=1
                else:
                    all_strings[i]=1     
    df[col].apply(to_apply)
    return  all_strings

In [315]:
# b_amenities_dict = items_counter(b_list_1,'amenities')
# b_amenities_count = pd.Series(b_amenities_dict).reset_index().rename(columns = {'index':'amenity', 0:'count'}).sort_values(by='count', ascending =False).reset_index(drop =True)
# s_amenities_dict = items_counter(s_list_1, 'amenities')
# s_amenities_count = pd.Series(s_amenities_dict).reset_index().rename(columns = {'index':'amenity', 0:'count'}).sort_values(by='count', ascending =False).reset_index(drop =True)
# print("Amenities in Boston not in Seatle: ",np.setdiff1d(np.array(b_amenities_count['amenity'].astype(str)), np.array(s_amenities_count['amenity']),assume_unique=True))
# display_side_by_side(b_amenities_count, s_amenities_count, titles = ['Boston', 'Seatle'])

In [318]:
jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

SyntaxError: invalid syntax (<ipython-input-318-d3bfc748244a>, line 1)

In [316]:
b_verifs_dict = items_counter(b_list_1,'host_verifications')
b_verifs_count = pd.Series(b_verifs_dict).reset_index().rename(columns = {'index':'host_verifications', 0:'count'}).sort_values(by='count', ascending =False).reset_index(drop =True)
s_verifs_dict = items_counter(s_list_1, 'host_verifications')
s_verifs_count = pd.Series(s_verifs_dict).reset_index().rename(columns = {'index':'host_verifications', 0:'count'}).sort_values(by='count', ascending =False).reset_index(drop =True)
print("Host Verifications in Boston not in Seatle: ",np.setdiff1d(np.array(b_verifs_count['host_verifications'].astype(str)), np.array(s_verifs_count['host_verifications']),assume_unique=True))
display_side_by_side(b_verifs_count, s_verifs_count, titles = ['Boston', 'Seatle'])

{'email': 1, 'phone': 1, 'facebook': 1, 'reviews': 1}
{'email': 2, 'phone': 1, 'facebook': 1, 'reviews': 1}
{'email': 2, 'phone': 2, 'facebook': 1, 'reviews': 1}
{'email': 2, 'phone': 2, 'facebook': 2, 'reviews': 1, 'linkedin': 1, 'amex': 1}
{'email': 2, 'phone': 2, 'facebook': 2, 'reviews': 2, 'linkedin': 1, 'amex': 1, 'jumio': 1}
{'email': 3, 'phone': 2, 'facebook': 2, 'reviews': 2, 'linkedin': 1, 'amex': 1, 'jumio': 1}
{'email': 3, 'phone': 3, 'facebook': 2, 'reviews': 2, 'linkedin': 1, 'amex': 1, 'jumio': 1}
{'email': 3, 'phone': 3, 'facebook': 2, 'reviews': 3, 'linkedin': 1, 'amex': 1, 'jumio': 1}
{'email': 3, 'phone': 3, 'facebook': 2, 'reviews': 3, 'linkedin': 1, 'amex': 1, 'jumio': 2}
{'email': 4, 'phone': 3, 'facebook': 2, 'reviews': 3, 'linkedin': 1, 'amex': 1, 'jumio': 2}
{'email': 4, 'phone': 4, 'facebook': 2, 'reviews': 3, 'linkedin': 1, 'amex': 1, 'jumio': 2}
{'email': 4, 'phone': 4, 'facebook': 2, 'reviews': 4, 'linkedin': 1, 'amex': 1, 'jumio': 2}
{'email': 5, 'phone': 

{'email': 309, 'phone': 312, 'facebook': 85, 'reviews': 298, 'linkedin': 10, 'amex': 5, 'jumio': 77, 'kba': 176, 'manual_online': 1, 'manual_offline': 2, 'google': 4, 'sent_id': 1}
{'email': 309, 'phone': 312, 'facebook': 85, 'reviews': 298, 'linkedin': 10, 'amex': 5, 'jumio': 78, 'kba': 176, 'manual_online': 1, 'manual_offline': 2, 'google': 4, 'sent_id': 1}
{'email': 310, 'phone': 312, 'facebook': 85, 'reviews': 298, 'linkedin': 10, 'amex': 5, 'jumio': 78, 'kba': 176, 'manual_online': 1, 'manual_offline': 2, 'google': 4, 'sent_id': 1}
{'email': 310, 'phone': 313, 'facebook': 85, 'reviews': 298, 'linkedin': 10, 'amex': 5, 'jumio': 78, 'kba': 176, 'manual_online': 1, 'manual_offline': 2, 'google': 4, 'sent_id': 1}
{'email': 310, 'phone': 313, 'facebook': 86, 'reviews': 298, 'linkedin': 10, 'amex': 5, 'jumio': 78, 'kba': 176, 'manual_online': 1, 'manual_offline': 2, 'google': 4, 'sent_id': 1}
{'email': 310, 'phone': 313, 'facebook': 86, 'reviews': 299, 'linkedin': 10, 'amex': 5, 'jumio'

{'email': 645, 'phone': 655, 'facebook': 157, 'reviews': 611, 'linkedin': 18, 'amex': 7, 'jumio': 221, 'kba': 302, 'manual_online': 12, 'manual_offline': 19, 'google': 21, 'sent_id': 2, 'weibo': 1}
{'email': 645, 'phone': 655, 'facebook': 157, 'reviews': 611, 'linkedin': 18, 'amex': 7, 'jumio': 222, 'kba': 302, 'manual_online': 12, 'manual_offline': 19, 'google': 21, 'sent_id': 2, 'weibo': 1}
{'email': 646, 'phone': 655, 'facebook': 157, 'reviews': 611, 'linkedin': 18, 'amex': 7, 'jumio': 222, 'kba': 302, 'manual_online': 12, 'manual_offline': 19, 'google': 21, 'sent_id': 2, 'weibo': 1}
{'email': 646, 'phone': 656, 'facebook': 157, 'reviews': 611, 'linkedin': 18, 'amex': 7, 'jumio': 222, 'kba': 302, 'manual_online': 12, 'manual_offline': 19, 'google': 21, 'sent_id': 2, 'weibo': 1}
{'email': 646, 'phone': 656, 'facebook': 158, 'reviews': 611, 'linkedin': 18, 'amex': 7, 'jumio': 222, 'kba': 302, 'manual_online': 12, 'manual_offline': 19, 'google': 21, 'sent_id': 2, 'weibo': 1}
{'email': 

{'email': 901, 'phone': 912, 'facebook': 219, 'reviews': 851, 'linkedin': 23, 'amex': 18, 'jumio': 310, 'kba': 393, 'manual_online': 12, 'manual_offline': 25, 'google': 28, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 901, 'phone': 913, 'facebook': 219, 'reviews': 851, 'linkedin': 23, 'amex': 18, 'jumio': 310, 'kba': 393, 'manual_online': 12, 'manual_offline': 25, 'google': 28, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 901, 'phone': 913, 'facebook': 219, 'reviews': 852, 'linkedin': 23, 'amex': 18, 'jumio': 310, 'kba': 393, 'manual_online': 12, 'manual_offline': 25, 'google': 28, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 901, 'phone': 913, 'facebook': 219, 'reviews': 852, 'linkedin': 23, 'amex': 18, 'jumio': 310, 'kba': 394, 'manual_online': 12, 'manual_offline': 25, 'google': 28, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 902, 'phone': 913, 'facebook': 219, 'reviews': 852, 'linkedin': 23, 'amex': 18, 'jumio': 310, 'kba': 394, 'manual_online': 12, 'manual_offline': 25, '

{'email': 1265, 'phone': 1287, 'facebook': 269, 'reviews': 1207, 'linkedin': 28, 'amex': 28, 'jumio': 464, 'kba': 526, 'manual_online': 12, 'manual_offline': 44, 'google': 36, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1265, 'phone': 1288, 'facebook': 269, 'reviews': 1207, 'linkedin': 28, 'amex': 28, 'jumio': 464, 'kba': 526, 'manual_online': 12, 'manual_offline': 44, 'google': 36, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1265, 'phone': 1288, 'facebook': 270, 'reviews': 1207, 'linkedin': 28, 'amex': 28, 'jumio': 464, 'kba': 526, 'manual_online': 12, 'manual_offline': 44, 'google': 36, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1265, 'phone': 1288, 'facebook': 270, 'reviews': 1208, 'linkedin': 28, 'amex': 28, 'jumio': 464, 'kba': 526, 'manual_online': 12, 'manual_offline': 44, 'google': 36, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1265, 'phone': 1288, 'facebook': 270, 'reviews': 1208, 'linkedin': 28, 'amex': 28, 'jumio': 465, 'kba': 526, 'manual_online': 12, 'manual_

{'email': 1508, 'phone': 1536, 'facebook': 300, 'reviews': 1442, 'linkedin': 33, 'amex': 37, 'jumio': 577, 'kba': 608, 'manual_online': 23, 'manual_offline': 59, 'google': 42, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1508, 'phone': 1537, 'facebook': 300, 'reviews': 1442, 'linkedin': 33, 'amex': 37, 'jumio': 577, 'kba': 608, 'manual_online': 23, 'manual_offline': 59, 'google': 42, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1508, 'phone': 1537, 'facebook': 300, 'reviews': 1443, 'linkedin': 33, 'amex': 37, 'jumio': 577, 'kba': 608, 'manual_online': 23, 'manual_offline': 59, 'google': 42, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1508, 'phone': 1537, 'facebook': 300, 'reviews': 1443, 'linkedin': 33, 'amex': 37, 'jumio': 578, 'kba': 608, 'manual_online': 23, 'manual_offline': 59, 'google': 42, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1509, 'phone': 1537, 'facebook': 300, 'reviews': 1443, 'linkedin': 33, 'amex': 37, 'jumio': 578, 'kba': 608, 'manual_online': 23, 'manual_

{'email': 1817, 'phone': 1851, 'facebook': 356, 'reviews': 1738, 'linkedin': 39, 'amex': 41, 'jumio': 702, 'kba': 712, 'manual_online': 32, 'manual_offline': 74, 'google': 50, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1817, 'phone': 1851, 'facebook': 356, 'reviews': 1738, 'linkedin': 39, 'amex': 41, 'jumio': 703, 'kba': 712, 'manual_online': 32, 'manual_offline': 74, 'google': 50, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1818, 'phone': 1851, 'facebook': 356, 'reviews': 1738, 'linkedin': 39, 'amex': 41, 'jumio': 703, 'kba': 712, 'manual_online': 32, 'manual_offline': 74, 'google': 50, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1818, 'phone': 1852, 'facebook': 356, 'reviews': 1738, 'linkedin': 39, 'amex': 41, 'jumio': 703, 'kba': 712, 'manual_online': 32, 'manual_offline': 74, 'google': 50, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 1818, 'phone': 1852, 'facebook': 356, 'reviews': 1739, 'linkedin': 39, 'amex': 41, 'jumio': 703, 'kba': 712, 'manual_online': 32, 'manual_

{'email': 2021, 'phone': 2062, 'facebook': 391, 'reviews': 1935, 'linkedin': 42, 'amex': 51, 'jumio': 795, 'kba': 796, 'manual_online': 32, 'manual_offline': 77, 'google': 61, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 2022, 'phone': 2062, 'facebook': 391, 'reviews': 1935, 'linkedin': 42, 'amex': 51, 'jumio': 795, 'kba': 796, 'manual_online': 32, 'manual_offline': 77, 'google': 61, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 2022, 'phone': 2063, 'facebook': 391, 'reviews': 1935, 'linkedin': 42, 'amex': 51, 'jumio': 795, 'kba': 796, 'manual_online': 32, 'manual_offline': 77, 'google': 61, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 2022, 'phone': 2063, 'facebook': 391, 'reviews': 1936, 'linkedin': 42, 'amex': 51, 'jumio': 795, 'kba': 796, 'manual_online': 32, 'manual_offline': 77, 'google': 61, 'sent_id': 2, 'weibo': 2, 'none': 1}
{'email': 2023, 'phone': 2063, 'facebook': 391, 'reviews': 1936, 'linkedin': 42, 'amex': 51, 'jumio': 795, 'kba': 796, 'manual_online': 32, 'manual_

{'email': 2288, 'phone': 2336, 'facebook': 417, 'reviews': 2182, 'linkedin': 51, 'amex': 60, 'jumio': 932, 'kba': 864, 'manual_online': 49, 'manual_offline': 98, 'google': 70, 'sent_id': 3, 'weibo': 5, 'none': 1}
{'email': 2288, 'phone': 2337, 'facebook': 417, 'reviews': 2182, 'linkedin': 51, 'amex': 60, 'jumio': 932, 'kba': 864, 'manual_online': 49, 'manual_offline': 98, 'google': 70, 'sent_id': 3, 'weibo': 5, 'none': 1}
{'email': 2288, 'phone': 2337, 'facebook': 417, 'reviews': 2183, 'linkedin': 51, 'amex': 60, 'jumio': 932, 'kba': 864, 'manual_online': 49, 'manual_offline': 98, 'google': 70, 'sent_id': 3, 'weibo': 5, 'none': 1}
{'email': 2288, 'phone': 2337, 'facebook': 417, 'reviews': 2183, 'linkedin': 51, 'amex': 60, 'jumio': 933, 'kba': 864, 'manual_online': 49, 'manual_offline': 98, 'google': 70, 'sent_id': 3, 'weibo': 5, 'none': 1}
{'email': 2289, 'phone': 2337, 'facebook': 417, 'reviews': 2183, 'linkedin': 51, 'amex': 60, 'jumio': 933, 'kba': 864, 'manual_online': 49, 'manual_

{'email': 2608, 'phone': 2666, 'facebook': 469, 'reviews': 2477, 'linkedin': 65, 'amex': 71, 'jumio': 1082, 'kba': 966, 'manual_online': 61, 'manual_offline': 114, 'google': 79, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2609, 'phone': 2666, 'facebook': 469, 'reviews': 2477, 'linkedin': 65, 'amex': 71, 'jumio': 1082, 'kba': 966, 'manual_online': 61, 'manual_offline': 114, 'google': 79, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2609, 'phone': 2667, 'facebook': 469, 'reviews': 2477, 'linkedin': 65, 'amex': 71, 'jumio': 1082, 'kba': 966, 'manual_online': 61, 'manual_offline': 114, 'google': 79, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2609, 'phone': 2667, 'facebook': 469, 'reviews': 2478, 'linkedin': 65, 'amex': 71, 'jumio': 1082, 'kba': 966, 'manual_online': 61, 'manual_offline': 114, 'google': 79, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2609, 'phone': 2667, 'facebook': 469, 'reviews': 2478, 'linkedin': 65, 'amex': 71, 'jumio': 1083, 'kba': 966, 'manual_online': 61,

{'email': 2955, 'phone': 3023, 'facebook': 542, 'reviews': 2809, 'linkedin': 69, 'amex': 75, 'jumio': 1191, 'kba': 1110, 'manual_online': 63, 'manual_offline': 117, 'google': 90, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2956, 'phone': 3023, 'facebook': 542, 'reviews': 2809, 'linkedin': 69, 'amex': 75, 'jumio': 1191, 'kba': 1110, 'manual_online': 63, 'manual_offline': 117, 'google': 90, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2956, 'phone': 3024, 'facebook': 542, 'reviews': 2809, 'linkedin': 69, 'amex': 75, 'jumio': 1191, 'kba': 1110, 'manual_online': 63, 'manual_offline': 117, 'google': 90, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2956, 'phone': 3024, 'facebook': 542, 'reviews': 2810, 'linkedin': 69, 'amex': 75, 'jumio': 1191, 'kba': 1110, 'manual_online': 63, 'manual_offline': 117, 'google': 90, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 2957, 'phone': 3024, 'facebook': 542, 'reviews': 2810, 'linkedin': 69, 'amex': 75, 'jumio': 1191, 'kba': 1110, 'manual_online'

{'email': 3104, 'phone': 3174, 'facebook': 562, 'reviews': 2951, 'linkedin': 70, 'amex': 78, 'jumio': 1261, 'kba': 1158, 'manual_online': 73, 'manual_offline': 129, 'google': 97, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 3104, 'phone': 3175, 'facebook': 562, 'reviews': 2951, 'linkedin': 70, 'amex': 78, 'jumio': 1261, 'kba': 1158, 'manual_online': 73, 'manual_offline': 129, 'google': 97, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 3104, 'phone': 3175, 'facebook': 562, 'reviews': 2952, 'linkedin': 70, 'amex': 78, 'jumio': 1261, 'kba': 1158, 'manual_online': 73, 'manual_offline': 129, 'google': 97, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 3104, 'phone': 3175, 'facebook': 562, 'reviews': 2952, 'linkedin': 70, 'amex': 78, 'jumio': 1262, 'kba': 1158, 'manual_online': 73, 'manual_offline': 129, 'google': 97, 'sent_id': 3, 'weibo': 6, 'none': 1}
{'email': 3105, 'phone': 3175, 'facebook': 562, 'reviews': 2952, 'linkedin': 70, 'amex': 78, 'jumio': 1262, 'kba': 1158, 'manual_online'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'email': 1509, 'phone': 1541, 'reviews': 1464, 'kba': 726, 'facebook': 792, 'linkedin': 451, 'jumio': 593, 'google': 294, 'manual_offline': 11, 'amex': 19, 'manual_online': 5, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2}
{'email': 1509, 'phone': 1542, 'reviews': 1464, 'kba': 726, 'facebook': 792, 'linkedin': 451, 'jumio': 593, 'google': 294, 'manual_offline': 11, 'amex': 19, 'manual_online': 5, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2}
{'email': 1509, 'phone': 1542, 'reviews': 1465, 'kba': 726, 'facebook': 792, 'linkedin': 451, 'jumio': 593, 'google': 294, 'manual_offline': 11, 'amex': 19, 'manual_online': 5, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2}
{'email': 1509, 'phone': 1542, 'reviews': 1465, 'kba': 727, 'facebook': 792, 'linkedin': 451, 'jumio': 593, 'google': 294, 'manual_offline': 11, 'amex': 19, 'manual_online': 5, 'sent_id': 4, 'photographer': 1

{'email': 1745, 'phone': 1780, 'reviews': 1684, 'kba': 828, 'facebook': 933, 'linkedin': 508, 'jumio': 685, 'google': 341, 'manual_offline': 14, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2}
{'email': 1745, 'phone': 1781, 'reviews': 1684, 'kba': 828, 'facebook': 933, 'linkedin': 508, 'jumio': 685, 'google': 341, 'manual_offline': 14, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2}
{'email': 1745, 'phone': 1781, 'reviews': 1684, 'kba': 828, 'facebook': 934, 'linkedin': 508, 'jumio': 685, 'google': 341, 'manual_offline': 14, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2}
{'email': 1745, 'phone': 1781, 'reviews': 1685, 'kba': 828, 'facebook': 934, 'linkedin': 508, 'jumio': 685, 'google': 341, 'manual_offline': 14, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1

{'email': 1919, 'phone': 1963, 'reviews': 1855, 'kba': 916, 'facebook': 1033, 'linkedin': 544, 'jumio': 751, 'google': 379, 'manual_offline': 16, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 1919, 'phone': 1963, 'reviews': 1855, 'kba': 916, 'facebook': 1033, 'linkedin': 544, 'jumio': 752, 'google': 379, 'manual_offline': 16, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 1920, 'phone': 1963, 'reviews': 1855, 'kba': 916, 'facebook': 1033, 'linkedin': 544, 'jumio': 752, 'google': 379, 'manual_offline': 16, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 1920, 'phone': 1964, 'reviews': 1855, 'kba': 916, 'facebook': 1033, 'linkedin': 544, 'jumio': 752, 'google': 379, 'manual_offline': 16, 'amex': 19, 'manual_onl

{'email': 2244, 'phone': 2292, 'reviews': 2163, 'kba': 1088, 'facebook': 1205, 'linkedin': 591, 'jumio': 833, 'google': 438, 'manual_offline': 20, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2245, 'phone': 2292, 'reviews': 2163, 'kba': 1088, 'facebook': 1205, 'linkedin': 591, 'jumio': 833, 'google': 438, 'manual_offline': 20, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2245, 'phone': 2293, 'reviews': 2163, 'kba': 1088, 'facebook': 1205, 'linkedin': 591, 'jumio': 833, 'google': 438, 'manual_offline': 20, 'amex': 19, 'manual_online': 6, 'sent_id': 4, 'photographer': 1, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2245, 'phone': 2293, 'reviews': 2164, 'kba': 1088, 'facebook': 1205, 'linkedin': 591, 'jumio': 833, 'google': 438, 'manual_offline': 20, 'amex': 19, 'manual

{'email': 2457, 'phone': 2507, 'reviews': 2370, 'kba': 1207, 'facebook': 1294, 'linkedin': 629, 'jumio': 891, 'google': 491, 'manual_offline': 24, 'amex': 19, 'manual_online': 9, 'sent_id': 4, 'photographer': 2, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2457, 'phone': 2507, 'reviews': 2371, 'kba': 1207, 'facebook': 1294, 'linkedin': 629, 'jumio': 891, 'google': 491, 'manual_offline': 24, 'amex': 19, 'manual_online': 9, 'sent_id': 4, 'photographer': 2, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2457, 'phone': 2507, 'reviews': 2371, 'kba': 1207, 'facebook': 1294, 'linkedin': 629, 'jumio': 892, 'google': 491, 'manual_offline': 24, 'amex': 19, 'manual_online': 9, 'sent_id': 4, 'photographer': 2, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2457, 'phone': 2507, 'reviews': 2371, 'kba': 1208, 'facebook': 1294, 'linkedin': 629, 'jumio': 892, 'google': 491, 'manual_offline': 24, 'amex': 19, 'manual

{'email': 2669, 'phone': 2730, 'reviews': 2576, 'kba': 1302, 'facebook': 1412, 'linkedin': 682, 'jumio': 965, 'google': 550, 'manual_offline': 31, 'amex': 22, 'manual_online': 9, 'sent_id': 7, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2670, 'phone': 2730, 'reviews': 2576, 'kba': 1302, 'facebook': 1412, 'linkedin': 682, 'jumio': 965, 'google': 550, 'manual_offline': 31, 'amex': 22, 'manual_online': 9, 'sent_id': 7, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2670, 'phone': 2731, 'reviews': 2576, 'kba': 1302, 'facebook': 1412, 'linkedin': 682, 'jumio': 965, 'google': 550, 'manual_offline': 31, 'amex': 22, 'manual_online': 9, 'sent_id': 7, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 2670, 'phone': 2731, 'reviews': 2576, 'kba': 1302, 'facebook': 1413, 'linkedin': 682, 'jumio': 965, 'google': 550, 'manual_offline': 31, 'amex': 22, 'manual

{'email': 3001, 'phone': 3072, 'reviews': 2894, 'kba': 1461, 'facebook': 1590, 'linkedin': 761, 'jumio': 1079, 'google': 623, 'manual_offline': 47, 'amex': 24, 'manual_online': 11, 'sent_id': 9, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 3001, 'phone': 3072, 'reviews': 2894, 'kba': 1461, 'facebook': 1591, 'linkedin': 761, 'jumio': 1079, 'google': 623, 'manual_offline': 47, 'amex': 24, 'manual_online': 11, 'sent_id': 9, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 3001, 'phone': 3072, 'reviews': 2895, 'kba': 1461, 'facebook': 1591, 'linkedin': 761, 'jumio': 1079, 'google': 623, 'manual_offline': 47, 'amex': 24, 'manual_online': 11, 'sent_id': 9, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 3001, 'phone': 3072, 'reviews': 2895, 'kba': 1461, 'facebook': 1591, 'linkedin': 761, 'jumio': 1079, 'google': 623, 'manual_offline': 48, 'amex': 24, 

{'email': 3235, 'phone': 3311, 'reviews': 3111, 'kba': 1581, 'facebook': 1708, 'linkedin': 801, 'jumio': 1141, 'google': 668, 'manual_offline': 49, 'amex': 24, 'manual_online': 12, 'sent_id': 9, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 3236, 'phone': 3311, 'reviews': 3111, 'kba': 1581, 'facebook': 1708, 'linkedin': 801, 'jumio': 1141, 'google': 668, 'manual_offline': 49, 'amex': 24, 'manual_online': 12, 'sent_id': 9, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 3236, 'phone': 3312, 'reviews': 3111, 'kba': 1581, 'facebook': 1708, 'linkedin': 801, 'jumio': 1141, 'google': 668, 'manual_offline': 49, 'amex': 24, 'manual_online': 12, 'sent_id': 9, 'photographer': 4, 'none': 1, 'None': 2, 'N': 2, 'o': 2, 'n': 2, 'e': 2, 'weibo': 5}
{'email': 3236, 'phone': 3312, 'reviews': 3111, 'kba': 1581, 'facebook': 1709, 'linkedin': 801, 'jumio': 1141, 'google': 668, 'manual_offline': 49, 'amex': 24, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [313]:
s_verifs_dict

{'email': 3706,
 'phone': 3791,
 'reviews': 3549,
 'kba': 1816,
 'facebook': 1928,
 'linkedin': 891,
 'jumio': 1289,
 'google': 745,
 'manual_offline': 52,
 'amex': 25,
 'manual_online': 13,
 'sent_id': 9,
 'photographer': 4,
 'none': 1,
 'None': 2,
 'N': 2,
 'o': 2,
 'n': 2,
 'e': 2,
 'weibo': 5}

### Check the nulls again
(Will decide what to do with them after checking reviews datasets, and formulate the questions)

In [139]:
df1= (b_list_1.isnull().sum()[b_list_1.isnull().sum()>0]/b_list_1.shape[0]*100).reset_index().rename(columns ={'index':'col_name',0:'nulls_proportion'})
df2 = (s_list_1.isnull().sum()[s_list_1.isnull().sum()>0]/s_list_1.shape[0]*100).reset_index().rename(columns ={'index':'col_name',0:'nulls_proportion'})
display_side_by_side(df1,df2, titles =['b_list_1_nulls','s_list_1_nulls' ])

Unnamed: 0,col_name,nulls_proportion
0,host_location,0.306834
1,host_response_time,13.138075
2,host_response_rate,13.138075
3,host_acceptance_rate,13.138075
4,host_neighbourhood,9.456067
5,city,0.055788
6,zipcode,1.059972
7,market,0.390516
8,property_type,0.083682
9,bathrooms,0.390516

Unnamed: 0,col_name,nulls_proportion
0,host_since,0.052383
1,host_location,0.209534
2,host_response_time,13.698271
3,host_response_rate,13.698271
4,host_acceptance_rate,20.246202
5,host_is_superhost,0.052383
6,host_neighbourhood,7.857517
7,host_listings_count,0.052383
8,host_has_profile_pic,0.052383
9,host_identity_verified,0.052383


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
### Comments:  
[Boston & Seatle Listings]
- Boston listings size : `3585`, `95`
- Seatle listings size : `3818`, `92`
- Number of Non-null cols in Boston listings:  `51`, around half
- Number of Non-null cols in Seatle listings:  `47`, around half
- Wrote a series of functions that commenced some basic cleaning to ease analysis, with the option to switch off any of them depending on the future requirements of the analyses, some of what was done:
<br><br>
>- columns with overwhelming nulls or little to no forseeable use or that only existed in either of the tables were removed (will be checked again depending on the questions) 
>- Took the charachter length of the values in some of the cols with long text and massive unique values, possibly  the length of some fields maybe correlated with price or rentability.
>- Columns with dates are transformed into Datetime, numerical values that were in text to floats
>- Columns `amenities`and `host_verifications`were taken as counts as I am not very aware with the weights of each item within (will be checked again depending on the questions) 
>- `maximum_nights`column seems to lack some integrity so I divided it to week or less and more than a week as I found the average stayt to be with in a week time.
- This basic exploration wasn't free of question marks such as:
<br><br>
>- What is `review_score_rating`?
>- what to do with `market` as it has 5 values in Boston but only 1 in seatle?
>- Would `calendar_updated`be of any use? Not sure but I decided to leave it and see.

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

## Step 1: Continue - 

> **Boston & Seatle Reviews**

In [43]:
#b_rev.head(3)
s_rev.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb..."


 ### Check the sizes of cols & rows & check Nulls

In [44]:
print_side_by_side("Boston reviews size:", b_rev.shape,"Seatle reviews size:", s_rev.shape)
print_side_by_side("Number of Non-null cols in Boston Reviews:",  np.sum(b_rev.isnull().sum()==0), 
"Number of Non-null cols in Seatle Reviews:",  np.sum(s_rev.isnull().sum()==0))
print_side_by_side("Null cols % in Boston:", (b_rev.isnull().sum()[b_rev.isnull().sum()>0]/b_rev.shape[0]*100).to_string(),
"Null cols % in Seatle:", (s_rev.isnull().sum()[s_rev.isnull().sum()>0]/s_rev.shape[0]*100).to_string())
print_side_by_side("Null cols no. in Boston:",(b_rev.isnull().sum()[b_rev.isnull().sum()>0]).to_string(),
"Null cols no. in Seatle:", (s_rev.isnull().sum()[s_rev.isnull().sum()>0]).to_string())

Boston reviews size:  (68275 6)                          Seatle reviews size:  (84849 6)
Number of Non-null cols in Boston Reviews:  5            Number of Non-null cols in Seatle Reviews:  5
Null cols % in Boston:  comments    0.077627             Null cols % in Seatle:  comments    0.021214
Null cols no. in Boston:  comments    53                 Null cols no. in Seatle:  comments    18


### Transforming text in comments column to numerical score

##### Find all the words in each Dataset

In [48]:
#%%time
# def get_words(df, col):
#     """
#     INPUT
#     df -pandas dataframe
#     col -column of which the values are text 
    
#     OUTPUT
#     df - a dataframe with a single colum of all the words 
#     """
#     all_strings = []
#     for val in df[col]:
#         try:
#             val_strings = [''.join(filter(str.isalnum, i.lower())) for i in val.split() if len(i)>3]
#         except:
#             continue
#         for word in val_strings:
#             if word not in all_strings:
#                 all_strings.append(word)
#         val_strings = []
#     return pd.Series(all_strings).to_frame().reset_index(drop = True).rename(columns = {0:'words'})
# boston_words = get_words(b_rev, 'comments')
# seatle_words = get_words(s_rev, 'comments')
# boston_words.to_csv('boston_words.csv')
# seatle_words.to_csv('seatle_words.csv')
print("Boston words size: ", boston_words.shape[0])
print("Seatle words size: ", seatle_words.shape[0])

Boston words size:  54261
Seatle words size:  50627


##### As the previous function took 4 mins to execute, I commented it out and  passed the resulted words dfs to CSV files that were added to the project instead of running it in the notebook again. 

In [49]:
boston_words = pd.read_csv('boston_words.csv', index_col= 0)
seatle_words = pd.read_csv('seatle_words.csv', index_col= 0)
display_side_by_side(boston_words.head(5), seatle_words.head(5), titles = [ 'Boston', 'Seatle'])

Unnamed: 0,words
0,stay
1,islams
2,place
3,really
4,cool

Unnamed: 0,words
0,cute
1,cozy
2,place
3,perfect
4,location


### Citation:
* Using this resource  https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon I downloaded a list of words with positive and negative connotations used for sentiment analysis
* *Based on the book*:  
> Sentiment Analysis and Opinion Mining (Introduction and Survey), Morgan & Claypool, May 2012.

##### Add a scores column per review in each DataFrame using the previous resource as a reference to evaulate each review

In [65]:
positive_words = pd.read_csv('positive-words.txt', sep = '\t')
negative_words = pd.read_csv('negative-words.txt', sep = '\t')
positive_words = positive_words.iloc[29:,:].reset_index(drop = True).rename(columns = {';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;':'words'})
negative_words = negative_words.iloc[31:,:].reset_index(drop = True).rename(columns = {';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;':'words'})
b_pos = np.intersect1d(np.array(boston_words['words'].astype(str)), np.array(positive_words['words']),assume_unique=True)
b_neg = np.intersect1d(np.array(boston_words['words'].astype(str)), np.array(negative_words['words']),assume_unique=True)
s_pos = np.intersect1d(np.array(seatle_words['words'].astype(str)), np.array(positive_words['words']),assume_unique=True)
s_neg = np.intersect1d(np.array(seatle_words['words'].astype(str)), np.array(negative_words['words']),assume_unique=True)
print_side_by_side('Positive words count: ', positive_words.shape[0]
,'Negative words count: ', negative_words.shape[0])
print_side_by_side("No. of positive words in Boston Reviews: ", len(b_pos)
,"No. of negative words in Boston Reviews: ", len(b_neg))
print_side_by_side("No. of positive words in Seatle Reviews: ", len(s_pos)
,"No. of negative words in Seatle Reviews: ", len(s_neg))
    
# def create_scores(df,col, df_pos_array, df_neg_array):
#     """
#     INPUT
#     df -pandas dataframe
#     col -column with text reviews to be transformed in to positive and negative scores
#     pos_array- array with reference positive words for the passed df
#     neg_array- array with reference negative words for the passed df

#     OUTPUT
#     df - a dataframe with a score column containing positive and negative scores"
#     """
#     def get_score(val):
#         val_strings = [''.join(filter(str.isalnum, i.lower())) for i in str(val).split() if len(i)>3]
#         pos_score = len(np.intersect1d(np.array(val_strings).astype(object), df_pos_array, assume_unique =True))
#         neg_score = len(np.intersect1d(np.array(val_strings).astype(object), df_neg_array, assume_unique =True))
#         return pos_score - neg_score
#     df['score']= df[col].apply(get_score)
#     return df

# b_rev_1 = create_scores(b_rev, 'comments', b_pos, b_neg)
# s_rev_1 = create_scores(s_rev, 'comments', s_pos, s_neg)
# b_rev_1.to_csv('boston_reviews_score.csv')
# s_rev_1.to_csv('seatle_reviews_score.csv')

Positive words count:   2005                             Negative words count:   4781
No. of positive words in Boston Reviews:   1147          No. of negative words in Boston Reviews:   1507
No. of positive words in Seatle Reviews:   1235          No. of negative words in Seatle Reviews:   1556


##### As this function takes a while as well, I write to csv files and read the frame again.

In [51]:
b_rev_score = pd.read_csv('boston_reviews_score.csv', index_col = 0)
s_rev_score = pd.read_csv('seatle_reviews_score.csv', index_col = 0)
sub_b_rev = b_rev_score.iloc[:,[5,6]]
sub_s_rev = s_rev_score.iloc[:,[5,6]]
display_side_by_side(sub_b_rev.head(3), sub_s_rev.head(3), titles= ['Boston Reviews', 'Seatle_reviews'])

Unnamed: 0,comments,score
0,"My stay at islam's place was really cool! Good location, 5min away from subway, then 10min from downtown. The room was nice, all place was clean. Islam managed pretty well our arrival, even if it was last minute ;) i do recommand this place to any airbnb user :)",6
1,Great location for both airport and city - great amenities in the house: Plus Islam was always very helpful even though he was away,2
2,We really enjoyed our stay at Islams house. From the outside the house didn't look so inviting but the inside was very nice! Even though Islam himself was not there everything was prepared for our arrival. The airport T Station is only a 5-10 min walk away. The only little issue was that all the people in the house had to share one bathroom. But it was not really a problem and it worked out fine. We would recommend Islams place for a stay in Boston.,3

Unnamed: 0,comments,score
0,Cute and cozy place. Perfect location to everything!,3
1,"Kelly has a great room in a very central location. \r\nBeautiful building , architecture and a style that we really like. \r\nWe felt guite at home here and wish we had spent more time.\r\nWent for a walk and found Seattle Center with a major food festival in progress. What a treat.\r\nVisited the Space Needle and the Chihuly Glass exhibit. Then Pikes Place Market. WOW. Thanks for a great stay.",5
2,"Very spacious apartment, and in a great neighborhood. This is the kind of apartment I wish I had!\r\n\r\nDidn't really get to meet Kelly until I was on my out, but she was always readily available by phone. \r\n\r\nI believe the only ""issue"" (if you want to call it that) was finding a place to park, but I sincerely doubt its easy to park anywhere in a residential area after 5 pm on a Friday",4


In [66]:
print_side_by_side('Maximum score in Boston : ', b_rev_score.iloc[b_rev_score.score.idxmax()].score
,'Minimum Score in Boston : ', b_rev_score.iloc[b_rev_score.score.idxmin()].score)
print_side_by_side('Maximum Score in Seatle : ', s_rev_score.iloc[s_rev_score.score.idxmax()].score
,'Minimum Score in Seatle : ', s_rev_score.iloc[s_rev_score.score.idxmin()].score)
print_side_by_side('Most common score in Boston: ', b_rev_score['score'].mode().to_string(),
'Most common score in Seatle: ', s_rev_score['score'].mode().to_string())
print_side_by_side('Mean score in Boston: ', round(b_rev_score['score'].mean(),2)
,'Mean score in Seatle: ', round(s_rev_score['score'].mean(),2))
print_side_by_side('Median common score in Boston: ',round( b_rev_score['score'].median(),2),
'Median common score in Seatle: ', s_rev_score['score'].median())
print_side_by_side('Standard deviation of score in Boston: ', round(b_rev_score['score'].std(),2)
,'Standard deviation of score in Seatle: ', round(s_rev_score['score'].std(),2))
# print('Score: ', s_rev_score.iloc[s_rev_score.score.idxmax()].score)
# s_rev_score.iloc[s_rev_score.score.idxmax()].comments

Maximum score in Boston :   33                           Minimum Score in Boston :   -17
Maximum Score in Seatle :   38                           Minimum Score in Seatle :   -16
Most common score in Boston:   0    4                    Most common score in Seatle:   0    4
Mean score in Boston:   4.69                             Mean score in Seatle:   5.51
Median common score in Boston:   4.0                     Median common score in Seatle:   5.0
Standard deviation of score in Boston:   3.28            Standard deviation of score in Seatle:   3.3


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

### Comments:  
[Boston & Seatle Reviews]
- Boston reviews size : (68275, 6)
- Seatle reviews size : (84849, 6)
- Nulls are only in `comments`columns in both Datasets: 
- Null percentage in Boston Reviews:  0.077627%
- Null percentage in Seatle Reviews: 0.021214%
- I added a score column to both tables to reflect positive or negative reviews numerically with the aid of an external resource.

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 _______________________________________________________________________________________________________________________

## Task 2: Data Understanding

### Step 1: Formulating Questions

In [None]:
#DISTRIBUTION OF AVERAGE PRICES PER CITY
#DISTRIBUTION OF REVIEWS PRICES PER CITY 
#(CORRELATION) #SCATTERS

#rentability needs ti be defined 
#AVERAGE STDs OF ALL STDS OF THE UNITS PER CITY AND RELATION WITH REVIEWS???

#SCATTER PLOT NO. OF REVIEWS PER LISTING AND STANDARD DEV OF PRICE (CONROLLED FOR OF STAYS)
NUM,NUM,CAT

#Amenities and price 
#How do prices vary through the year in both cities ? when is the season or off season in both cities ?
#Is there a general upward trend of both new Airbnb listings and total Airbnb visitors to Seattle?

#rentability and amenities, rentability and reviews, rentability and superhost, 
#price and super host

#predict rentability in both cities 
#predict price in both cities 

#lens and rentability
#data from Seattle and Boston AirBNB homes can be used to understand how much AirBNB homes are earning in certain time frames and areas.

In [153]:
b_list_1.columns

Index(['id', 'host_since', 'host_location', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'zipcode', 'market', 'smart_location',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price',
       'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
       'maximum_nights', 'calendar_updated', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'number_of_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable', 'canc