## Some useful packages, self-built functions and plotly functions

In [1]:
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings

In [2]:
from collections import Counter
from scipy.stats.stats import pearsonr
from string import ascii_letters

In [3]:
# Plotting
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly import tools
# Initialise plotly's offline mode so figures can be rendered inside the notebook.
init_notebook_mode(connected=True)

In [4]:
# Show up to 500 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from pandas/scipy.
warnings.filterwarnings('ignore')

In [5]:
%matplotlib inline

### Import files

In [9]:
# Collect every CSV file in the current working directory.
# NOTE(review): glob returns matches in arbitrary (filesystem-dependent) order,
# so the position of a file in this list is not stable across machines.
csvs = glob.glob('*.csv')

In [10]:
# Display the CSV paths that were found.
csvs

['reviews.csv', 'listings.csv', 'calendar.csv']

In [11]:
# Load the listings table by file name instead of by its position in `csvs`:
# glob returns files in arbitrary order, so `csvs[1]` is only 'listings.csv'
# by coincidence on the machine that produced the recorded output.
base = pd.read_csv('listings.csv')
# Work on a copy so the raw frame kept in `base` stays untouched.
listings_df = base.copy()
listings_df.head(2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,,,,,https://a1.muscache.com/ac/pictures/67560560/c...,,956883,https://www.airbnb.com/users/show/956883,Maija,2011-08-11,"Seattle, Washington, United States","I am an artist, interior designer, and run a s...",within a few hours,96%,100%,f,https://a0.muscache.com/ac/users/956883/profil...,https://a0.muscache.com/ac/users/956883/profil...,Queen Anne,3.0,3.0,"['email', 'phone', 'reviews', 'kba']",t,t,"Gilman Dr W, Seattle, WA 98119, United States",Queen Anne,West Queen Anne,Queen Anne,Seattle,WA,98119,Seattle,"Seattle, WA",US,United States,47.636289,-122.371025,t,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",,$85.00,,,,,2,$5.00,1,365,4 weeks ago,t,14,41,71,346,2016-01-04,207,2011-11-01,2016-01-02,95.0,10.0,10.0,10.0,10.0,9.0,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",What's up with the free pillows? Our home was...,"Convenient bus stops are just down the block, ...",https://a0.muscache.com/ac/pictures/14409893/f...,https://a0.muscache.com/im/pictures/14409893/f...,https://a0.muscache.com/ac/pictures/14409893/f...,https://a0.muscache.com/ac/pictures/14409893/f...,5177328,https://www.airbnb.com/users/show/5177328,Andrea,2013-02-21,"Seattle, Washington, United States",Living east coast/left coast/overseas. Time i...,within an hour,98%,100%,t,https://a0.muscache.com/ac/users/5177328/profi...,https://a0.muscache.com/ac/users/5177328/profi...,Queen Anne,6.0,6.0,"['email', 'phone', 'facebook', 'linkedin', 're...",t,t,"7th Avenue West, Seattle, WA 98119, United States",Queen Anne,West Queen Anne,Queen Anne,Seattle,WA,98119,Seattle,"Seattle, WA",US,United States,47.639123,-122.365666,t,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",,$150.00,"$1,000.00","$3,000.00",$100.00,$40.00,1,$0.00,2,90,today,t,13,13,16,291,2016-01-04,43,2013-08-19,2015-12-29,96.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48


### Create a new metric
    -  new_score_reviews2: a feature defined as (reviews_per_month * review_scores_rating) / 10
    -  a way to identify listings that have both more completed orders than others and great reviews from guests

In [13]:
# Composite score: monthly review volume weighted by the average rating.
# Missing values are filled on the computed Series *before* assignment. Calling
# fillna(..., inplace=True) on a column selection operates on a temporary object
# under pandas copy-on-write and leaves the DataFrame unchanged.
listings_df['new_score_reviews2'] = (
    listings_df['reviews_per_month'] * listings_df['review_scores_rating'] / 10
).fillna(0)

#### Decide the threshold of Top Performer and Low Performer listings

In [14]:
# Score cut-offs: 90th percentile (top performers) and 25th percentile (low performers).
top90flag = listings_df.new_score_reviews2.quantile(q=0.9)
upto25flag = listings_df.new_score_reviews2.quantile(q=0.25)

In [15]:
# Boolean flags: score at or above the 90th-percentile cut-off, and score at or
# below the 25th-percentile cut-off.
listings_df['top90'] = listings_df['new_score_reviews2'].ge(top90flag)
listings_df['upto25'] = listings_df['new_score_reviews2'].le(upto25flag)

### Visualize the defined class

In [18]:
def rangeScore(x):
    '''
    Map a score to the label of its 10-point bin.

    Scores in [0, 100) get the label 'lo-hi' of the half-open bin [lo, hi)
    they fall into ('0-10', '10-20', ..., '90-100'). Scores of 100 or more
    get '100+'. A value that matches no bin (a negative number or NaN)
    yields an empty string.
    '''
    if x >= 100:
        return '100+'

    # Walk the ten half-open bins [0, 10), [10, 20), ..., [90, 100).
    for lower in range(0, 100, 10):
        upper = lower + 10
        if x >= lower and x < upper:
            return '{}-{}'.format(lower, upper)

    return ''

In [19]:
# Label every listing with the 10-point bin its composite score falls into.
listings_df['score_ranges'] = listings_df['new_score_reviews2'].map(rangeScore)

In [21]:
# Peek at the first five bin labels.
listings_df.score_ranges.head(5)

0    30-40
1    10-20
2    10-20
3     0-10
4     0-10
Name: score_ranges, dtype: object

In [36]:
# table coloring purpose
top90 = listings_df.groupby('score_ranges', as_index=False)['top90'].max(key='count').rename(columns={'score_ranges':'Score'})
upto25 = listings_df.groupby('score_ranges', as_index=False)['upto25'].max(key='count').rename(columns={'score_ranges':'Score'})

In [None]:
# Exploratory cell: evaluates to the GroupBy object only; no aggregation is
# applied here and the result is not stored.
listings_df.groupby('score_ranges', as_index=False)

In [40]:
# Per score bin: does any listing in the bin reach the top-90% flag?
listings_df.groupby('score_ranges', as_index=False)['top90'].agg('max')

Unnamed: 0,score_ranges,top90
0,0-10,False
1,10-20,False
2,100+,True
3,20-30,False
4,30-40,False
5,40-50,True
6,50-60,True
7,60-70,True
8,70-80,True
9,80-90,True


In [42]:
# Per score bin: does any listing in the bin reach the top-90% flag?
# `key` is not a parameter of groupby max(); it is dropped so the cell runs on
# current pandas instead of raising TypeError.
listings_df.groupby('score_ranges', as_index=False)['top90'].max()

Unnamed: 0,score_ranges,top90
0,0-10,False
1,10-20,False
2,100+,True
3,20-30,False
4,30-40,False
5,40-50,True
6,50-60,True
7,60-70,True
8,70-80,True
9,80-90,True


In [None]:
def correction(x):
    '''
    Convert a currency string such as '$1,000.00' to a float.

    Parameters
    ----------
    x : any
        A cell value; strings are expected to look like '$85.00'.

    Returns
    -------
    float when x is a string (with '$' and ',' removed); otherwise x is
    returned unchanged, so non-string values such as NaN pass through.

    Raises
    ------
    ValueError
        If the cleaned string is not a valid float literal.
    '''
    # isinstance also accepts str subclasses (e.g. numpy.str_), which the
    # previous exact-type comparison silently skipped.
    if isinstance(x, str):
        return float(x.replace('$', '').replace(',', ''))

    return x

In [None]:
def correction2(x):
    '''
    Convert a percentage string such as '96%' to a fraction (0.96).

    Parameters
    ----------
    x : any
        A cell value; strings are expected to look like '96%'.

    Returns
    -------
    float in fraction form when x is a string; otherwise x is returned
    unchanged, so non-string values such as NaN pass through.

    Raises
    ------
    ValueError
        If the string without '%' is not a valid float literal.
    '''
    if isinstance(x, str):
        # The cleaned text must be converted to a number before dividing;
        # dividing the string itself by 100 raises TypeError.
        return float(x.replace('%', '')) / 100

    return x

In [None]:
def to_int(x):
    '''
    Encode a 't'/'f' flag as an integer.

    Returns 1 only when x equals 't'. 'f' and every other value
    (including missing values) give 0.
    '''
    return 1 if x == 't' else 0

In [None]:
def changeTime(x):
    '''
    Encode a host_response_time label as an hour count.

    The count is returned as a string ('1', '4', '24' or '48'). Any value
    that is not one of the four known labels, including missing values,
    is encoded as '96'.
    '''
    hours_by_label = (
        ('within an hour', '1'),
        ('within a few hours', '4'),
        ('within a day', '24'),
        ('a few days or more', '48'),
    )
    for label, hours in hours_by_label:
        if x == label:
            return hours

    return '96'

In [None]:
def changeStr(x):
    '''
    Decode an hour count back into its host_response_time label.

    1, 4, 24 and 48 map to their response-time labels and 96 maps to
    'Not Response'. Any other value is returned unchanged.
    '''
    labels_by_hours = (
        (1, 'within an hour'),
        (4, 'within a few hours'),
        (24, 'within a day'),
        (48, 'a few days or more'),
        (96, 'Not Response'),
    )
    for hours, label in labels_by_hours:
        if x == hours:
            return label

    return x

In [None]:
def createAmenities(x):
    '''
    Split a raw amenities string into a list of amenity tokens.

    Braces and double quotes are removed, spaces inside an amenity name
    become underscores, and commas act as the separators between amenities.
    '''
    cleaned = x
    for char in '{}"':
        cleaned = cleaned.replace(char, '')

    # Underscore the in-name spaces first, then turn the comma separators into
    # whitespace so a plain split() yields one token per amenity.
    spaced = cleaned.replace(' ', '_').replace(',', ' ')
    return spaced.split()