In [53]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import GradientBoostingRegressor

import matplotlib.pyplot as plt

# Pre-Processed dataset 

In [2]:
# Columns to load from the raw listings file. The trailing comments are the
# author's per-column preprocessing notes; not every note is implemented by
# the packaging cell further down (several "ohe" columns are passed through raw).
columns = [
'summary', # text, TF-IDF
'description', # text, TF-IDF
'neighborhood_overview', # text, TF-IDF
'transit', # text, TF-IDF
'host_about', # text, TF-IDF

'host_since', # date, converted to number of months on the platform

'host_verifications', # string of terms; NOTE(review): loaded but not used by the packaging cell
'amenities', # string of terms, bag-of-words counts

'host_response_rate', # string, strip the % sign, convert to integer
'host_acceptance_rate', # string, strip the % sign, convert to integer
'cleaning_fee', # string, strip the $ sign, convert to float (missing -> 0)

'zipcode',  # string, one-hot encoded
'property_type', # string, one-hot encoded with rare levels grouped
'room_type', # string, one-hot encoded
'accommodates', # int
'bathrooms', # float
'bedrooms', # float, convert to integer; NOTE(review): loaded but not used by the packaging cell
'beds', # float, convert to integer
'bed_type', # string, ohe
'guests_included', # int, ohe, anything that is below 1% can go
'minimum_nights', # int, ohe, anything that is below 1% can go
'maximum_nights', # int, ohe, anything that is below 1% can go
'number_of_reviews', # int, ohe, anything that is below 1% can go
'review_scores_rating', # float, do nothing for now
'review_scores_accuracy', # float, convert to integer, ohe
'review_scores_cleanliness', # float, convert to integer, ohe
'review_scores_checkin', # float, convert to integer, ohe
'review_scores_communication', # float, convert to integer, ohe
'review_scores_location', # float, convert to integer, ohe
'review_scores_value', # float, convert to integer, ohe
'instant_bookable', # str t/f, binary encoding
'cancellation_policy', # str, 4 categories, ohe
'calculated_host_listings_count', # int, ohe
'reviews_per_month', # float, maybe ohe, there are like 500+ unique values

'price' # target variable
]

# Read only the columns listed above from the training file.
data = pd.read_csv('data/Train.csv',usecols=columns)

# TF-IDF transformation

In [34]:
def tfidf_columns(df,columns):
    """Build one TF-IDF feature frame from several free-text columns.

    Each column is vectorised independently with its own ``TfidfVectorizer``
    (English stop words, lower-casing, ``min_df=40``, ``max_df=500``) and the
    resulting term columns are named ``<column>_<term>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is read but never modified.
    columns : iterable of str
        Names of the text columns to vectorise.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF features indexed like ``df``. Has zero columns when
        ``columns`` is empty.
    """
    frames = []

    for column in columns:
        tfidf = TfidfVectorizer(min_df=40, stop_words='english', lowercase=True, max_df=500)
        # Fill missing text on a derived Series so the caller's frame is left
        # untouched (the vectorizer cannot process NaN).
        text = df[column].fillna('none')
        matrix = tfidf.fit_transform(text)
        # get_feature_names() was removed in scikit-learn 1.2; keep a fallback
        # for releases that predate get_feature_names_out().
        if hasattr(tfidf, 'get_feature_names_out'):
            terms = tfidf.get_feature_names_out()
        else:
            terms = tfidf.get_feature_names()
        frames.append(
            pd.DataFrame(
                matrix.toarray(),
                index=df.index,  # keep row labels so a later concat(axis=1) aligns
                columns=[column + '_' + term for term in terms],
            )
        )

    if not frames:
        # pd.concat raises on an empty list; return an empty feature block instead.
        return pd.DataFrame(index=df.index)
    return pd.concat(frames, axis=1)

# Free-text listing fields that are turned into TF-IDF features.
text_columns = ['summary', 'description', 'neighborhood_overview', 'transit', 'host_about']

# Number of months listed on platform

In [4]:
def host_since(df, now='2020-07-22'):
    """Convert the ``host_since`` date into whole months on the platform.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``host_since`` column parseable by ``pd.to_datetime``.
        It is not modified.
    now : str or pandas.Timestamp, default '2020-07-22'
        Reference date the tenure is measured against.

    Returns
    -------
    pandas.DataFrame
        Single integer column ``months_active`` indexed like ``df``.

    Raises
    ------
    ValueError
        If ``host_since`` contains missing dates (NaT cannot be cast to int).
    """
    started = pd.to_datetime(df['host_since'])
    elapsed = pd.Timestamp(now) - started
    # np.timedelta64(1, 'M') is an ambiguous unit that recent pandas releases
    # refuse to convert; spell out the same average month explicitly
    # (365.2425 / 12 days = 30.436875 days = 2,629,746 seconds).
    average_month = pd.Timedelta(seconds=2629746)
    months = (elapsed / average_month).astype('int')  # truncates toward zero
    return months.to_frame('months_active')

# Amenities 

In [5]:
def amenities(df, column):
    """Turn a delimited amenities string into bag-of-words count features.

    Braces, double quotes and slashes are removed and spaces are dropped so
    that multi-word amenities collapse into a single token before counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is read but never modified.
    column : str
        Name of the column holding the amenities string.

    Returns
    -------
    pandas.DataFrame
        One count column per token found by ``CountVectorizer``, indexed
        like ``df``.
    """
    cleaned = (
        df[column]
        .fillna('')  # the vectorizer cannot process NaN; treat it as "no amenities"
        # regex=True must be explicit: the default became False in pandas 2.0,
        # which would make this alternation pattern a literal string.
        .str.replace(r'{|"|}|/', '', regex=True)
        .str.replace(' ', '', regex=False)
    )
    vector = CountVectorizer()
    counts = vector.fit_transform(cleaned)
    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback
    # for releases that predate get_feature_names_out().
    if hasattr(vector, 'get_feature_names_out'):
        names = vector.get_feature_names_out()
    else:
        names = vector.get_feature_names()
    # Keep the row labels so a later concat(axis=1) aligns with the source rows.
    return pd.DataFrame(counts.toarray(), index=df.index, columns=names)

# OHE

In [6]:
def ohe(df,column,prefix,rare=False):
    """One-hot encode a string column via ``CountVectorizer``.

    Spaces and the characters ``-``, ``&`` and ``/`` are stripped so every
    category becomes a single token, missing values are replaced by the most
    frequent category, and each resulting token becomes a count column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is read but never modified.
    column : str
        Name of the string column to encode.
    prefix : str
        Text prepended to every generated column name.
    rare : bool, default False
        When truthy, categories whose relative frequency is below 1% are
        merged into a single ``'other'`` level before encoding.

    Returns
    -------
    pandas.DataFrame
        Encoded columns named ``prefix + token``, indexed like ``df``.
    """
    values = (
        df[column]
        .str.replace(" ", "", regex=False)
        # regex=True must be explicit: the default became False in pandas 2.0,
        # which would make this alternation pattern a literal string.
        .str.replace(r'-|&|/', '', regex=True)
    )
    # Impute on the derived Series instead of writing back into the caller's frame.
    most_frequent_value = values.value_counts().idxmax()
    values = values.fillna(most_frequent_value)

    if rare:
        share = values.map(values.value_counts(normalize=True))
        values = values.mask(share < 0.01, 'other')

    vectorizer = CountVectorizer()
    encoded = vectorizer.fit_transform(values)
    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback
    # for releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()

    return pd.DataFrame(
        encoded.toarray(),
        index=df.index,  # keep row labels so a later concat(axis=1) aligns
        columns=[prefix + name for name in names],
    )

# NaN replacer

In [7]:
def replace_nan(df, column):
    """Impute missing values in one column with its most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is read but never modified.
    column : str
        Name of the column to impute.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``column``, indexed like ``df``. If the
        column holds no non-missing value it is returned unchanged.
    """
    counts = df[column].value_counts()
    if counts.empty:
        # Entirely missing column: there is no mode to impute with.
        return df[[column]].copy()
    # fillna returns a new Series; the chained inplace form mutated the caller's
    # frame and has no effect at all under pandas Copy-on-Write.
    return df[column].fillna(counts.idxmax()).to_frame()
# Most-frequent-value imputation of 'beds', returned as a one-column frame.
# NOTE(review): `beds` is not referenced by the later visible cells; the
# packaging cell calls replace_nan(data, 'beds') again on its own.
beds = replace_nan(data, 'beds')

# Percentage columns (strip the % sign)

In [8]:
def remove_pct(df, column):
    """Convert a percentage string column such as ``'90%'`` to integers.

    Missing entries are replaced by the most frequent value before casting.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is read but never modified.
    column : str
        Name of the percentage string column.

    Returns
    -------
    pandas.DataFrame
        Single integer column named ``column``, indexed like ``df``.

    Raises
    ------
    ValueError
        If the column holds no non-missing value (no mode to impute with).
    """
    rates = df[column].str.replace("%", "", regex=False)
    most_frequent_value = rates.value_counts().idxmax()
    # Assign the filled Series back explicitly: a chained
    # `frame[col].fillna(..., inplace=True)` acts on a temporary object.
    rates = rates.fillna(most_frequent_value)
    return rates.astype(int).to_frame()

# Cleaning_fee

In [9]:
def cleaning_fee(df, column):
    """Parse a currency string column such as ``'$1,200.00'`` into floats.

    Missing fees are treated as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is read but never modified.
    column : str
        Name of the currency string column.

    Returns
    -------
    pandas.DataFrame
        Single float column named ``column``, indexed like ``df``.
    """
    fees = (
        df[column]
        # '$' is a regex metacharacter, so request literal replacement explicitly.
        .str.replace("$", "", regex=False)
        # Strip thousands separators too (as pre_proc_price does); otherwise
        # a value like '1,200.00' cannot be cast to float.
        .str.replace(",", "", regex=False)
    )
    # Cast first, then fill: missing values become NaN and are replaced by 0.
    return fees.astype(float).fillna(0.0).to_frame()

# Target_preprocessing

In [10]:
def pre_proc_price(df,column):
    """Parse a price string column such as ``'$1,250.00'`` into numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is read but never modified.
    column : str
        Name of the price string column.

    Returns
    -------
    pandas.DataFrame
        Single numeric column named ``column``, indexed like ``df``.
    """
    prices = (
        df[column]
        # '$' is a regex metacharacter, so request literal replacement explicitly.
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    # Use the requested column name rather than a hard-coded 'price', so the
    # helper works for any currency column.
    return pd.to_numeric(prices).to_frame()

# Packaging it all together

In [43]:
# Every feature block plus the target, computed eagerly from `data`. Despite
# the name, the list holds the resulting frames/Series, not callables.
# Order matters: the cell that builds X takes every column except the last,
# so the target produced by pre_proc_price must remain the final entry.
# NOTE(review): 'bedrooms' and 'host_verifications' are loaded into `data`
# but no entry below uses them.
funcs = [tfidf_columns(data,text_columns),
host_since(data),
amenities(data, 'amenities'),
ohe(data,'zipcode','zip_',False),
ohe(data,'property_type', 'pr_tp_', rare=True),
ohe(data,'room_type','rm_tp_',False),
data[['accommodates']],
replace_nan(data, 'bathrooms'),
replace_nan(data, 'beds'),
ohe(data,'bed_type','bd_tp_',True),
data[['guests_included']],
data[['minimum_nights']],
data[['maximum_nights']],
data[['number_of_reviews']],
replace_nan(data, 'review_scores_rating'),
replace_nan(data, 'review_scores_accuracy'),
replace_nan(data, 'review_scores_cleanliness'),
replace_nan(data, 'review_scores_checkin'),
replace_nan(data, 'review_scores_communication'),
replace_nan(data, 'review_scores_location'),
replace_nan(data, 'review_scores_value'),
data['instant_bookable'].replace({'f':0,'t':1}),  # a Series: 'f' -> 0, 't' -> 1
ohe(data,'cancellation_policy','canc_pol_',False),
data[['calculated_host_listings_count']],
replace_nan(data,'reviews_per_month'),
remove_pct(data, 'host_response_rate'),
remove_pct(data, 'host_acceptance_rate'),
cleaning_fee(data, 'cleaning_fee'),
pre_proc_price(data,'price')]  # target column, kept last

In [44]:
# Stitch all feature blocks side by side into the modelling table.
dataframes = list(funcs)
ml_df = pd.concat(objs=dataframes, axis=1)

In [49]:
# Features are every column except the last one; the target is 'price'.
X = ml_df.loc[:, ml_df.columns[:-1]]
y = ml_df.loc[:, ['price']]

In [54]:
# scikit-learn's GradientBoostingRegressor (the variable name notwithstanding).
# A fixed random_state makes the fit reproducible across Restart & Run All.
XGB = GradientBoostingRegressor(random_state=42)

In [55]:
# `y` is a one-column frame; hand the estimator a 1-D target so it does not
# have to reshape a column vector (and warn about it).
XGB.fit(X, y.values.ravel())

GradientBoostingRegressor()

In [58]:
# NOTE(review): in-sample score — evaluated on the same X, y that were passed
# to fit, so it says nothing about performance on unseen listings.
XGB.score(X,y)

0.8748988684030226

In [66]:
# One row per feature; the 'features' column holds the fitted model's importance.
x = pd.DataFrame(
    data=XGB.feature_importances_,
    index=X.columns,
    columns=['features'],
)

In [69]:
# Show the features ordered by importance, largest first.
x.sort_values(by='features',ascending=False)

Unnamed: 0,features
bathrooms,0.198337
accommodates,0.152263
cleaning_fee,0.054487
description_luxury,0.053014
description_months,0.032770
...,...
description_professionals,0.000000
description_professional,0.000000
description_privacy,0.000000
description_prime,0.000000
