In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.tools.eval_measures as ste
import sklearn.linear_model as slm

# Columns kept for the analysis, listed in the order they should appear in
# `full_data`.
listing_columns = [
    'id',
    'host_since',
    'host_response_time',  # suggest to factor into four levels
    'host_is_superhost',
    'host_total_listings_count',
    'latitude',
    'longitude',
    'is_location_exact',
    'room_type',  # seems relevant if it could be reduced
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'price',
    'security_deposit',
    'cleaning_fee',
    'guests_included',
    'extra_people',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    # NOTE(review): "ltm" presumably means "last twelve months", i.e. reviews
    # received in the past year rather than all-time -- confirm against the
    # dataset's data dictionary.
    'number_of_reviews_ltm',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable',
    'cancellation_policy',
    'calculated_host_listings_count',
    'reviews_per_month',
]

# Load the tab-separated listings file. `usecols` makes the parser skip every
# column that is not needed instead of loading the whole file and discarding
# most of it afterwards. read_csv ignores the order of `usecols`, so the frame
# is re-indexed with the same list to keep the column order declared above.
full_data = pd.read_csv(
    'airbnb-seattle-listings-train.csv.bz2',
    sep='\t',
    usecols=listing_columns,
)[listing_columns]

In [2]:
#regex to clean currency data: strip '$' and ',' and parse the amounts as floats
#(raw string, so that '\$' is not an invalid escape sequence in the literal)
for money_col in ['price', 'security_deposit', 'cleaning_fee', 'extra_people']:
    full_data[money_col] = (
        full_data[money_col].replace(r'[\$,]', '', regex=True).astype(float)
    )

#converts 't'/'f' flags into 1/0; every other value, including missing ones,
#becomes 0. Accepting 1 as well keeps this cell idempotent: comparing the
#already-converted integer column with 't' alone would reset every flag to 0
#when the cell is run a second time.
for flag_col in ['host_is_superhost', 'instant_bookable']:
    full_data[flag_col] = full_data[flag_col].isin(['t', 1]).astype('int')

In [3]:
#function and code to convert ordinal cancellation policy string data to integer

def cancelpol(x):
    """Encode a cancellation-policy label as an ordinal number.

    Parameters
    ----------
    x : the raw policy label.

    Returns
    -------
    0 for 'flexible', 1 for 'moderate', 2 for 'strict', and 3 for
    'strict_14_with_grace_period', 'super_strict_30' and 'super_strict_60'.
    Any other value yields ``np.nan``.
    """
    # The three strictest labels deliberately share the top level.
    policy_rank = {
        'flexible': 0,
        'moderate': 1,
        'strict': 2,
        'strict_14_with_grace_period': 3,
        'super_strict_30': 3,
        'super_strict_60': 3,
    }
    return policy_rank.get(x, np.nan)

# Replace each cancellation-policy label with its ordinal code from cancelpol.
full_data['cancellation_policy'] = full_data['cancellation_policy'].apply(cancelpol)

In [4]:
#helper function and code to convert ordinal response time string data to integer

def responsetime(x):
    """Encode a host response-time label as an ordinal number.

    Parameters
    ----------
    x : the raw response-time label.

    Returns
    -------
    0 for 'within an hour', 1 for 'within a few hours', 2 for 'within a day'
    and 3 for 'a few days or more'. Any other value yields ``np.nan``.
    """
    speed_rank = {
        'within an hour': 0,
        'within a few hours': 1,
        'within a day': 2,
        'a few days or more': 3,
    }
    return speed_rank.get(x, np.nan)

# Replace each response-time label with its ordinal code from responsetime.
full_data['host_response_time'] = full_data['host_response_time'].apply(responsetime)