In [23]:
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and chained-assignment
# warnings raised by the cells below are hidden as well -- consider narrowing it.
warnings.filterwarnings('ignore')
# Load the listing data; the path is relative to the notebook's working directory.
df_clean = pd.read_csv('data/cleanData.csv')

In [24]:
import pickle
# Load the fitted model that createModel will query through predict_proba.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only unpickle model files that come from a trusted source.
with open('randomForest.sav', 'rb') as f:
    rf = pickle.load(f)

In [25]:
# Example request passed to createModel.inputProcessing.  Every value is a
# string; an empty string means "not provided" and is skipped there.
input_dict = {
    # Provided for this example.
    'date': '2019-11-07',
    'zipcode': '02128',
    'propertyType': 'Apartment',
    'roomType': 'Entire home/apt',
    'accommodates': '2',
    # Left blank for this example.
    'guests_included': '',
    'extra_people': '',
    'bathrooms': '',
    'bedrooms': '',
    'beds': '',
    'security_deposit': '',
    'cleaning_fee': '',
    'prob_lower': '',
    # Provided for this example.
    'bedType': 'Real Bed',
}

In [26]:
class createModel():
    """Price suggestion helper built around a fitted classifier.

    The constructor turns the raw listing frame into a feature frame
    (``dfAll``), takes its first row as an editable single-row template
    (``testCase``) and records which feature columns are compared when
    looking for similar listings (``columns_needed``).

    Typical use: ``inputProcessing(input_dict)`` to fill the template,
    ``whole_process()`` to search for a price, ``testCaseInit()`` to reset.
    """

    def __init__(self, model, df_clean):
        """Store ``model`` (must offer ``predict_proba``) and prepare the data.

        ``df_clean`` is not modified.
        """
        self.model = model
        self.dfAll = self.dataProcessing(df_clean)
        self.testCase = self.testCaseInit()
        self.columns_needed = self.similar_columns_name()

    def dataProcessing(self, df):
        """Return the feature frame derived from ``df``.

        Steps: one-hot encode the categorical columns (first level dropped),
        drop the non-feature columns, up-sample rows with
        ``pull_distance == 0`` and ``unavailable == 0``, and finally drop the
        ``unavailable`` column.  The caller's frame is left untouched.
        """
        # assign() builds a new frame, so the caller's ``zipcode`` dtype is kept.
        df = df.assign(zipcode=df['zipcode'].astype('category'))
        to_categorical = ['zipcode', 'property_type', 'room_type', 'bed_type', 'peak_month']
        # Pin the dummy dtype: the pandas default differs between versions
        # (uint8 vs bool) and the template row is later edited with 0/1.
        dummies = pd.get_dummies(
            df[to_categorical],
            prefix=['zipcode_', 'propertyType_', 'roomType_', 'bedType_', 'peakMonth_'],
            drop_first=True,
            dtype=np.uint8,
        )
        df = pd.concat([df, dummies], axis=1)
        not_features = ['listing_id', 'date', 'dayWeek', 'month', 'host_since',
                        'city_daily', 'year', 'day'] + to_categorical
        df = df.drop(not_features, axis=1)

        np.random.seed(2019)
        at_pull_zero = df['pull_distance'] == 0
        negative_index = df[at_pull_zero & (df['unavailable'] == 0)].index
        n_positive = int(np.sum(df.loc[at_pull_zero, 'unavailable']))
        # Never ask for a negative sample count, and never sample from nothing.
        n_extra = max(n_positive - len(negative_index), 0)
        if n_extra > 0 and len(negative_index) > 0:
            upsample_index = np.random.choice(negative_index, n_extra)
            # negative_index holds index labels, hence .loc; DataFrame.append
            # no longer exists in pandas 2, hence concat.
            df = pd.concat([df, df.loc[upsample_index]])

        X = df.drop(['unavailable'], axis=1)
        return X

    def similar_columns_name(self):
        """Store and return the columns compared when searching similar rows.

        That is every ``dfAll`` column except ``price_daily``,
        ``host_length`` and ``pull_distance``.
        """
        excluded = {'price_daily', 'host_length', 'pull_distance'}
        self.columns_needed = [c for c in self.dfAll.columns if c not in excluded]
        return self.columns_needed

    def testCaseInit(self):
        """Reset ``testCase`` to a fresh copy of the first ``dfAll`` row.

        The two dummy columns below are cleared when they exist, as before.
        Returns the new single-row frame.
        """
        # Explicit copy: edits to the template must never reach dfAll.
        self.testCase = self.dfAll.iloc[[0]].copy()
        for col in ('bedType__Real Bed', 'zipcode__2128.0'):
            # Only touch existing columns; assigning to a missing name would
            # add a column that dfAll (and the fitted model) does not have.
            if col in self.testCase.columns:
                self.testCase[col] = 0
        return self.testCase

    def get_prob(self, price):
        """Return the model's last-class probability for ``testCase`` at ``price``.

        Side effect: ``testCase['price_daily']`` is overwritten with ``price``.
        """
        # Whole-column assignment works whatever dtype the column had.
        self.testCase['price_daily'] = price
        prob = self.model.predict_proba(self.testCase)[0][-1]  # use the model here
        return prob

    def optimization(self, lb, ub, prob_lower):
        """Scan prices ``lb, lb+1, ... <= ub`` and keep the best expected return.

        A price qualifies when its probability is at least ``prob_lower``;
        among qualifying prices the one maximising ``price * probability``
        wins (ties go to the higher price).  The result dict is stored in
        ``optima_sol`` and returned.
        """
        self.optima_sol = {'Suggested Price': 0, 'Probability of Getting Booked': 0, 'Expected Return': 0}
        for p in np.arange(lb, ub + 1, 1):
            prob = self.get_prob(p)
            if prob >= prob_lower and p * prob >= self.optima_sol['Expected Return']:
                self.optima_sol = {'Suggested Price': p,
                                   'Probability of Getting Booked': prob,
                                   'Expected Return': p * prob}
        if self.optima_sol['Suggested Price'] == 0:  # interpolate where we have little data points
            # Fit prob(p) = 1 - a * p**2 through (lb, prob(lb)) and solve it
            # for the price whose probability equals prob_lower.
            # NOTE(review): lb == 0 or prob(lb) == 1 makes this divide by zero.
            a = (1 - self.get_prob(lb)) / lb ** 2
            p = int(((1 - prob_lower) / a) ** (1 / 2))
            self.optima_sol = {'Suggested Price': p,
                               'Probability of Getting Booked': prob_lower,
                               'Expected Return': p * prob_lower}
        return self.optima_sol

    def _matching_prices(self, columns):
        """Return ``price_daily`` of the dfAll rows equal to testCase on ``columns``."""
        mask = np.ones(len(self.dfAll), dtype=bool)
        for col in columns:
            if col not in self.dfAll.columns or col not in self.testCase.columns:
                # A column that cannot be compared means "no match", so the
                # caller moves on to its next, looser search.
                return self.dfAll['price_daily'].iloc[0:0]
            target = self.testCase[col].values[0]
            # Plain arrays: dfAll may carry repeated index labels after
            # up-sampling, so label alignment is avoided on purpose.
            mask &= (self.dfAll[col] == target).values
        return self.dfAll.loc[mask, 'price_daily']

    def whole_process(self, prob_lower=0):
        """Choose a price range from similar listings, then optimise within it.

        The range is the 5%-95% quantile of ``price_daily`` over, in order of
        preference: rows matching ``testCase`` on every compared column; rows
        matching on every compared column except the zipcode dummies; all
        rows.  The bounds are stored in ``lb`` / ``ub``.
        """
        # Emptiness is checked explicitly instead of depending on an
        # exception from np.quantile to trigger the fallback.
        prices = self._matching_prices(self.columns_needed)
        if len(prices) == 0:
            without_zip = [c for c in self.columns_needed if c[:7] != 'zipcode']
            prices = self._matching_prices(without_zip)
        if len(prices) == 0:
            prices = self.dfAll['price_daily']
        self.lb, self.ub = np.quantile(prices.values, (0.05, 0.95))
        return self.optimization(self.lb, self.ub, prob_lower)

    def _set_one_hot(self, prefix, column):
        """Clear every ``<prefix>__*`` dummy in testCase, then set ``column``.

        When ``column`` is not one of those dummies (``None``, an unknown
        value, or the level dropped by ``drop_first``) the group simply stays
        all zero, so a previous category never lingers.
        """
        marker = prefix + '__'
        group = [c for c in self.testCase.columns if str(c).startswith(marker)]
        for col in group:
            self.testCase[col] = 0
        if column in group:
            self.testCase[column] = 1

    def _zipcode_column(self, value):
        """Return the testCase dummy column for zipcode text ``value``, or None."""
        try:
            number = float(value)
        except ValueError:
            return None
        # Levels may have been rendered as floats ('2128.0') or ints ('2128').
        candidates = ['zipcode__{}'.format(number)]
        if number.is_integer():
            candidates.append('zipcode__{}'.format(int(number)))
        for name in candidates:
            if name in self.testCase.columns:
                return name
        return None

    def inputProcessing(self, input_dict):
        """Write the non-empty entries of ``input_dict`` into ``testCase``.

        Numeric fields are stored as floats (``ValueError`` if not numeric).
        ``propertyType`` / ``roomType`` / ``bedType`` / ``zipcode`` select a
        dummy column.  ``date`` (``YYYY-MM-DD``) sets ``weekend`` (1 for
        Friday or Saturday, else 0) and the peak-month dummies.  Other keys
        and ``None`` / empty values are ignored.  Returns ``testCase``.
        """
        numeric_fields = ('accommodates', 'guests_included', 'extra_people', 'bathrooms',
                          'bedrooms', 'beds', 'security_deposit', 'cleaning_fee')
        categorical_fields = ('propertyType', 'roomType', 'bedType')
        for k, v in input_dict.items():
            if v is None:
                continue
            if not isinstance(v, str):
                v = str(v)
            if len(v) == 0:
                continue
            if k in numeric_fields:
                self.testCase[k] = float(v)
            elif k in categorical_fields:
                self._set_one_hot(k, '{}__{}'.format(k, v))
            elif k == 'zipcode':
                self._set_one_hot('zipcode', self._zipcode_column(v))
            elif k == 'date':
                parsed = datetime.strptime(v, '%Y-%m-%d')
                # weekday(): Monday is 0, so 4 and 5 are Friday and Saturday.
                # Unlike strftime('%A') this does not depend on the locale.
                self.testCase['weekend'] = int(parsed.weekday() in (4, 5))
                monthResult = self.classify_month(parsed.month)
                self._set_one_hot('peakMonth', 'peakMonth__{}'.format(monthResult))
        return self.testCase

    def classify_month(self, x):
        """Map month number ``x`` to 'Peak' (5-10), 'Middle' (3, 4, 11) or 'Slack'."""
        if x in [5, 6, 7, 8, 9, 10]:
            return 'Peak'
        elif x in [3, 4, 11]:
            return 'Middle'
        else:
            return 'Slack'
        

In [27]:
# Build the pricing helper from the unpickled model and the loaded frame;
# the constructor runs dataProcessing on df_clean and prepares the template row.
a = createModel(rf, df_clean)

In [28]:
# Write the non-empty input_dict fields into a.testCase; the returned
# single-row frame is displayed as the cell output.
a.inputProcessing(input_dict) # get input to the testCase

Unnamed: 0,price_daily,accommodates,bathrooms,bedrooms,beds,security_deposit,cleaning_fee,guests_included,extra_people,host_length,...,propertyType__Villa,roomType__Hotel room,roomType__Private room,roomType__Shared room,bedType__Couch,bedType__Futon,bedType__Pull-out Sofa,bedType__Real Bed,peakMonth__Peak,peakMonth__Slack
0,125.0,2.0,1.0,1.0,1,0.0,75.0,2,0.0,3638,...,0,0,0,0,0,0,0,1,0,0


In [29]:
# Pick a price range from listings similar to a.testCase and scan it for the
# price with the highest price * probability (prob_lower defaults to 0).
a.whole_process() # Run the Optimization

{'Suggested Price': 125.0,
 'Probability of Getting Booked': 0.2,
 'Expected Return': 25.0}

In [30]:
# Replace a.testCase with the first row of a.dfAll again, discarding the
# values written by inputProcessing and get_prob.
a.testCaseInit() # Reset the testCase

Unnamed: 0,price_daily,accommodates,bathrooms,bedrooms,beds,security_deposit,cleaning_fee,guests_included,extra_people,host_length,...,propertyType__Villa,roomType__Hotel room,roomType__Private room,roomType__Shared room,bedType__Couch,bedType__Futon,bedType__Pull-out Sofa,bedType__Real Bed,peakMonth__Peak,peakMonth__Slack
0,125.0,2,1.0,1.0,1,0.0,75.0,2,0.0,3638,...,0,0,0,0,0,0,0,0,0,0
