In [462]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
pd.options.display.max_rows = 999

In [463]:
df = pd.read_csv('listings_clean.csv', dtype={'zip': 'str'})

## Zipcodes

In [464]:
def clean_zips(df):
    """A helper function to clean zip code columns for Los Angeles Airbnb Data"""
    
    def first_ele(df):
        for i in df:
            return i
    
    def first_five(df):
        return df[:5]
    
    # Strips entries containing 'CA'
    df = df.str.strip('CA')
    
    # Splits data for entries containing period
    df = df.str.split('.')
    
    # Takes the split data and returns the data before the period
    df = df.map(first_ele)
    
    # Returns only the first five characters
    df = df.map(first_five)
    
    return df
    
    

In [465]:
df['zipcode'] = clean_zips(df['zipcode'])

In [466]:
df.head()

Unnamed: 0,id,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,total_price,price_log
0,20862235,90402,Apartment,Entire home/apt,2,1.0,0.0,2.0,Real Bed,"{TV,Wifi,Kitchen,""Pets allowed"",""Free street p...",225.0,5.4161
1,26165453,90291,Cottage,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Free par...",270.0,5.598422
2,23371066,90403,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Wifi,Kitchen,""Free street parking"",""Indoor...",120.0,4.787492
3,29922373,90272,Villa,Entire home/apt,8,6.0,5.0,5.0,Real Bed,"{TV,Wifi,""Air conditioning"",Pool,Kitchen,""Free...",5000.0,8.517193
4,20541717,90401,Apartment,Entire home/apt,2,1.0,1.0,0.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",185.0,5.220356


## Property Types

Apartment Category
- Serviced Apartment

House Category
- Bungalow

Condominium <br>
Guesthouse <br>
Guest suite <br>
Townhouse <br>
Villa <br>
Loft

Other Category
- Train
- Cave
- Minsu (Taiwan)
- Island
- Treehouse
- Tipi
- Barn
- Dome house
- Dorm
- Other
- Casa particular (Cuba)
- Houseboat
- Bus
- Hut
- Resort
- Campsite
- Chalet
- Castle
- Earth house
- Yurt
- Tent
- Boat
- Aparthotel
- Farm stay
- Hotel
- Tiny house
- Cabin
- Boutique hotel
- Campter/RV
- Bed and breakfast
- Cottage
- 



In [305]:
df['property_type'].value_counts()

Apartment                 16101
House                     14667
Condominium                2483
Guesthouse                 2276
Guest suite                1408
Townhouse                  1366
Bungalow                   1223
Loft                        985
Villa                       903
Serviced apartment          366
Hostel                      319
Cottage                     190
Bed and breakfast           189
Camper/RV                   176
Other                       139
Boutique hotel              131
Cabin                        80
Tiny house                   68
Hotel                        44
Farm stay                    35
Aparthotel                   35
Boat                         30
Tent                         25
Yurt                         17
Earth house                  16
Castle                       15
Treehouse                    14
Chalet                       12
Tipi                         10
Barn                          6
Campsite                      6
Dorm    

## Transforming Data 

In [467]:
df.columns

Index(['id', 'zipcode', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'total_price',
       'price_log'],
      dtype='object')

In [307]:
# One-Hot-Encoding for Categorical Variables

zip_dummies = pd.get_dummies(df['zipcode'])
property_dummies = pd.get_dummies(df['property_type'])
room_dummies = pd.get_dummies(df['room_type'])
bed_dummies = pd.get_dummies(df['bed_type'])

df = df.drop(['zipcode', 'property_type', 'room_type', 'bed_type'], axis=1)
df = pd.concat((df, zip_dummies, property_dummies, room_dummies, bed_dummies), axis=1)

In [308]:
# Scaling for numerical variables
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

test_df = df[['accommodates', 'bathrooms', 'bedrooms', 'beds']]
df = df.drop(['accommodates', 'bathrooms', 'bedrooms', 'beds'], axis=1)

In [309]:
scaler.fit(test_df)
test_df = scaler.transform(test_df)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [311]:
test_df = pd.DataFrame(test_df, columns = ['accommodates', 'bathrooms', 'bedrooms', 'beds'])
df = pd.concat((df, test_df), axis=1)

In [314]:
df.head()

Unnamed: 0,id,amenities,total_price,price_log,9176,0,10019,10023,139 S,37738,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,accommodates,bathrooms,bedrooms,beds
0,20862235,"{TV,Wifi,Kitchen,""Pets allowed"",""Free street p...",225.0,5.4161,0,0,0,0,0,0,...,0,0,0,0,0,1,-0.631105,-0.475643,-1.301959,0.026511
1,26165453,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Free par...",270.0,5.598422,0,0,0,0,0,0,...,0,0,0,0,0,1,-0.631105,-0.475643,-0.382544,-0.584999
2,23371066,"{TV,Wifi,Kitchen,""Free street parking"",""Indoor...",120.0,4.787492,0,0,0,0,0,0,...,0,0,0,0,0,1,-0.631105,-0.475643,-0.382544,-0.584999
3,29922373,"{TV,Wifi,""Air conditioning"",Pool,Kitchen,""Free...",5000.0,8.517193,0,0,0,0,0,0,...,0,0,0,0,0,1,1.650944,4.883494,3.295118,1.86104
4,20541717,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",185.0,5.220356,0,0,0,0,0,0,...,0,0,0,0,0,1,-0.631105,-0.475643,-0.382544,-1.196509


In [315]:
df.to_csv('../model_ready.csv', index=False)

In [326]:
df.head()

Unnamed: 0,id,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,total_price,price_log
0,20862235,90402,Apartment,Entire home/apt,2,1.0,0.0,2.0,Real Bed,"{TV,Wifi,Kitchen,""Pets allowed"",""Free street p...",225.0,5.4161
1,26165453,90291,Cottage,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Free par...",270.0,5.598422
2,23371066,90403,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Wifi,Kitchen,""Free street parking"",""Indoor...",120.0,4.787492
3,29922373,90272,Villa,Entire home/apt,8,6.0,5.0,5.0,Real Bed,"{TV,Wifi,""Air conditioning"",Pool,Kitchen,""Free...",5000.0,8.517193
4,20541717,90401,Apartment,Entire home/apt,2,1.0,1.0,0.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",185.0,5.220356


## Pipeline Test

In [330]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.linear_model import RidgeCV

In [468]:
df = df.drop(['amenities', 'id'], axis=1)

In [469]:
df.to_csv('../flask_ready.csv', index=False)

## Start Here after inputting "flask_ready.csv"

In [384]:
# Split data into X and y

X = df.drop(['total_price', 'price_log'], axis=1)
y_log = df['price_log']
y = df['total_price']

In [385]:
num_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds']
cat_cols = ['zipcode', 'property_type', 'room_type','bed_type']

In [386]:
df_num = df[num_cols]
df_cat = df[cat_cols]

In [387]:
cat_preprocessor = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True) 
)

num_preprocessor = make_pipeline(
    StandardScaler()
)

In [388]:
cat_transformed = cat_preprocessor.fit_transform(df_cat)

In [389]:
num_transformed = num_preprocessor.fit_transform(df_num)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [390]:
num_transformed = pd.DataFrame(num_transformed, columns=['accommodates', 'bathrooms', 'bedrooms', 'beds'])

In [392]:
cat_transformed.shape, num_transformed.shape

((43365, 349), (43365, 4))

In [393]:
df = pd.concat((num_transformed, cat_transformed), axis=1)

In [394]:
df.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,zipcode_90402,zipcode_90291,zipcode_90403,zipcode_90272,zipcode_90401,zipcode_90025,...,property_type_Hut,property_type_Minsu (Taiwan),room_type_Entire home/apt,room_type_Private room,room_type_Shared room,bed_type_Real Bed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Airbed
0,-0.631105,-0.475643,-1.301959,0.026511,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,-0.631105,-0.475643,-0.382544,-0.584999,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,-0.631105,-0.475643,-0.382544,-0.584999,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,1.650944,4.883494,3.295118,1.86104,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
4,-0.631105,-0.475643,-0.382544,-1.196509,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [417]:
cols = df.columns

In [None]:
# Split data into X and y

X = df.drop(['total_price', 'price_log'], axis=1)
y_log = df['price_log']
y = df['total_price']

In [396]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, y_log, test_size=0.35)

In [397]:
reg_params = 10.**np.linspace(-10, 5, 10)
model = RidgeCV(alphas=reg_params, fit_intercept=True, cv=5)
model.fit(X_train, y_train)

RidgeCV(alphas=array([1.00000e-10, 4.64159e-09, 2.15443e-07, 1.00000e-05, 4.64159e-04,
       2.15443e-02, 1.00000e+00, 4.64159e+01, 2.15443e+03, 1.00000e+05]),
    cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

## Data Transform Function

In [482]:
def predict(data):
    
    # Create dataframe from JSON dict
    data = pd.DataFrame.from_dict([data])
    
    num_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds']
    cat_cols = ['zipcode', 'property_type', 'room_type','bed_type']
    
    # Seperate into Numeric and Categorical columns
    df_num = data[num_cols]
    df_cat = data[cat_cols]
     
    # Use train data preprocessor
    cat_transformed = cat_preprocessor.transform(df_cat)
    
    # Use train data preprocessor
    num_transformed = num_preprocessor.transform(df_num)
    num_transformed = pd.DataFrame(num_transformed, columns=num_cols)
    
    # Concatenate numeric and categorical dataframes
    df_transformed = pd.concat((num_transformed, cat_transformed), axis=1)
    # Create blank dataframe using columns from transformed train data
    df_blank = pd.DataFrame(columns=cols)
    
    # Concatenate  
    df_model = pd.concat((df_blank, df_transformed))
    df_model = df_model.replace(np.nan, 0)
    
    y_pred = model.predict(df_model)
    prediction = math.exp(y_pred[0])
      
    return prediction

## Test Data Entry

In [521]:
test_data = {"zipcode" : "90402",
             "property_type": "Apartment",
             "room_type" : "Entire home/apt",
             "accommodates": 5,
             "bathrooms": 4.0,
             "bedrooms": 4.0,
             "beds": 4.0,
             "bed_type": "Real Bed"}

In [522]:
predict(test_data)

  Xt = transform.transform(Xt)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




592.6608919481653