In [1]:
import pandas as pd

# Read column names

In [2]:
def init_column_names():
    """Load the expected feature names for each column group.

    Each of the three CSV files under ./data is read and its header is
    used as the list of names; the first header entry is skipped
    (presumably a saved index column -- TODO confirm).

    Returns
    -------
    tuple
        (numeric, categorical, amenities) column-name indexes, in that order.
    """
    def _names_after_first(csv_path):
        # Only the header matters here; everything past position 0 is kept.
        return pd.read_csv(csv_path).columns[1:]

    numeric = _names_after_first('./data/numeric_columns.csv')
    categorical = _names_after_first('./data/categorical_columns.csv')
    amenities = _names_after_first('./data/amenities_columns.csv')

    return (numeric, categorical, amenities)

# Module-level column-name indexes (numeric, categorical, amenities);
# reformulate_input() iterates over these to build the feature frame.
NCOLS, CCOLS, ACOLS = init_column_names()

In [3]:
def init_col_stats():
    """Load the per-column statistics tables for the three column groups.

    Each CSV under ./data is read with its first column as the row index.

    Returns
    -------
    tuple
        (numeric, categorical, amenities) stats DataFrames, in that order.
    """
    stat_paths = (
        './data/num_stats.csv',
        './data/cat_stats.csv',
        './data/amen_stats.csv',
    )
    numeric, categorical, amenities = [
        pd.read_csv(path, index_col=0) for path in stat_paths
    ]
    return numeric, categorical, amenities

# Module-level stats tables: the '50%' row supplies defaults for columns
# missing from the input, and NSTATS 'min'/'max' drive numeric scaling.
NSTATS, CSTATS, ASTATS = init_col_stats()

In [4]:
# Display the numeric stats table (count/mean/std/min/quartiles/max per column).
NSTATS

Unnamed: 0,bathrooms,bedrooms,beds,cleaning_fee,guests_included,host_listings_count,host_acceptance_rate,host_response_rate,host_has_profile_pic,host_identity_verified,days_delta,reviews_per_month
count,127302.0,127302.0,127302.0,127302.0,127302.0,127302.0,127302.0,127302.0,127302.0,127302.0,127302.0,127302.0
mean,1.266394,1.326751,1.739439,76.918689,1.693713,4.775494,0.965099,0.961348,0.999057,0.999057,1221.425351,1.805586
std,0.621175,0.869686,1.230982,51.607532,1.318152,17.244524,0.12409,0.11664,0.030688,0.030688,664.665319,2.120879
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
25%,1.0,1.0,1.0,50.0,1.0,1.0,1.0,1.0,1.0,1.0,699.0,0.41
50%,1.0,1.0,1.0,70.0,1.0,1.0,1.0,1.0,1.0,1.0,1193.0,1.11
75%,1.5,2.0,2.0,99.0,2.0,2.0,1.0,1.0,1.0,1.0,1698.0,2.68
max,15.0,15.0,16.0,3000.0,30.0,880.0,1.0,1.0,1.0,1.0,3653.0,223.0


# Read and transform sample input data

In [5]:
def get_col_value(colname, input_df, stats):
    """Return the data for ``colname``, falling back to its median.

    Parameters
    ----------
    colname : str
        Name of the column to look up.
    input_df : pandas.DataFrame
        Frame that may or may not contain ``colname``.
    stats : pandas.DataFrame
        Stats table with a '50%' row holding a value for ``colname``.

    Returns
    -------
    pandas.Series or scalar
        The column from ``input_df`` when it is present; otherwise the
        single '50%' entry from ``stats`` for that column.
    """
    if colname not in input_df.columns:
        # Column was not supplied: use the median recorded in the stats table.
        return stats.loc['50%', colname]
    return input_df[colname]

def reformulate_input(inp):
    """Build the model's feature frame from a raw input frame.

    For every expected numeric, categorical and amenity column, the value
    is taken from ``inp`` when present and otherwise filled with the '50%'
    entry of the matching stats table (see ``get_col_value``). The numeric
    group is then min-max scaled via ``normalize_numeric``.

    Parameters
    ----------
    inp : pandas.DataFrame
        Raw input rows; may contain any subset of the expected columns.

    Returns
    -------
    pandas.DataFrame
        The numeric, categorical and amenity groups concatenated
        column-wise, in that order, on the index of ``inp``.
    """
    def _fill(columns, stats):
        # Seed the frame with inp's index. Without it, a scalar default
        # assigned before any Series lands in a zero-row frame and is lost
        # (it becomes NaN once a later Series sets the index, or the whole
        # group stays empty when no column of the group is in the input).
        frame = pd.DataFrame(index=inp.index, columns=columns)
        for c in columns:
            frame[c] = get_col_value(c, inp, stats)
        return frame

    num = normalize_numeric(_fill(NCOLS, NSTATS))
    cat = _fill(CCOLS, CSTATS)
    amen = _fill(ACOLS, ASTATS)

    return pd.concat([num, cat, amen], axis=1)

def normalize_numeric(df):
    """Min-max scale each column of ``df`` with the bounds stored in NSTATS.

    Every column ``c`` becomes ``(df[c] - min_c) / (max_c - min_c)``, where
    the bounds are the 'min' and 'max' rows of the module-level NSTATS table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all appear in NSTATS.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns holding the scaled values.
    """
    scaled = pd.DataFrame(columns=df.columns)

    for name in df.columns:
        low, high = NSTATS.loc['min', name], NSTATS.loc['max', name]
        span = high - low
        # The 1. factor keeps the arithmetic in floating point.
        scaled[name] = (df[name] - 1. * low) / span
    return scaled



In [6]:
# Load the sample input with every column parsed as float, then expand it
# to the full feature layout (missing columns filled, numerics scaled).
si = pd.read_csv('./data/sample_input.csv', encoding='utf8', dtype=float)
rsi = reformulate_input(si)

In [7]:
# Show the reformulated feature frame that is passed to the model below.
rsi

Unnamed: 0,bathrooms,bedrooms,beds,cleaning_fee,guests_included,host_listings_count,host_acceptance_rate,host_response_rate,host_has_profile_pic,host_identity_verified,...,amenities_smartlock,amenities_smokedetector,amenities_smokingallowed,amenities_suitableforevents,amenities_translationmissingenhostingamenity,amenities_tv,amenities_washer,amenities_wheelchairaccessible,amenities_wifi,amenities_wirelessinternet
0,0.1,0.133333,0.125,0.066667,0.033333,0.001136,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Load the pickled model

In [8]:
import vecstack
import xgboost as xgb
import dill as pickle
from sklearn.metrics import mean_squared_error, r2_score

# Display the installed xgboost version.
# NOTE(review): presumably the pickled model below needs a matching version -- TODO confirm.
xgb.__version__





'0.72.1'

In [9]:
# NOTE(review): unpickling (dill is imported here as `pickle`) can execute
# arbitrary code -- only load these files from a trusted source.
# `ts` is used below for .transform() and `tc` for .predict(); presumably the
# fitted vecstack stacking transformer and the final estimator -- TODO confirm.
with open('./data/comb_vecstack_stack.pkl', 'rb') as f:
    ts = pickle.load(f)

with open('./data/comb_vecstack_clf.pkl', 'rb') as f:
    tc = pickle.load(f)

In [13]:
# Transform the features with the stacking step, then predict with the final
# model. The parenthesised call prints identically under Python 2 and is also
# valid Python 3, where the bare `print x` statement is a SyntaxError.
print(tc.predict(ts.transform(rsi)))

Transforming...

estimator  0: [en: ElasticNet]
    model from fold  0: done
    model from fold  1: done
    ----
    DONE

estimator  1: [lass: Lasso]
    model from fold  0: done
    model from fold  1: done
    ----
    DONE

estimator  2: [ridge: Ridge]
    model from fold  0: done
    model from fold  1: done
    ----
    DONE

estimator  3: [gb: GradientBoostingRegressor]
    model from fold  0: done
    model from fold  1: done
    ----
    DONE

estimator  4: [ab: AdaBoostRegressor]
    model from fold  0: done
    model from fold  1: done
    ----
    DONE

estimator  5: [randf: RandomForestRegressor]
    model from fold  0: done
    model from fold  1: done
    ----
    DONE

[92.651825]
