In [117]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [118]:
df = pd.read_json('./typedData/compsTypedInteraction.json')

In [119]:
df.drop(['lux_per_dollar', 'listing_id'], axis=1, inplace=True)

In [120]:
data = df.drop('interestVal', axis=1).select_dtypes(exclude=['object'])

In [121]:
equation = ('+').join(data.columns)

In [122]:
model = smf.ols('interestVal~'+equation, data=df).fit()

In [123]:
sig_features = pd.DataFrame(model.pvalues, index=data.columns, columns={'P_Value'})

In [124]:
sigCols = sig_features[sig_features['P_Value']<.4].index.values
len(sigCols)

18

In [125]:
sigCols = np.append(sigCols, 'interest_level')

In [126]:
ideal = ['Hardwood_Floors', 'High_Speed_Internet', 'Laundry_in_Unit', 'No_Fee', 'avg_num_features',
            'avg_num_luxury', 'avg_outdoor_score', 'avg_price_per_feature', 'avg_price_per_num_lux', 'bathrooms', 
            'bedrooms', 'condominium', 'interest_level', 'laundry_in_building', 'loft', 
            'lux_ratio', 'lux_score', 'numPhotos', 'ph', 'pets_allowed', 'price_feature_ratio', 'townhome',
             'walk_up', 'prob_interest_manager', 'prob_interest_building', 'rooms', 'price_per_room', 'prob_buildManager'
            ]

In [127]:
diff = [x for x in sigCols if x not in ideal]

In [128]:
diff

['Balcony',
 'Common_Outdoor_Space',
 'Dishwasher',
 'Elevator',
 'Garden_Patio',
 'apartment',
 'other',
 'price',
 'price_lux_ratio',
 'price_per_num_lux']

In [129]:
simple = df[sigCols]

In [130]:
from sklearn.linear_model import LogisticRegressionCV as logCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [131]:
X_train, X_test, y_train, y_test = train_test_split(simple.drop('interest_level',axis=1),
                                                    simple['interest_level'], test_size=0.33, random_state=42)

In [132]:
logReg = logCV(cv=10)
logReg.fit(X_train, y_train)
preds = logReg.predict(X_test)
print(classification_report(preds, y_test))

             precision    recall  f1-score   support

       high       0.81      0.80      0.81      1123
        low       0.98      0.92      0.95     10147
     medium       0.69      0.85      0.76      2563

avg / total       0.91      0.90      0.90     13833

