In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from scipy import stats
plt.rc("figure", figsize=(16,8))
plt.rc("font", size=14)

In [2]:
df=pd.read_csv('train.csv') # load the labelled data; it is split into training/validation subsets further down

In [3]:
# feature engineering

def predictors_ordinal_encoding(df):

    ''' Ordinal-encode every non-numeric column and keep only numeric columns.

    Every column whose dtype is not in ``numeric_types`` is run through a
    ``LabelEncoder`` and stored as ``<column>_ord``; the non-numeric originals
    are then dropped. This is ordinal (integer label) encoding, not one-hot
    encoding: each category becomes a single integer code.

    Parameters
    ----------
    df : pandas DataFrame. It is not modified; the encoding is applied to a copy.

    Returns
    -------
    pandas DataFrame holding the numeric columns of ``df`` plus one appended
    ``*_ord`` column per non-numeric column, in the input's column order.
    '''

    numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']
    numeric_columns = set(df.select_dtypes(include=numeric_types).columns)

    # walk df.columns (rather than a set difference) so the order of the new
    # columns is the same on every run
    categoric_columns = [column for column in df.columns if column not in numeric_columns]

    # work on a copy so the caller's frame does not silently gain columns
    encoded = df.copy()
    for column in categoric_columns:
        encoded[column + '_ord'] = LabelEncoder().fit_transform(encoded[column])

    # keep only numeric columns, which now include the freshly added *_ord ones
    numeric_columns = encoded.select_dtypes(include=numeric_types).columns.to_list()
    return encoded[numeric_columns]

# overwrite the raw frame with its numeric-only, ordinal-encoded version
df = predictors_ordinal_encoding(df)

In [4]:
def training_validation_subset(df, frac=0.7, random_state=None):
    ''' Split ``df`` row-wise into a random training subset and its complement.

    Rows are chosen by position, so the split stays exact even when ``df``
    contains fully duplicated rows or repeated index labels (a
    concat + drop_duplicates anti-join would discard such rows from the
    validation set).

    Parameters
    ----------
    df : pandas DataFrame to split.
    frac : fraction of rows placed in the training subset (default 0.7).
    random_state : seed passed to pandas' ``sample``; the default ``None``
        gives a different split on every call, pass an int to make the
        split reproducible.

    Returns
    -------
    (training_df, validation_df) : the sampled rows (in sampled order) and the
        remaining rows (in their original order).
    '''

    n_rows = len(df)
    positions = pd.Series(np.arange(n_rows)).sample(frac=frac, random_state=random_state).to_numpy()

    in_training = np.zeros(n_rows, dtype=bool)
    in_training[positions] = True

    training_df = df.iloc[positions]
    print('Training dataset rows:\t', training_df.shape[0])

    # every row that was not sampled, selected by position rather than by value
    validation_df = df.iloc[~in_training]
    print('Validation dataset rows:\t', validation_df.shape[0])

    return training_df, validation_df

# random 70/30 row split; no seed is passed, so the rows in each subset change on every run
training_df, validation_df=training_validation_subset(df)

Training dataset rows:	 131823
Validation dataset rows:	 56495


In [5]:
# predictors: every training column except the row identifier and the target
X=training_df.drop(['id', 'loss'], axis=1)
# target: the 'loss' column of the training subset
y=training_df['loss']

def feature_selection(training_df, X, y, n_cols):
    ''' Rank features with a Lasso fit and return the ``n_cols`` strongest ones.

    ``X`` is NaN-filled with 0 and standardised, a default ``Lasso`` is fitted
    through ``SelectFromModel``, and features are ranked by the absolute value
    of their coefficient. Features whose coefficient was shrunk to exactly
    zero are printed and are never returned.

    Previously the function computed the top ``n_cols`` features but returned
    the eliminated ones instead, so callers ended up modelling on the features
    Lasso had rejected.

    Parameters
    ----------
    training_df : unused; kept so existing calls keep working.
    X : pandas DataFrame of predictors.
    y : target values aligned with ``X``.
    n_cols : maximum number of feature names to return.

    Returns
    -------
    list of column names, strongest first; shorter than ``n_cols`` when fewer
    features have a non-zero coefficient.
    '''

    # standardise once so coefficient magnitudes are comparable across features
    X_scaled = StandardScaler().fit_transform(X.fillna(0))

    sel_ = SelectFromModel(Lasso())
    sel_.fit(X_scaled, y)

    coef = pd.Series(np.abs(np.ravel(sel_.estimator_.coef_)), index=X.columns)

    removed_feats = coef[coef == 0].index.to_list()
    print(removed_feats)

    kept = coef[coef > 0].sort_values(ascending=False)
    return kept.index[:n_cols].to_list()

In [6]:
# NOTE(review): the trailing list looks like a result recorded from an earlier run - confirm it still matches what this call returns
cols=feature_selection(training_df, X, y, 5) #['cat80_ord', 'cat79_ord', 'cat57_ord', 'cont7', 'cont2']

['cont6', 'cont13', 'cat89_ord', 'cat9_ord', 'cat90_ord', 'cat40_ord', 'cat111_ord', 'cat14_ord']


In [None]:
# this runs but seems to time out

# # iterate through models to get best fitting on valid

# predictors=[]
# mae=[]

# for i in list(training_df.drop(['id', 'loss'], axis=1).columns):
    
#     predictors.append(i)
#     X=training_df[predictors]
#     y=training_df['loss']
    
#     try:
#         glm_gamma = sm.GLM(endog=y, exog=X, family=sm.families.Gamma(sm.families.links.log()))
#         glm_results = glm_gamma.fit()
#         y_pred=glm_results.predict(validation_df[predictors])
#         y_valid=validation_df['loss']
#         result=mean_absolute_error(y_pred, y_valid)
#         print(predictors, result)
#         mae.append(result)
        
#     except ValueError:
#         print("skipped", i)
#         predictors.remove(i)
        
# glm_performance=pd.DataFrame({'Predictors':predictors,'MAE':mae})
# glm_performance.to_csv('glm_perf.csv', index=False)

['cont1'] 2698151209.084389
['cont1', 'cont2'] 950909.4688806424
['cont1', 'cont2', 'cont3'] 50208.65570966887
['cont1', 'cont2', 'cont3', 'cont4'] 34244.71840284773
['cont1', 'cont2', 'cont3', 'cont4', 'cont5'] 24643.12427185591
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6'] 20049.637220362576
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7'] 16841.874402871963
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8'] 16262.327245686967
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9'] 15232.809028240821
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10'] 14986.103721426014


  return np.exp(z)
  endog_mu = self._clean(endog / mu)
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  - self._offset_exposure)


skipped cont11


  endog_mu = self._clean(endog / mu)
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  return 2 * resid_dev
  resid = np.power(self.endog - mu, 2) * self.iweights
  return np.power(np.fabs(mu), self.power)
  return np.sum(resid / self.family.variance(mu)) / self.df_resid


skipped cont12
skipped cont13
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14'] 11436.579787729526
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord'] 11038.040233282753
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord'] 11030.077397348075
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord'] 11787.43581445171


  return np.exp(z)
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid = np.power(self.endog - mu, 2) * self.iweights
  return np.power(np.fabs(mu), self.power)
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  - self._offset_exposure)


skipped cat107_ord
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord'] 11763.194521310645
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord'] 11787.49251280858
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord'] 15584.673070301249
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord'] 15580.428622485997
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord'] 15335.930547

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord'] 8202.018844710696
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord'] 7449.18024

  return np.exp(z)
  endog_mu = self._clean(endog / mu)
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  - self._offset_exposure)


skipped cat115_ord
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord'] 3749.9944912620326
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat5

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord'] 201251.01988774003


  return np.exp(z)
  endog_mu = self._clean(endog / mu)
  endog_mu = self._clean(endog / mu)
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  return 2 * resid_dev
  resid = np.power(self.endog - mu, 2) * self.iweights
  return np.power(np.fabs(mu), self.power)
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  - self._offset_exposure)


skipped cat106_ord
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord'] 173972.93926467808
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord', 'cat99_ord', 'cat79_ord', 'cat58_ord', 'cat114_ord', 'cat112_ord', 'cat70_ord', 'cat67_ord', 'cat40_ord', 'cat30_ord', 'cat36_ord'] 97081.2065659474
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont

  return np.exp(z)
  endog_mu = self._clean(endog / mu)
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  - self._offset_exposure)


skipped cat77_ord
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord', 'cat99_ord', 'cat79_ord', 'cat58_ord', 'cat114_ord', 'cat112_ord', 'cat70_ord', 'cat67_ord', 'cat40_ord', 'cat30_ord', 'cat36_ord', 'cat12_ord', 'cat75_ord', 'cat88_ord'] 85509.04325980027
['cont1', 'cont2', 'cont3', '

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord', 'cat99_ord', 'cat79_ord', 'cat58_ord', 'cat114_ord', 'cat112_ord', 'cat70_ord', 'cat67_ord', 'cat40_ord', 'cat30_ord', 'cat36_ord', 'cat12_ord', 'cat75_ord', 'cat88_ord', 'cat92_ord', 'cat95_ord', 'cat85_ord', 'cat64_ord', 'cat47_ord', 

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord', 'cat99_ord', 'cat79_ord', 'cat58_ord', 'cat114_ord', 'cat112_ord', 'cat70_ord', 'cat67_ord', 'cat40_ord', 'cat30_ord', 'cat36_ord', 'cat12_ord', 'cat75_ord', 'cat88_ord', 'cat92_ord', 'cat95_ord', 'cat85_ord', 'cat64_ord', 'cat47_ord', 

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord', 'cat99_ord', 'cat79_ord', 'cat58_ord', 'cat114_ord', 'cat112_ord', 'cat70_ord', 'cat67_ord', 'cat40_ord', 'cat30_ord', 'cat36_ord', 'cat12_ord', 'cat75_ord', 'cat88_ord', 'cat92_ord', 'cat95_ord', 'cat85_ord', 'cat64_ord', 'cat47_ord', 

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord', 'cat99_ord', 'cat79_ord', 'cat58_ord', 'cat114_ord', 'cat112_ord', 'cat70_ord', 'cat67_ord', 'cat40_ord', 'cat30_ord', 'cat36_ord', 'cat12_ord', 'cat75_ord', 'cat88_ord', 'cat92_ord', 'cat95_ord', 'cat85_ord', 'cat64_ord', 'cat47_ord', 

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont14', 'cat5_ord', 'cat59_ord', 'cat87_ord', 'cat28_ord', 'cat45_ord', 'cat76_ord', 'cat39_ord', 'cat6_ord', 'cat102_ord', 'cat104_ord', 'cat7_ord', 'cat52_ord', 'cat11_ord', 'cat56_ord', 'cat89_ord', 'cat101_ord', 'cat22_ord', 'cat37_ord', 'cat71_ord', 'cat41_ord', 'cat74_ord', 'cat57_ord', 'cat1_ord', 'cat54_ord', 'cat42_ord', 'cat82_ord', 'cat73_ord', 'cat108_ord', 'cat83_ord', 'cat109_ord', 'cat13_ord', 'cat63_ord', 'cat21_ord', 'cat2_ord', 'cat18_ord', 'cat93_ord', 'cat9_ord', 'cat25_ord', 'cat66_ord', 'cat60_ord', 'cat43_ord', 'cat50_ord', 'cat105_ord', 'cat3_ord', 'cat78_ord', 'cat110_ord', 'cat90_ord', 'cat53_ord', 'cat69_ord', 'cat23_ord', 'cat4_ord', 'cat99_ord', 'cat79_ord', 'cat58_ord', 'cat114_ord', 'cat112_ord', 'cat70_ord', 'cat67_ord', 'cat40_ord', 'cat30_ord', 'cat36_ord', 'cat12_ord', 'cat75_ord', 'cat88_ord', 'cat92_ord', 'cat95_ord', 'cat85_ord', 'cat64_ord', 'cat47_ord', 

In [None]:
# NOTE(review): looks like a leftover scratch/debug cell - confirm it can be deleted
print('hi')

In [None]:
# Forward selection by p-value:
# 1. Choose a significance level (e.g. SL = 0.05 for 95% confidence).
# 2. Fit every simple regression model, one feature at a time (n models for n features). Select the feature with the lowest p-value.
# 3. Fit every model that adds one extra feature to the previously selected feature(s).
# 4. Again select the feature with the minimum p-value. If p_value < significance level, go back to step 3; otherwise terminate the process.

def pvalue_loop(df, y, full_cols, previous_loops):
    """Run one forward-selection step and return the best candidate by p-value.

    For every candidate in ``full_cols`` a Gamma GLM with log link is fitted on
    the already selected predictors plus that candidate, and the candidate's
    own p-value is recorded. The candidate with the smallest p-value wins;
    ties go to the one listed first.

    Parameters
    ----------
    df : pandas DataFrame holding the response and all predictor columns.
    y : name of the response column in ``df``.
    full_cols : candidate predictor names to try in this step.
    previous_loops : predictor names selected in earlier steps. The list is
        read only; it is not modified.

    Returns
    -------
    (best_col, best_p) : name of the winning candidate and its p-value.

    Raises
    ------
    ValueError : if there is no candidate left to evaluate.
    """

    predictors=[]
    p_value=[]

    for predictor in full_cols:

        # a predictor that is already in the model cannot be added a second time
        if predictor in previous_loops:
            continue

        # build the trial column list without touching the caller's list, so a
        # failed fit cannot leave a stray candidate behind in previous_loops
        trial_cols = list(previous_loops) + [predictor]
        print(trial_cols)

        glm_gamma = sm.GLM(endog=df[y], exog=df[trial_cols], family=sm.families.Gamma(sm.families.links.log()))
        glm_results = glm_gamma.fit()

        predictors.append(predictor)
        # judge the candidate on its own coefficient, not on the smallest
        # p-value in the model (which may belong to an earlier predictor)
        p_value.append(glm_results.pvalues[predictor])

    if len(predictors) == 0:
        raise ValueError('pvalue_loop needs at least one candidate that is not already selected')

    ranking = pd.DataFrame({'Predictors':predictors,'pvalue':p_value}).sort_values(by='pvalue', kind='stable')
    best_col = ranking['Predictors'].iloc[0]
    best_p = ranking['pvalue'].iloc[0]
    return best_col, best_p

In [None]:
df=training_df
y='loss'
full_cols=['cont1', 'cont2', 'cont3']
previous_loops=[]

# step 1: pick the best single predictor
previous_loops.append(pvalue_loop(df, y, full_cols, previous_loops)[0])

# step 2: only predictors not selected yet remain candidates
# (this used to reference `predictors_loop`, which is never defined, and was computed before step 1)
loop_cols = [col for col in full_cols if col not in previous_loops]

pvalue_loop(df, y, loop_cols, previous_loops)[0]

In [None]:
# loop_cols=['cont1', 'cont2', 'cont3']
# # full_cols=(df.drop(['id', 'loss'], axis=1).columns)
# X = training_df[cols]

In [None]:
# predictors_loop=[]
# predictors_loop.append(pvalue_loop(training_df, 'loss', full_cols)[0])
# print(predictors_loop)

# loop_cols = list(set(full_cols) - set(predictors_loop))
# print(loop_cols)

# for i in loop_cols:
#     pvalue_loop(training_df, 'loss', loop_cols)
    
    
# print(predictors_loop)

In [None]:

# fit a Gamma GLM with log link on the columns currently held in `cols`
X=training_df[cols]
y=training_df['loss']

# no constant column is added to X, so the model is fitted without an intercept
glm_gamma = sm.GLM(endog=y, exog=X, family=sm.families.Gamma(sm.families.links.log()))
glm_results = glm_gamma.fit()
# cell output: one p-value per column of X
glm_results.pvalues

# validation-set MAE, currently disabled:
# y_pred=glm_results.predict(validation_df[predictors])
# y_valid=validation_df['loss']
# result=mean_absolute_error(y_pred, y_valid)
# print(result)

In [None]:
def forward_selected(data, response, criterion='deviance'):
    """GLM built by greedy forward selection.

    Starting from no predictors, each round fits one GLM per remaining
    candidate (already selected predictors + the candidate + an intercept) and
    keeps the candidate with the lowest ``criterion``. Selection stops as soon
    as the best candidate no longer lowers the criterion.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response;
           an 'id' column, if present, is ignored

    response: string, name of response column in data

    criterion: string, name of a lower-is-better attribute of the fitted
               statsmodels results used to score candidates. The default
               'deviance' matches the original code; deviance does not
               penalise model size, so a penalised criterion such as 'aic'
               is likely to stop earlier.

    Returns:
    --------
    model: fitted statsmodels GLM (statsmodels' default family, since no
           family is passed)
           with an intercept (column 'const')
           selected by forward selection
           evaluated by ``criterion``
    """
    data = data.drop(['id'], axis=1, errors='ignore')
    # keep column order so candidates are evaluated in a reproducible order
    remaining = [column for column in data.columns if column != response]
    selected = []
    endog = data[response]

    def _fit(columns):
        # Fit the response on `columns` plus an explicit intercept column.
        exog = data[columns].assign(const=1.0)
        return sm.GLM(endog=endog, exog=exog).fit()

    current_score = float('inf')
    while remaining:
        scored = []
        for candidate in remaining:
            score = getattr(_fit(selected + [candidate]), criterion)
            # a fit that did not produce a usable score cannot win the round
            if np.isfinite(score):
                scored.append((score, candidate))
        if not scored:
            break

        best_new_score, best_candidate = min(scored, key=lambda pair: pair[0])
        if not best_new_score < current_score:
            break  # no candidate improves the model any further

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    return _fit(selected)

In [None]:
# run forward selection on the training subset; the returned model is only displayed, not stored
forward_selected(training_df, 'loss')

In [None]:
cols=['cont1', 'cont2', 'cont3']
X=training_df.drop(['id'], axis=1)[cols]
y=training_df['loss']
# `response` was used below without ever being defined in this notebook
response='loss'
formula = "{} ~ {} + 1".format(response, ' + '.join(cols)) #selected + [candidate]

# show the formula that was just built (the cell used to end in `score`, which is not defined anywhere)
formula

#fmodel=forward_selected(X_train, 'loss')

In [None]:
X=training_df[cols]
y=training_df['loss']

# `weights` was never defined in this notebook and is not a GLM argument
# (statsmodels takes observation weights as `freq_weights` / `var_weights`),
# so the model is fitted unweighted
glm_gamma = sm.GLM(endog=y, exog=X, family=sm.families.Gamma(sm.families.links.log()))
glm_results = glm_gamma.fit()

In [None]:
# predict on the held-out validation subset and score with mean absolute error
# (mean_absolute_error is already imported in the first cell)
y_pred=glm_results.predict(validation_df[cols])
y_valid=validation_df['loss']

# scikit-learn's argument order is (y_true, y_pred); MAE is symmetric, so the value is unchanged
mean_absolute_error(y_valid, y_pred)

In [None]:
# thought process: how many feature subsets would an exhaustive search have to try?

from itertools import compress, product

def combinations(items):
    """Return a lazy generator over every subset of ``items``, each as a set.

    One subset is produced per 0/1 mask of length ``len(items)``, starting
    with the empty set (all zeros) and ending with the full set (all ones).
    """
    masks = product([0, 1], repeat=len(items))
    return (set(compress(items, mask)) for mask in masks)

# n items have 2**n subsets; building all of them in a list for 122 features
# would never finish, so compute the count directly
n_candidate_features = 122
n_feature_subsets = 2 ** n_candidate_features
n_feature_subsets

In [None]:
# summary table of whichever GLM fit `glm_results` currently holds
print(glm_results.summary())

In [None]:
# `glm` is not defined anywhere in the notebook; the fitted model object is `glm_gamma`
glm_gamma

In [None]:
# headline numbers of the fitted GLM
print(glm_results.params)        # fitted coefficients, one per exog column
print(glm_results.scale)         # estimated scale (dispersion) of the fit
print(glm_results.deviance)      # model deviance
print(glm_results.pearson_chi2)  # Pearson chi-squared statistic
print(glm_results.llf)           # log-likelihood of the fitted model

In [None]:
# names of the exog (design-matrix) columns the model was built with
glm_gamma.exog_names