# Training model for Regression

### This notebook demonstrates data exploratory analysis and buidling ensemble of regression models.

In [6]:
#import required modules
import numpy as np
import pandas as pd
import pandas_profiling as pp
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from sklearn.model_selection import train_test_split


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [7]:
# set local variables

base_path='E:/Wh/'

rawdata=pd.read_csv(base_path+'dataset_00_with_header.csv')



In [8]:
rawdata.head()

Unnamed: 0,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,...,x296,x297,x298,x299,x300,x301,x302,x303,x304,y
0,1540332,,,,8.0,1,0,1,0,0,...,0,,0,0,0,0,,0,,706
1,823066,4.0,3.0,3.0,4.0,0,2,2,0,0,...,5206,0.9339,1,1,1,0,,0,,558
2,1089795,,,,96.0,1,0,0,0,1,...,0,,0,0,0,0,,0,,577
3,1147758,63.0,14.0,38.0,258.0,0,0,0,1,2,...,0,,1,1,1,0,,0,,526
4,1229670,34.0,25.0,29.0,34.0,1,0,0,0,3,...,0,,0,0,0,0,,0,,496


In [9]:
## Just looking at first few rows of data following are my observations ##

#lot of missing values in dataset
#dependent y variable has continuous values, credit score probably? 


In [10]:
#first column has high values with very high variance
#inspecting its basic stats 

rawdata.x001.describe()

count    1.000000e+05
mean     1.218244e+06
std      2.728977e+05
min      5.170000e+02
25%      9.743635e+05
50%      1.235926e+06
75%      1.445326e+06
max      1.677197e+06
Name: x001, dtype: float64

In [11]:
#check if any variables have constant value

zero_stdev=[col for col in rawdata.columns.values.tolist() if rawdata[col].std()==0]

In [13]:
#there are 4 such columns
zero_stdev

['x067', 'x094', 'x095', 'x096']

In [14]:
rawdata[zero_stdev].describe()

Unnamed: 0,x067,x094,x095,x096
count,100000.0,100000.0,100000.0,100000.0
mean,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0


In [15]:
#we can remove these from analysis 
cols_to_drop=[]
cols_to_drop.append(zero_stdev)

In [16]:
# check if any of the columns have null/missing values more than 80% of the records 

null_cols=[col for col in rawdata.columns.values.tolist() if sum(rawdata[col].isna())/rawdata.shape[0] > 0.8]


In [17]:
null_cols

['x098', 'x242', 'x295', 'x304']

In [18]:
rawdata['x098'][rawdata['x098'].isna()]

0       NaN
1       NaN
4       NaN
5       NaN
6       NaN
         ..
99994   NaN
99995   NaN
99996   NaN
99997   NaN
99999   NaN
Name: x098, Length: 80681, dtype: float64

In [19]:
# we will exclude these columns for now 
cols_to_drop.append(null_cols)

In [20]:
# finally we will look at correlated variables 
# for this part I have referenced code from this blog post https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/


corr_matrix = rawdata.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
corrcols = [column for column in upper.columns if any(upper[column] > 0.95)]

In [21]:
cols_to_drop.append(corrcols)

In [22]:
import itertools

In [23]:
cols_to_drop_list=list(itertools.chain(*cols_to_drop))

In [24]:
len(cols_to_drop_list)

75

In [25]:
processed_df=rawdata.drop(cols_to_drop_list,axis=1)

In [26]:
processed_df.shape

(100000, 230)

In [27]:
#  lets split data into train,val,test set 
y_df=processed_df['y']
y_df.head()
x_df=processed_df.drop(['y'],axis=1)
x_df.head()

Unnamed: 0,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,...,x290,x291,x293,x294,x296,x297,x298,x301,x302,x303
0,1540332,,,,8.0,1,0,1,0,0,...,,0,,0,0,,0,0,,0
1,823066,4.0,3.0,3.0,4.0,0,2,2,0,0,...,3.5,5206,0.9339,0,5206,0.9339,1,0,,0
2,1089795,,,,96.0,1,0,0,0,1,...,,0,,0,0,,0,0,,0
3,1147758,63.0,14.0,38.0,258.0,0,0,0,1,2,...,63.0,57762,0.8231,57762,0,,1,0,,0
4,1229670,34.0,25.0,29.0,34.0,1,0,0,0,3,...,,0,,0,0,,0,0,,0


In [28]:
x_df.shape

(100000, 229)

In [29]:
x_df=x_df.fillna(x_df.mean())

In [30]:
x_df.head()

Unnamed: 0,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,...,x290,x291,x293,x294,x296,x297,x298,x301,x302,x303
0,1540332,125.711727,25.541238,65.393212,8.0,1,0,1,0,0,...,38.331656,0,0.812575,0,0,0.794034,0,0,2.948758,0
1,823066,4.0,3.0,3.0,4.0,0,2,2,0,0,...,3.5,5206,0.9339,0,5206,0.9339,1,0,2.948758,0
2,1089795,125.711727,25.541238,65.393212,96.0,1,0,0,0,1,...,38.331656,0,0.812575,0,0,0.794034,0,0,2.948758,0
3,1147758,63.0,14.0,38.0,258.0,0,0,0,1,2,...,63.0,57762,0.8231,57762,0,0.794034,1,0,2.948758,0
4,1229670,34.0,25.0,29.0,34.0,1,0,0,0,3,...,38.331656,0,0.812575,0,0,0.794034,0,0,2.948758,0


In [31]:
train_x,val_x,train_y,val_y=train_test_split(x_df,y_df,test_size=0.2,random_state=42)

In [32]:
val_x,test_x,val_y,test_y=train_test_split(val_x,val_y,test_size=0.5,random_state=42)

In [33]:
##scale the target variable with max value from train data ##

mxy=train_y.max()
train_y=train_y/mxy
val_y=val_y/mxy
test_y=test_y/mxy

In [34]:
featureNames=train_x.columns.values.tolist()

In [None]:
#now we will scale input variables using minmax scaler
from sklearn.preprocessing import MinMaxScaler
mnx=MinMaxScaler()
mnx.fit(train_x)
train_x=pd.DataFrame(mnx.fit_transform(train_x),columns=featureNames)
val_x=pd.DataFrame(mnx.transform(val_x),columns=featureNames)
test_x=pd.DataFrame(mnx.transform(test_x),columns=featureNames)
from sklearn.externals import joblib
joblib.dump(mnx,base_path+'scaler_model.pkl')

In [36]:
### Now we will start model building process ### 
# baseline model will be linear regression, since y is a continuous variable
# we will try few variations of linear regression to see performance improvement
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [37]:
reg=LinearRegression()
reg.fit(train_x,train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [38]:
reg.score(train_x,train_y)

0.8447864173788956

In [39]:
reg_y=reg.predict(val_x)
reg_test_y=reg.predict(test_x)

In [40]:
val_score=np.sqrt(mean_squared_error(reg_y,val_y))
test_score=np.sqrt(mean_squared_error(reg_test_y,test_y))


In [41]:
print("val set accuracy ",val_score," & test set accuracy ",test_score)

val set accuracy  0.05616981077167625  & test set accuracy  0.05542299847337063


In [42]:
# we have baseline results, rmse values are very high !
# lets see some variations of linear regression
# before that lets see accuracy on training data
train_score=np.sqrt(mean_squared_error(reg.predict(train_x),train_y))
print("train set accuracy : ",train_score)

train set accuracy :  0.05559125229173456


In [43]:
# train set accuracy much better, clearly indicating this might be case of overfitting
# variations of linear regression with regularization implemented might improve performance in this case
from sklearn import linear_model

In [44]:
reglasso=linear_model.Lasso(alpha=0.1)
reglasso.fit(train_x,train_y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [45]:
lasso_val_y=reglasso.predict(val_x)
lasso_test_y=reglasso.predict(test_x)
lasso_train_y=reglasso.predict(train_x)
lasso_acc_train=np.sqrt(mean_squared_error(lasso_train_y,train_y))
lasso_acc_val=np.sqrt(mean_squared_error(lasso_val_y,val_y))
lasso_acc_test=np.sqrt(mean_squared_error(lasso_test_y,test_y))
print("train accuracy : ",lasso_acc_train," val accuracy: ", lasso_acc_val," test accuracy: ",lasso_acc_test)

train accuracy :  0.14110473768475676  val accuracy:  0.14159912584883957  test accuracy:  0.1414968070061699


In [46]:
## significant improvement in accuracy with lasso regression ##
## lets try to see what is optimal alpha parameter value through cross validation ##
from sklearn.linear_model import LassoCV
epsilon = 1e-4

In [47]:
regl = LassoCV(cv=5, random_state=41).fit(train_x, train_y)

In [48]:

y_predicted = regl.predict(X=val_x)

rmse_lasso = np.sqrt(mean_squared_error(y_predicted,val_y))

In [49]:
rmse_lasso

0.05780875371546379

In [50]:
from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [51]:
regr = RandomForestRegressor(max_depth=5, random_state=41,
...                              n_estimators=100)

In [52]:
regr.fit(train_x,train_y)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=41, verbose=0, warm_start=False)

In [53]:
predicted_rf=regr.predict(val_x)
print("rf accuracy ",np.sqrt(mean_squared_error(predicted_rf,val_y)))

rf accuracy  0.056860535212525265


In [54]:
from sklearn import ensemble

In [55]:
# Fit regression model
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
reg_boost = ensemble.GradientBoostingRegressor(**params)

reg_boost.fit(train_x, train_y)
rmse = np.sqrt(mean_squared_error(val_y, reg_boost.predict(val_x)))
print("RMSE: %.4f" % rmse)

RMSE: 0.0392


In [56]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import cross_validation, metrics   #Additional sklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search




In [57]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': [3,4,5,6], 'alpha': 10}

In [58]:
parameters = {'objective':['reg:squarederror'],
              'learning_rate': [0.03, 0.05, 0.07,0.1], #so called `eta` value
              'max_depth': [3,4,5,6],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [100]}

In [59]:
xgb1=XGBRegressor()

In [60]:
xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        verbose=True)

In [61]:
xgb_grid.fit(train_x,train_y)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

GridSearchCV(cv=2, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'objective': ['reg:squarederror'], 'learning_rate': [0.03, 0.05, 0.07, 0.1], 'max_depth': [3, 4, 5, 6], 'min_child_weight': [4], 'silent': [1], 'subsample': [0.7], 'colsample_bytree': [0.7], 'n_estimators': [100]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=True)

In [62]:
best_xgb_param=xgb_grid.best_params_

In [63]:
final_xgb=XGBRegressor(parameters=best_xgb_param)

In [64]:
final_xgb.fit(train_x,train_y)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear',
       parameters={'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 100, 'objective': 'reg:squarederror', 'silent': 1, 'subsample': 0.7},
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [65]:
xgb_result=final_xgb.predict(val_x)
#xgb_result=xgb_result*mxy


In [66]:
val_score_xgb=np.sqrt(mean_squared_error(xgb_result,val_y))
test_score_xgb=np.sqrt(mean_squared_error(final_xgb.predict(test_x),test_y))

In [67]:
print(val_score_xgb," test results ",test_score_xgb)

0.038534765112044435  test results  0.03817795610986214


In [70]:
### combine all model results into one dataframe choose best model and save it to disk ### 
custom_accr(np.abs(val_y-xgb_result))

9.99

In [410]:
test123=reg.predict(valid_x)

In [414]:
if len(test123.shape)>1:
    print("yes")
else:
    print("No")

No


In [71]:
# helper function to compute accuracy of the model based on instructions 
def custom_accr(arrdiff):
    return(len(arrdiff[arrdiff<=3/mxy])*100/len(arrdiff))

In [72]:
def model_predictions(model,val_data,test_data,modelid):
    valid_x,valid_y=val_data
    testing_x,testing_y=test_data
    valid_predictions=model.predict(valid_x)
    valid_pred_score=np.sqrt(mean_squared_error(valid_predictions,valid_y))
    #special handling for linreg output
    #if len(valid_predictions.shape)>1:
     #   valdiff=np.abs(np.concatenate(valid_predictions)-np.concatenate(valid_y.values))
    #else:
    valdiff=np.abs(valid_predictions-valid_y.values)
    valid_custom_score=custom_accr(valdiff)
    testing_predictions=model.predict(testing_x)
    testing_pred_score=np.sqrt(mean_squared_error(testing_predictions,testing_y))
    #if len(testing_predictions.shape)>1:
    #    test_custom_score=custom_accr(np.abs(np.concatenate(testing_predictions)-np.concatenate(testing_y.values)))
    #else:
    test_custom_score=custom_accr(np.abs(testing_predictions-testing_y.values))
    
    result=pd.DataFrame({'modelName':[modelid],'valid_rmse':[valid_pred_score],'valid_custom_score':[valid_custom_score],
                         'test_rmse':[testing_pred_score],'test_custom_score':[test_custom_score]})
    return(result)
    

In [73]:
model_list=[reg,reglasso,regl,regr,reg_boost,final_xgb]
modelidhash={reg:'VanillaLinearRegression',reglasso:'LassoRegression',regl:'LassoRegCV',regr:'RandomForestRegr',reg_boost:'GradientBoostingReg',
            final_xgb:'XGBoostCV'}

In [74]:
model_results_=[model_predictions(mod,(val_x,val_y),(test_x,test_y),modelidhash[mod]) for mod in model_list]

In [75]:
model_results=pd.concat(model_results_)

In [76]:
model_results

Unnamed: 0,modelName,valid_rmse,valid_custom_score,test_rmse,test_custom_score
0,VanillaLinearRegression,0.05617,5.66,0.055423,5.82
0,LassoRegression,0.141599,1.39,0.141497,1.38
0,LassoRegCV,0.057809,5.13,0.056916,5.63
0,RandomForestRegr,0.056861,5.51,0.055989,6.04
0,GradientBoostingReg,0.039224,10.02,0.038687,10.18
0,XGBoostCV,0.038535,9.99,0.038178,10.3


In [77]:
mxy

839

In [None]:
joblib.dump(x_df.columns.values.tolist(),base_path+'colnames.pkl')

In [None]:
joblib.dump(final_xgb,base_path+'xgb_model.pkl')

In [80]:
rawdata['y'].shape

(100000,)

In [81]:
model_results.to_csv(base_path+'modelcomparisons.csv',index=False)