## Import Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score as cvs
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV as GSCV

## Data Cleaning

In [2]:
eve = pd.read_csv('event_type.csv')
print(eve.info())
eve.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31170 entries, 0 to 31169
Data columns (total 2 columns):
id            31170 non-null int64
event_type    31170 non-null object
dtypes: int64(1), object(1)
memory usage: 487.1+ KB
None


Unnamed: 0,id,event_type
0,6597,event_type 11
1,8011,event_type 15
2,2597,event_type 15
3,5022,event_type 15
4,5022,event_type 11


In [3]:
log = pd.read_csv('log_feature.csv')
print(log.info())
log.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58671 entries, 0 to 58670
Data columns (total 3 columns):
id             58671 non-null int64
log_feature    58671 non-null object
volume         58671 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.3+ MB
None


Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [4]:
res = pd.read_csv('resource_type.csv')
print(res.info())
res.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21076 entries, 0 to 21075
Data columns (total 2 columns):
id               21076 non-null int64
resource_type    21076 non-null object
dtypes: int64(1), object(1)
memory usage: 329.4+ KB
None


Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [5]:
sev = pd.read_csv('severity_type.csv')
print(sev.info())
sev.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18552 entries, 0 to 18551
Data columns (total 2 columns):
id               18552 non-null int64
severity_type    18552 non-null object
dtypes: int64(1), object(1)
memory usage: 290.0+ KB
None


Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


In [6]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0


In [7]:
merge0 = pd.merge(left=eve, right=log)
merge0.head()

Unnamed: 0,id,event_type,log_feature,volume
0,6597,event_type 11,feature 68,6
1,8011,event_type 15,feature 68,7
2,2597,event_type 15,feature 68,1
3,5022,event_type 15,feature 172,2
4,5022,event_type 15,feature 56,1


In [8]:
merge1 = pd.merge(left=res,right=sev)
merge1.head()

Unnamed: 0,id,resource_type,severity_type
0,6597,resource_type 8,severity_type 2
1,8011,resource_type 8,severity_type 2
2,2597,resource_type 8,severity_type 2
3,5022,resource_type 8,severity_type 1
4,6852,resource_type 8,severity_type 1


In [9]:
merge2 = pd.merge(left=merge0, right=merge1)#, on='id')
print(merge2.info())
merge2.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146423 entries, 0 to 146422
Data columns (total 6 columns):
id               146423 non-null int64
event_type       146423 non-null object
log_feature      146423 non-null object
volume           146423 non-null int64
resource_type    146423 non-null object
severity_type    146423 non-null object
dtypes: int64(2), object(4)
memory usage: 7.8+ MB
None


Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type
0,6597,event_type 11,feature 68,6,resource_type 8,severity_type 2
1,8011,event_type 15,feature 68,7,resource_type 8,severity_type 2
2,2597,event_type 15,feature 68,1,resource_type 8,severity_type 2
3,5022,event_type 15,feature 172,2,resource_type 8,severity_type 1
4,5022,event_type 15,feature 56,1,resource_type 8,severity_type 1


In [10]:
df = pd.merge(left=train, right=merge2)#, on='id')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Data columns (total 8 columns):
id                61839 non-null int64
location          61839 non-null object
fault_severity    61839 non-null int64
event_type        61839 non-null object
log_feature       61839 non-null object
volume            61839 non-null int64
resource_type     61839 non-null object
severity_type     61839 non-null object
dtypes: int64(3), object(5)
memory usage: 4.2+ MB
None


Unnamed: 0,id,location,fault_severity,event_type,log_feature,volume,resource_type,severity_type
0,14121,location 118,1,event_type 34,feature 312,19,resource_type 2,severity_type 2
1,14121,location 118,1,event_type 34,feature 232,19,resource_type 2,severity_type 2
2,14121,location 118,1,event_type 35,feature 312,19,resource_type 2,severity_type 2
3,14121,location 118,1,event_type 35,feature 232,19,resource_type 2,severity_type 2
4,9320,location 91,0,event_type 34,feature 315,200,resource_type 2,severity_type 2


In [11]:
list(df.columns)

['id',
 'location',
 'fault_severity',
 'event_type',
 'log_feature',
 'volume',
 'resource_type',
 'severity_type']

In [12]:
col = list(df.columns)
del col[0]
del col[1]
del col[3]
col

['location', 'event_type', 'log_feature', 'resource_type', 'severity_type']

In [13]:
for i in range(len(col)):
    df[col[i]] = df[col[i]].apply(lambda x:x.split()[1])

In [14]:
df.head()

Unnamed: 0,id,location,fault_severity,event_type,log_feature,volume,resource_type,severity_type
0,14121,118,1,34,312,19,2,2
1,14121,118,1,34,232,19,2,2
2,14121,118,1,35,312,19,2,2
3,14121,118,1,35,232,19,2,2
4,9320,91,0,34,315,200,2,2


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Data columns (total 8 columns):
id                61839 non-null int64
location          61839 non-null object
fault_severity    61839 non-null int64
event_type        61839 non-null object
log_feature       61839 non-null object
volume            61839 non-null int64
resource_type     61839 non-null object
severity_type     61839 non-null object
dtypes: int64(3), object(5)
memory usage: 4.2+ MB


In [16]:
data = df.drop('fault_severity',axis=1)
data.head()

Unnamed: 0,id,location,event_type,log_feature,volume,resource_type,severity_type
0,14121,118,34,312,19,2,2
1,14121,118,34,232,19,2,2
2,14121,118,35,312,19,2,2
3,14121,118,35,232,19,2,2
4,9320,91,34,315,200,2,2


In [17]:
datadummy = pd.get_dummies(data,drop_first=True)
datadummy.head()

Unnamed: 0,id,volume,location_10,location_100,location_1000,location_1002,location_1005,location_1006,location_1007,location_1008,...,resource_type_4,resource_type_5,resource_type_6,resource_type_7,resource_type_8,resource_type_9,severity_type_2,severity_type_3,severity_type_4,severity_type_5
0,14121,19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,14121,19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,14121,19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,14121,19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9320,200,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [18]:
df = datadummy.groupby(['id'],sort=False).sum()
df.head()

Unnamed: 0_level_0,volume,location_10,location_100,location_1000,location_1002,location_1005,location_1006,location_1007,location_1008,location_1009,...,resource_type_4,resource_type_5,resource_type_6,resource_type_7,resource_type_8,resource_type_9,severity_type_2,severity_type_3,severity_type_4,severity_type_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14121,76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
9320,632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
14394,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
8218,44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0
14804,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7381 entries, 14121 to 17067
Columns: 1320 entries, volume to severity_type_5
dtypes: float64(1319), int64(1)
memory usage: 74.4 MB


In [20]:
df = df.astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7381 entries, 14121 to 17067
Columns: 1320 entries, volume to severity_type_5
dtypes: int32(1320)
memory usage: 37.2 MB


In [21]:
X = df
y = train['fault_severity']

In [22]:
Xtrain, Xtest, ytrain, ytest = tts(X,y, test_size=0.2, random_state=25)

## GridSearchCV

In [None]:
para_test1 = {'n_estimators':range(20,101,10)}

In [None]:
gb = GradientBoostingClassifier(min_samples_split=50, min_samples_leaf=50, max_depth=8, 
                                max_features='sqrt', subsample=0.8, random_state=10)

In [None]:
grids1 = GSCV(gb, para_test1, n_jobs=4, iid=False, cv=5) # Why doesn't scoring='roc_auc' work?
grids1.fit(X, y)
grids1.grid_scores_, grids1.best_params_, grids1.best_score_

In [None]:
para_test2 = {'max_depth':range(4,21,2), 'min_samples_split':range(200,1001,200)}

In [None]:
# This Graident Boosting with adjusted parameters.
gb = GradientBoostingClassifier(n_estimators=90, min_samples_leaf=50, 
                                max_features='sqrt', subsample=0.8, 
                                random_state=10)

In [36]:
grids2 = GSCV(gb, para_test2, n_jobs=4, iid=False, cv=5)
grids2.fit(X, y)
grids2.grid_scores_, grids2.best_params_, grids2.best_score_



([mean: 0.71210, std: 0.00979, params: {'max_depth': 4, 'min_samples_split': 200},
  mean: 0.71292, std: 0.00804, params: {'max_depth': 4, 'min_samples_split': 400},
  mean: 0.71156, std: 0.00905, params: {'max_depth': 4, 'min_samples_split': 600},
  mean: 0.71210, std: 0.00969, params: {'max_depth': 4, 'min_samples_split': 800},
  mean: 0.71278, std: 0.00848, params: {'max_depth': 4, 'min_samples_split': 1000},
  mean: 0.71617, std: 0.01031, params: {'max_depth': 6, 'min_samples_split': 200},
  mean: 0.71671, std: 0.00941, params: {'max_depth': 6, 'min_samples_split': 400},
  mean: 0.71711, std: 0.00912, params: {'max_depth': 6, 'min_samples_split': 600},
  mean: 0.71684, std: 0.00940, params: {'max_depth': 6, 'min_samples_split': 800},
  mean: 0.71630, std: 0.00875, params: {'max_depth': 6, 'min_samples_split': 1000},
  mean: 0.71522, std: 0.00910, params: {'max_depth': 8, 'min_samples_split': 200},
  mean: 0.71698, std: 0.00959, params: {'max_depth': 8, 'min_samples_split': 400},
  

In [51]:
para_test3 = {'min_samples_leaf':range(1,6,1)}

In [52]:
gb = GradientBoostingClassifier(n_estimators=90, max_depth=10, min_samples_split=600, 
                                max_features='sqrt', subsample=0.8, random_state=10)

In [53]:
grids3 = GSCV(gb, para_test3, n_jobs=4, iid=False, cv=5)
grids3.fit(X, y)
grids3.grid_scores_, grids3.best_params_, grids3.best_score_



([mean: 0.73256, std: 0.01312, params: {'min_samples_leaf': 1},
  mean: 0.73730, std: 0.01361, params: {'min_samples_leaf': 2},
  mean: 0.73351, std: 0.00886, params: {'min_samples_leaf': 3},
  mean: 0.73513, std: 0.01016, params: {'min_samples_leaf': 4},
  mean: 0.73378, std: 0.01035, params: {'min_samples_leaf': 5}],
 {'min_samples_leaf': 2},
 0.73730148614850521)

In [54]:
para_test4 = {'max_features':range(100,601,100)}

In [55]:
gb = GradientBoostingClassifier(n_estimators=90, max_depth=10, min_samples_split=600, 
                                min_samples_leaf=2, subsample=0.8, random_state=10)

In [56]:
grids4 = GSCV(gb, para_test4, n_jobs=4, iid=False, cv=5)
grids4.fit(X, y)
grids4.grid_scores_, grids4.best_params_, grids4.best_score_



([mean: 0.73567, std: 0.01063, params: {'max_features': 100},
  mean: 0.74109, std: 0.00801, params: {'max_features': 200},
  mean: 0.73839, std: 0.00709, params: {'max_features': 300},
  mean: 0.73906, std: 0.00711, params: {'max_features': 400},
  mean: 0.73879, std: 0.00988, params: {'max_features': 500},
  mean: 0.74028, std: 0.00909, params: {'max_features': 600}],
 {'max_features': 200},
 0.74109497886382736)

In [60]:
para_test5 = {'subsample':[0.5,0.6,0.7,0.8,0.9]}

In [61]:
gb = GradientBoostingClassifier(n_estimators=90, max_depth=10, min_samples_split=600, 
                                min_samples_leaf=2, max_features=200, random_state=10)

In [62]:
grids5 = GSCV(gb, para_test5, n_jobs=4, iid=False, cv=5)
grids5.fit(X, y)
grids5.grid_scores_, grids5.best_params_, grids5.best_score_



([mean: 0.73446, std: 0.00991, params: {'subsample': 0.5},
  mean: 0.73459, std: 0.00775, params: {'subsample': 0.6},
  mean: 0.73500, std: 0.00817, params: {'subsample': 0.7},
  mean: 0.74109, std: 0.00801, params: {'subsample': 0.8},
  mean: 0.73798, std: 0.00766, params: {'subsample': 0.9}],
 {'subsample': 0.8},
 0.74109497886382736)

In [24]:
gb = GradientBoostingClassifier(criterion='mae', learning_rate=0.05, n_estimators=180, max_depth=10, 
                                min_samples_split=600, min_samples_leaf=2, max_features=200, subsample=0.8, 
                                random_state=10, verbose=1)

In [27]:
gb.fit(X, y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        5567.5535          29.2271          215.36m
         2        5479.6501          24.1514          215.55m
         3        5381.6386          23.7988          213.71m
         4        5260.8325          26.2539          197.66m
         5        5165.7319          23.5625          187.69m
         6        5096.6274          22.6424          182.94m
         7        5012.8668          18.8624          180.05m
         8        4933.8093          17.2294          176.47m
         9        4866.5229          19.7382          171.52m
        10        4771.3141          17.3368          166.88m
        20        4294.1078           8.5704          149.56m
        30        4021.0603           3.9299          134.72m
        40        3863.7827           1.6317          123.45m
        50        3787.2343           1.1872          113.02m
        60        3682.9367           0.7893          103.40m
       

GradientBoostingClassifier(criterion='mae', init=None, learning_rate=0.05,
              loss='deviance', max_depth=10, max_features=200,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=2,
              min_samples_split=600, min_weight_fraction_leaf=0.0,
              n_estimators=180, presort='auto', random_state=10,
              subsample=0.8, verbose=1, warm_start=False)

In [25]:
score = cvs(gb, X, y, cv=5)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        4461.5296          24.9851          120.98m
         2        4375.7165          20.5097          131.65m
         3        4299.4191          20.5453          131.55m
         4        4212.5769          17.1335          123.31m
         5        4117.1621          19.3932          115.95m
         6        4057.6237          18.9229          111.37m
         7        3988.5276          15.2579          107.90m
         8        3933.5116          17.5720          105.43m
         9        3866.1500          16.2500          102.66m
        10        3772.2563          12.3610           99.67m
        20        3390.5020           6.1604           83.31m
        30        3189.7148           3.6619           72.47m
        40        3071.7588           1.8783           64.41m
        50        2966.2098           0.9622           56.72m
        60        2886.5977           0.8844           50.47m
       

In [26]:
score

array([ 0.7165088 ,  0.72764228,  0.74322493,  0.73509485,  0.73288136])

In [29]:
gb1 = GradientBoostingClassifier(criterion='mae', learning_rate=0.1, n_estimators=90, max_depth=10, 
                                min_samples_split=600, min_samples_leaf=2, max_features=200, subsample=0.8, 
                                random_state=10, verbose=1)

In [30]:
gb1.fit(X, y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        5451.3470          56.7833          105.54m
         2        5257.7922          49.2663          100.27m
         3        5074.3923          44.4883           95.52m
         4        4905.5670          35.3395           87.03m
         5        4757.6835          35.9059           82.60m
         6        4668.8006          29.2577           80.44m
         7        4551.5618          23.8535           77.36m
         8        4473.7844          18.7352           75.49m
         9        4398.9050          21.0421           73.71m
        10        4285.2741          19.8271           71.43m
        20        3880.6252           3.3563           61.27m
        30        3683.1331           2.3862           50.71m
        40        3556.5474           0.4905           40.57m
        50        3496.2155           0.3415           31.13m
        60        3399.6343           0.3996           22.97m
       

GradientBoostingClassifier(criterion='mae', init=None, learning_rate=0.1,
              loss='deviance', max_depth=10, max_features=200,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=2,
              min_samples_split=600, min_weight_fraction_leaf=0.0,
              n_estimators=90, presort='auto', random_state=10,
              subsample=0.8, verbose=1, warm_start=False)

In [31]:
score1 = cvs(gb1, X, y, cv=5)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        4356.9331          50.8901           51.22m
         2        4200.9276          36.5678           53.24m
         3        4076.1003          33.0664           55.66m
         4        3923.3665          31.5663           53.02m
         5        3787.9751          28.0461           49.90m
         6        3703.3443          25.3530           47.69m
         7        3613.8294          20.7976           46.55m
         8        3547.6363          19.5318           45.47m
         9        3482.1078          16.6155           44.39m
        10        3372.4364          11.2122           43.26m
        20        3061.1656           4.2253           33.62m
        30        2915.6104           1.1975           26.15m
        40        2792.0497           1.0502           20.09m
        50        2682.9587           0.1680           15.16m
        60        2615.3680          -0.4907           10.94m
       

In [32]:
score1

array([ 0.72056834,  0.72222222,  0.74119241,  0.72425474,  0.73152542])

In [59]:
probability = gb1.predict_proba(X)

In [40]:
results = gb1.predict(X)

In [88]:
probability

array([[ 0.96254338,  0.03261109,  0.00484553],
       [ 0.97070031,  0.02385767,  0.00544202],
       [ 0.52093234,  0.46896634,  0.01010132],
       ..., 
       [ 0.17374381,  0.21072251,  0.61553367],
       [ 0.88997048,  0.08433629,  0.02569323],
       [ 0.87915458,  0.10494066,  0.01590476]])

In [121]:
p1 = []
p2 = []
for i in range(len(probability)):
    p1.append(probability[i][1])
    p2.append(probability[i][2])

In [123]:
file = pd.DataFrame(columns=['location', 'fault_severity', 'proba_0', 'proba_1', 'proba_2'])

In [124]:
file['location'] = train['location']
file['fault_severity'] = results
file['proba_0'] = probability
file['proba_1'] = p1
file['proba_2'] = p2

In [125]:
file.head()

Unnamed: 0,location,fault_severity,proba_0,proba_1,proba_2
0,location 118,0,0.962543,0.032611,0.004846
1,location 91,0,0.9707,0.023858,0.005442
2,location 152,0,0.520932,0.468966,0.010101
3,location 931,0,0.545129,0.258165,0.196706
4,location 120,0,0.701579,0.258879,0.039542
