In [93]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import time as t
from sklearn import preprocessing as pre

path="C:\\dev\\telstra\\data\\"

trainFile="train.csv"
testFile="test.csv"
resourceFile="resource_type.csv"
eventTypeFile="event_type.csv"
logFeatureFile="log_feature.csv"
severityTypeFile="severity_type.csv"

test=pd.read_csv(filepath_or_buffer=path+testFile,delimiter=",",header=0)
train=pd.read_csv(filepath_or_buffer=path+trainFile,delimiter=",",header=0)

resource=pd.read_csv(filepath_or_buffer=path+resourceFile,delimiter=",",header=0)
event=pd.read_csv(filepath_or_buffer=path+eventTypeFile,delimiter=",",header=0)
feature=pd.read_csv(filepath_or_buffer=path+logFeatureFile,delimiter=",",header=0)
severity=pd.read_csv(filepath_or_buffer=path+severityTypeFile,delimiter=",",header=0)
join=pd.DataFrame({'id':[]})
datasets={'train':train,'test':test,'resource':resource,'event':event,'feature':feature,'severity':severity}
notCat=['id','volume']

#convert table of ids and classes to binary row feature vectors for each unique id
#takes a long time, thus once computed save results to csv file
for key, dataset in datasets.items():
    tmp=pd.get_dummies(dataset,dummy_na=True)
    tmp=tmp.groupby('id').sum()
    tmp['id']=tmp.index
    dataset=tmp
    datasets[key]=dataset
    
#creates join dataset table that will store features for train and test datasets
datasets['train']['sample']='train'
datasets['test']['sample']='test'
datasets['test']['fault_severity']=np.nan
join=pd.concat([datasets['train'],datasets['test']],ignore_index=True)

#joins features from different datasets into one dataframe, features only for evaluated ids are extracted
#(i.e. some features may be not related to any id)
samples=['train','test']
for key, dataset in datasets.items():
    if key not in samples:
        join=pd.merge(join,dataset,on='id',how='left') #can use concat?

join=join.set_index(['sample',join.index])

#normalizes a continuous feature
join['volume']=pre.scale(join['volume'])

#fills in missing data
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].median() for c in X],
            index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
join = DataFrameImputer().fit_transform(join)
#join=join.fillna(value=0)

#creates features and targets for final training
features=join.columns.values
features=features[2:]

train_features = join.loc['train'][features]
train_target = join.loc['train']['fault_severity']

test_features = join.loc['test'][features]

In [120]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import RidgeClassifierCV, Perceptron, SGDClassifier, PassiveAggressiveClassifier
from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.metrics import log_loss, accuracy_score, classification_report
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier
from sklearn import grid_search

In [77]:
train_x, test_x, train_y, test_y = train_test_split(train_features, train_target, test_size=0.31, random_state=0)
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = t.time()
    #clf=OneVsRestClassifier(clf) #may want to remove for testing
    clf.fit(train_x, train_y)
    train_time = t.time() - t0
    print("Training time: %0.3fs" % train_time)

    t0 = t.time()
    y_pred_single = clf.predict(test_x)
    if (hasattr(clf,'predict_proba')):
        y_pred_dist = clf.predict_proba(test_x)
        logloss=log_loss(test_y,y_pred_dist)
        print("LogLoss: %0.3fs" % logloss)
    test_time = t.time() - t0
    print("Prediction time:  %0.3fs" % test_time)

    acc_score = accuracy_score(test_y, y_pred_single)
    print("Accuracy:   %0.3f" % acc_score)
    
    #if (hasattr(clf,'predict_proba')):
       #scores = cross_val_score(clf,train_features,train_target, scoring='log_loss')
        #loss_score=scores.mean()
        #print("CV LogLoss: %0.2f (+/- %0.2f)" % (loss_score, scores.std() * 2))
    
    print(classification_report(test_y, y_pred_single))
    clf_descr = str(clf).split('(')[0]
    
    if (hasattr(clf,'predict_proba')):
        return clf_descr, acc_score, logloss, train_time, test_time
    else:
        return clf_descr, acc_score, train_time, test_time

In [79]:
results = []
for clf, name in (
        (RidgeClassifierCV(), "Ridge Classifier"),
        (RandomForestClassifier(n_estimators=300), "Random forest"),
        (AdaBoostClassifier(), "AdaBoost"),
        (BaggingClassifier(), "Bagging"),
        (QuadraticDiscriminantAnalysis(), "DiscriminatnAnalysis")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifierCV(alphas=(0.1, 1.0, 10.0), class_weight=None, cv=None,
         fit_intercept=True, normalize=False, scoring=None)
Training time: 4.643s
Prediction time:  0.038s
Accuracy:   0.727
             precision    recall  f1-score   support

        0.0       0.79      0.91      0.84      1476
        1.0       0.59      0.35      0.44       593
        2.0       0.49      0.55      0.52       220

avg / total       0.71      0.73      0.71      2289

Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verb



In [115]:
#test a classifier

# Fit the model to training data
#parameters={'n_estimators':[100,150]},'max_depth':[3]}
clf = OneVsRestClassifier(GradientBoostingClassifier(n_estimators=200,verbose=1,presort=True,max_depth=7),n_jobs=-2)#,min_samples_leaf=10,min_samples_split=5)
#clf=grid_search.GridSearchCV(gbc, parameters,n_jobs=-2)
benchmark(clf)

________________________________________________________________________________
Training: 
OneVsRestClassifier(estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=7, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200, presort=True,
              random_state=None, subsample=1.0, verbose=1,
              warm_start=False),
          n_jobs=-2)
Training time: 92.315s
Prediction time:  0.573s
Accuracy:   0.739
             precision    recall  f1-score   support

        0.0       0.80      0.89      0.84      1476
        1.0       0.61      0.42      0.50       593
        2.0       0.55      0.60      0.58       220

avg / total       0.73      0.74      0.73      2289



('OneVsRestClassifier',
 0.73918741808650068,
 0.57045593736126277,
 92.31480956077576,
 0.5728719234466553)

In [122]:
#test a classifier

# Fit the model to training data
#parameters={'n_estimators':[100,150]},'max_depth':[3]}
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=400,verbose=1,min_samples_split=10,\
                                                 oob_score=True,n_jobs=-2),n_jobs=-2)
#clf=grid_search.GridSearchCV(gbc, parameters,n_jobs=-2)
benchmark(clf)

________________________________________________________________________________
Training: 
OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-2,
            oob_score=True, random_state=None, verbose=1, warm_start=False),
          n_jobs=-2)
Training time: 31.099s

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 400 out of 400 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Done  44 tasks    


Prediction time:  1.980s
Accuracy:   0.751
             precision    recall  f1-score   support

        0.0       0.80      0.90      0.85      1476
        1.0       0.63      0.44      0.52       593
        2.0       0.59      0.56      0.57       220

avg / total       0.74      0.75      0.74      2289



('OneVsRestClassifier',
 0.75141983398864132,
 0.54245802948043198,
 31.099143743515015,
 1.9801154136657715)

In [110]:
scores = cross_val_score(clf,train_features,train_target, scoring='log_loss',n_jobs=-2,verbose=True)
print("LogLoss: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

LogLoss: -0.56 (+/- 0.01)


[Parallel(n_jobs=-2)]: Done   3 out of   3 | elapsed:   44.6s finished


In [111]:
bestCLF=clf

In [112]:
bestCLF.fit(train_features,train_target)

[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-2)]: Done 400 out of 400 | elapsed:   10.2s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=800,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-2,
            oob_score=True, random_state=None, verbose=1, warm_start=False)

In [113]:
test_response=bestCLF.predict_proba(test_features)

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 400 out of 400 | elapsed:    0.7s finished


In [114]:
#save predictions to required format 
#make it binary?
out=pd.DataFrame({'id':[],'predict_0':[],'predict_1':[],'predict_2':[]})
out['id']=test['id']
out['predict_0']=test_response.T[0]
out['predict_1']=test_response.T[1]
out['predict_2']=test_response.T[2]
out=out[['id','predict_0','predict_1','predict_2']]
out.to_csv('predictions.csv',index=False)

In [5]:
#transform cat labels into digits
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for column in join.columns:
    if column not in notCat:
        join[column]=le.fit_transform(join[column])
    
join.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,fault_severity,id,location,event_type,severity_type,resource_type,log_feature,volume
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
train,0,1,14121,148,26,1,2,237,19
train,1,1,14121,148,26,1,2,148,19
train,2,1,14121,148,27,1,2,237,19
train,3,1,14121,148,27,1,2,148,19
train,4,0,9320,1027,26,1,2,240,200
