In [1]:
from time import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

# Make pandas treat inf and -inf as missing ("NA") values in computations.
# NOTE(review): this option is reportedly deprecated in recent pandas
# releases — confirm against the installed version before relying on it.
pd.options.mode.use_inf_as_na = True

In [2]:
# Both CSV files are read with their first 20 lines (indices 0-19) skipped;
# presumably those lines are a textual preamble before the header — TODO confirm.
preamble_rows = range(0, 20)
train_df = pd.read_csv('data/aps_failure_training_set.csv', skiprows=preamble_rows)
test_df = pd.read_csv('data/aps_failure_test_set.csv', skiprows=preamble_rows)

In [4]:
# Handy one-liners for taking a quick look at the data

# test_df.head()
# train_df.dtypes
# train_df.get_dtype_counts()
# train_df.head()
# train_df.describe()
# train_df.columns

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60,0,20,12,0,0,0,0,0,...,1098,138,412,654,78,88,0,0,0,0
1,neg,82,0,68,40,0,0,0,0,0,...,1068,276,1620,116,86,462,0,0,0,0
2,neg,66002,2,212,112,0,0,0,0,0,...,495076,380368,440134,269556,1315022,153680,516,0,0,0
3,neg,59816,na,1010,936,0,0,0,0,0,...,540820,243270,483302,485332,431376,210074,281662,3232,0,0
4,neg,1814,na,156,140,0,0,0,0,0,...,7646,4144,18466,49782,3176,482,76,0,0,0


In [5]:
# Encode the target as numbers (neg -> 0, pos -> 1) and coerce every column to
# a numeric dtype; anything unparsable (e.g. the 'na' placeholder) becomes NaN.
# alternative index access: train_df.iloc[:, 0]
def encode_and_coerce(df):
    """Return a numeric copy of ``df`` with the 'class' labels encoded as 0/1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'class' column holding 'neg'/'pos' strings, or labels
        that are already numeric if this cell has been run before.

    Returns
    -------
    pandas.DataFrame
        New frame in which every column has been passed through
        ``pd.to_numeric(..., errors='coerce')``, so values that cannot be
        parsed as numbers become NaN. The input frame is not modified.
    """
    encoded = df.copy()
    # Guard keeps the cell re-runnable: the .str accessor raises on a column
    # that has already been converted to numbers.
    if not pd.api.types.is_numeric_dtype(encoded['class']):
        encoded['class'] = (
            encoded['class'].str.replace('neg', '0').str.replace('pos', '1')
        )
    return encoded.apply(pd.to_numeric, errors='coerce')


train_df = encode_and_coerce(train_df)
test_df = encode_and_coerce(test_df)
cols = test_df.columns  # still defined for any cell that reads `cols`

test_df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,60,0.0,20.0,12.0,0.0,0.0,0.0,0.0,0.0,...,1098.0,138.0,412.0,654.0,78.0,88.0,0.0,0.0,0.0,0.0
1,0,82,0.0,68.0,40.0,0.0,0.0,0.0,0.0,0.0,...,1068.0,276.0,1620.0,116.0,86.0,462.0,0.0,0.0,0.0,0.0
2,0,66002,2.0,212.0,112.0,0.0,0.0,0.0,0.0,0.0,...,495076.0,380368.0,440134.0,269556.0,1315022.0,153680.0,516.0,0.0,0.0,0.0
3,0,59816,,1010.0,936.0,0.0,0.0,0.0,0.0,0.0,...,540820.0,243270.0,483302.0,485332.0,431376.0,210074.0,281662.0,3232.0,0.0,0.0
4,0,1814,,156.0,140.0,0.0,0.0,0.0,0.0,0.0,...,7646.0,4144.0,18466.0,49782.0,3176.0,482.0,76.0,0.0,0.0,0.0


In [6]:
# Total number of cells equal to zero in each data frame
for label, frame in (("Train", train_df), ("Test", test_df)):
    zero_total = (frame == 0).sum().sum()
    print("{} data frame contains {} zeros".format(label, zero_total))

Train data frame contains 3314175 zeros
Test data frame contains 879695 zeros


In [7]:
# Total number of missing (NaN) cells in each data frame
for label, frame in (("Train", train_df), ("Test", test_df)):
    nan_total = frame.isnull().sum().sum()
    print("{} data frame contains {} NaNs".format(label, nan_total))

# isnull() on its own returns the boolean "mask" dataframe
# train_df.isnull()

Train data frame contains 850015 NaNs
Test data frame contains 228680 NaNs


In [None]:
# Column-wise totals of the training frame. The cell previously held the
# unfinished expression `train_df.su`, which raises AttributeError on a full
# re-run; `.sum()` is the presumed intent — TODO confirm.
train_df.sum()

Strategy: marking all zeroes as NaN

In [14]:
# cols = train_df.columns.drop('class')
# train_df[cols] = train_df[cols].replace(0, np.NaN)

# cols = test_df.columns.drop('class')
# test_df[cols] = test_df[cols].replace(0, np.NaN)

In [15]:
# cols = train_df.columns.drop('class')
# train_df[cols] = train_df[cols].replace(np.NaN, 0)

# cols = test_df.columns.drop('class')
# test_df[cols] = test_df[cols].replace(np.NaN, 0)


In [16]:
# Re-check the total number of NaN cells in the training frame
missing_total = train_df.isnull().sum().sum()
print(missing_total)

850015


In [32]:
# Separate the predictors from the target and expose both as NumPy arrays.
# The feature column list is taken from the training frame and reused for the
# test frame, so both matrices share the same column order.
cols = train_df.columns.drop('class')

# (features, labels) for each split
X_train, y_train = train_df[cols].values, train_df['class'].values
X_test, y_test = test_df[cols].values, test_df['class'].values

[[7.66980000e+04            nan 2.13070644e+09 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.30580000e+04            nan 0.00000000e+00 ... 1.50000000e+03
  0.00000000e+00 0.00000000e+00]
 [4.10400000e+04            nan 2.28000000e+02 ... 5.14000000e+02
  0.00000000e+00 0.00000000e+00]
 ...
 [1.12000000e+02 0.00000000e+00 2.13070643e+09 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.02920000e+04            nan 2.13070643e+09 ... 3.88422000e+05
  0.00000000e+00 0.00000000e+00]
 [4.02220000e+04            nan 6.98000000e+02 ... 1.58000000e+02
  0.00000000e+00 0.00000000e+00]]


In [33]:
# Fill missing values with the per-column median learned from the training set.
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
# Available strategies: mean, median, most_frequent
# NOTE(review): Imputer appears to have been dropped from newer scikit-learn
# releases in favour of sklearn.impute.SimpleImputer — confirm against the
# installed version.

imp = Imputer(missing_values='NaN', strategy='median', axis=0).fit(X_train)
# The statistics fitted on the training data are applied to both splits
X_train_imp, X_test_imp = [imp.transform(split) for split in (X_train, X_test)]

pd.DataFrame(X_train_imp)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160,161,162,163,164,165,166,167,168,169
0,76698.0,0.0,2.130706e+09,280.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,33058.0,0.0,0.000000e+00,126.0,0.0,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,41040.0,0.0,2.280000e+02,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,12.0,0.0,7.000000e+01,66.0,0.0,10.0,0.0,0.0,0.0,318.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,60874.0,0.0,1.368000e+03,458.0,0.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0
5,38312.0,0.0,2.130706e+09,218.0,0.0,0.0,0.0,0.0,0.0,0.0,...,388574.0,288278.0,900430.0,300412.0,1534.0,338.0,856.0,0.0,0.0,0.0
6,14.0,0.0,6.000000e+00,126.0,0.0,0.0,0.0,0.0,0.0,0.0,...,168.0,48.0,60.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0
7,102960.0,0.0,2.130706e+09,116.0,0.0,0.0,0.0,0.0,0.0,0.0,...,715518.0,384948.0,915978.0,1052166.0,1108672.0,341532.0,129504.0,7832.0,0.0,0.0
8,78696.0,0.0,0.000000e+00,126.0,0.0,0.0,0.0,0.0,0.0,0.0,...,699290.0,362510.0,1190028.0,1012704.0,160090.0,63216.0,41202.0,4.0,0.0,0.0
9,153204.0,0.0,1.820000e+02,126.0,0.0,0.0,0.0,0.0,0.0,11804.0,...,129862.0,26872.0,34044.0,22472.0,34362.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# # Random Forest Training
# # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# rfmodel = RandomForestClassifier(n_estimators=10)
# t0 = time()
# r_clf = rfmodel.fit(X_train_imp, y_train)
# print "training time:", round(time()-t0, 3), "s"

# # print(clf.feature_importances_)
# t0 = time()
# pred = r_clf.predict(X_test_imp)
# print "predict time:", round(time()-t0, 3), "s"

# acc = accuracy_score(pred, y_test)
# print "accuracy_score:", acc

# kfold = KFold(n_splits=3, random_state=7)
# result = cross_val_score(rfmodel, X_train_imp, y_train, cv=kfold, scoring='accuracy')
# print "kfold mean:", result.mean()

In [2]:
# KNeighborsClassifier
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

# Can't handle missing data by itself 
# ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Maybe we can try replace all NaNs by zeros and then train the model

#neigh = KNeighborsClassifier(n_neighbors=3)
#t0 = time()
#n_clf = neigh.fit(X_train, y_train) 
#print "training time:", round(time()-t0, 3), "s"

# print(clf.feature_importances_)
#t0 = time()
#pred = n_clf.predict(X_test)
#print "predict time:", round(time()-t0, 3), "s"

#acc = accuracy_score(pred, y_test)
#print "accuracy_score:", acc

#kfold = KFold(n_splits=3, random_state=7)
#result = cross_val_score(neigh, X_train_imp, y_train, cv=kfold, scoring='accuracy')
#print "kfold mean:", result.mean()




In [None]:
# SVM
# http://scikit-learn.org/stable/modules/svm.html

# SVC cannot handle missing values (fit raises ValueError on NaN input), so
# train on the median-imputed matrix rather than the raw X_train.
clf = svm.SVC()
clf.fit(X_train_imp, y_train)

# predict() takes only the feature matrix; score the result separately:
# pred = clf.predict(X_test_imp)
# acc = accuracy_score(pred, y_test)

In [None]:
# https://stackoverflow.com/questions/35827863/remove-outliers-in-pandas-dataframe-using-percentiles/35828995
# remove outliers using Percentiles

https://pandas.pydata.org/pandas-docs/version/0.22.0/missing_data.html

