In [1]:
from time import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Treat inf and -inf as missing ("NA") values in pandas computations by
# enabling the use_inf_as_na option.
pd.options.mode.use_inf_as_na = True

In [2]:
# Load the APS failure training and test sets from the local data/ folder.
TRAIN_CSV = 'data/aps_failure_training_set.csv'
TEST_CSV = 'data/aps_failure_test_set.csv'

# NOTE(review): only the test file is read with its first 20 lines skipped;
# presumably the training file has no such preamble -- confirm.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV, skiprows=range(20))

In [3]:
# Handy commands for taking a quick look at the data

# train_df.head()
# train_df.dtypes
# train_df.get_dtype_counts()
# train_df.head()
# train_df.describe()
# train_df.columns

In [4]:
# Encode the target and make every column numeric:
#   * 'class' labels: 'neg' -> 0, 'pos' -> 1
#   * any entry that cannot be parsed as a number is coerced to NaN
# (alternative positional access to the label column: train_df.iloc[:, 0])


def _labels_to_numeric(df):
    """Convert every column of ``df`` to a numeric dtype, in place.

    The exact labels 'neg' and 'pos' in the 'class' column are replaced by
    '0' and '1'; then each column is passed through ``pd.to_numeric`` with
    ``errors='coerce'`` so unparseable entries become NaN.

    ``Series.replace`` is used instead of the ``.str`` accessor so that the
    cell can be re-run: once 'class' is numeric, ``.str`` raises
    AttributeError, whereas ``replace`` simply finds nothing to replace.
    """
    df['class'] = df['class'].replace({'neg': '0', 'pos': '1'})
    df[df.columns] = df[df.columns].apply(pd.to_numeric, errors='coerce')


_labels_to_numeric(train_df)
_labels_to_numeric(test_df)

test_df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,60,0.0,20.0,12.0,0.0,0.0,0.0,0.0,0.0,...,1098.0,138.0,412.0,654.0,78.0,88.0,0.0,0.0,0.0,0.0
1,0,82,0.0,68.0,40.0,0.0,0.0,0.0,0.0,0.0,...,1068.0,276.0,1620.0,116.0,86.0,462.0,0.0,0.0,0.0,0.0
2,0,66002,2.0,212.0,112.0,0.0,0.0,0.0,0.0,0.0,...,495076.0,380368.0,440134.0,269556.0,1315022.0,153680.0,516.0,0.0,0.0,0.0
3,0,59816,,1010.0,936.0,0.0,0.0,0.0,0.0,0.0,...,540820.0,243270.0,483302.0,485332.0,431376.0,210074.0,281662.0,3232.0,0.0,0.0
4,0,1814,,156.0,140.0,0.0,0.0,0.0,0.0,0.0,...,7646.0,4144.0,18466.0,49782.0,3176.0,482.0,76.0,0.0,0.0,0.0


In [5]:
# Total number of zero-valued cells in each data frame (summed over all columns)
train_zeros = (train_df == 0).sum().sum()
test_zeros = (test_df == 0).sum().sum()
print("Train data frame contains " + str(train_zeros) + " zeros")
print("Test data frame contains " + str(test_zeros) + " zeros")

Train data frame contains 3314175 zeros
Test data frame contains 879695 zeros


In [6]:
# Total number of missing (NaN) cells in each data frame (summed over all columns)
train_nans = train_df.isnull().sum().sum()
test_nans = test_df.isnull().sum().sum()
print("Train data frame contains " + str(train_nans) + " NaNs")
print("Test data frame contains " + str(test_nans) + " NaNs")

# isnull() on its own returns a boolean "mask" data frame:
# train_df.isnull()

Train data frame contains 850015 NaNs
Test data frame contains 228680 NaNs


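Strategy: marking all zeroes as NaN (currently disabled — the cell below is commented out; the cell after it replaces NaNs with zeros instead)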

In [7]:
# cols = train_df.columns.drop('class')
# train_df[cols] = train_df[cols].replace(0, np.NaN)

# cols = test_df.columns.drop('class')
# test_df[cols] = test_df[cols].replace(0, np.NaN)

In [8]:
# Replace missing feature values with 0; the 'class' label column is left untouched.
# np.nan is used rather than the np.NaN alias, which was removed in NumPy 2.0.
cols = train_df.columns.drop('class')
train_df[cols] = train_df[cols].replace(np.nan, 0)

cols = test_df.columns.drop('class')
test_df[cols] = test_df[cols].replace(np.nan, 0)

In [9]:
# Sanity check: number of NaN cells still present in the training frame
print(train_df.isnull().values.sum())

0


In [10]:
# Separate features and labels as NumPy arrays.
# The training frame's feature columns are used to select from both frames,
# so the two feature matrices share the same column order.
cols = train_df.columns.drop('class')
X_train, X_test = train_df[cols].values, test_df[cols].values

# labels
y_train, y_test = train_df['class'].values, test_df['class'].values

In [11]:
# Median imputation of missing values, learned column-wise (axis=0) from the
# training features only and then applied to both train and test features.
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
# Available strategies: mean, median, most_frequent
#
# NOTE(review): the earlier NaN -> 0 replacement leaves no NaNs in the features
# (the NaN count printed for train_df is 0), so this step presumably changes
# nothing -- confirm whether it is still needed.

imp = Imputer(missing_values='NaN', strategy='median', axis=0).fit(X_train)
X_train_imp, X_test_imp = imp.transform(X_train), imp.transform(X_test)

In [12]:
# # Random Forest Training
# # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# rfmodel = RandomForestClassifier(n_estimators=10)
# t0 = time()
# r_clf = rfmodel.fit(X_train_imp, y_train)
# print "training time:", round(time()-t0, 3), "s"

# # print(clf.feature_importances_)
# t0 = time()
# pred = r_clf.predict(X_test_imp)
# print "predict time:", round(time()-t0, 3), "s"

# acc = accuracy_score(pred, y_test)
# print "accuracy_score:", acc

# kfold = KFold(n_splits=3, random_state=7)
# result = cross_val_score(rfmodel, X_train_imp, y_train, cv=kfold, scoring='accuracy')
# print "kfold mean:", result.mean()

In [13]:
# KNeighborsClassifier
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
#
# KNN cannot handle missing data by itself:
#   ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# so it is trained on the features whose NaNs were replaced by zeros.

neigh = KNeighborsClassifier(n_neighbors=3)

# Time the fit. print() with a pre-formatted string gives the same text on
# Python 2 and Python 3 (the old print statement is a SyntaxError on Python 3).
t0 = time()
n_clf = neigh.fit(X_train, y_train)
print("training time: {} s".format(round(time() - t0, 3)))

# Time the prediction on the held-out test set.
t0 = time()
pred = n_clf.predict(X_test)
print("predict time: {} s".format(round(time() - t0, 3)))

# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the
# reported value is unchanged by putting the arguments in the documented order.
acc = accuracy_score(y_test, pred)
print("accuracy_score: {}".format(acc))

# Plain 3-fold split without shuffling. random_state only has an effect when
# shuffle=True, and recent scikit-learn raises a ValueError if it is set
# otherwise, so it is not passed here; the folds are the same as before.
kfold = KFold(n_splits=3)
# NOTE(review): cross-validation runs on X_train_imp while the fit above uses
# X_train -- confirm this is intended.
result = cross_val_score(neigh, X_train_imp, y_train, cv=kfold, scoring='accuracy')
print("kfold mean: {}".format(result.mean()))




training time: 3.006 s
predict time: 42.707 s
accuracy_score: 0.9848125
kfold mean: 0.9866999999999999


In [None]:
# https://stackoverflow.com/questions/35827863/remove-outliers-in-pandas-dataframe-using-percentiles/35828995
# remove outliers using Percentiles

https://pandas.pydata.org/pandas-docs/version/0.22.0/missing_data.html

