# Challenge
### Statistical Learning

Author: Anas Barakat

The last cell contains the code that produced the best score.
The preceding cells show the other methods that were tried on the data before reaching that score.

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer


from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

#for neural network models 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD

from sklearn.neural_network import MLPClassifier

from time import time 

Using TensorFlow backend.


In [2]:
# Performance criterion
def compute_pred_score(y_true, y_pred):
    """Return the challenge loss averaged over all samples (lower is better).

    The element-wise product ``y_true * y_pred`` is inspected: every entry
    equal to -1 costs 10, every entry equal to 0 costs 1, and any other entry
    costs nothing. The total cost is divided by the number of samples.

    Parameters
    ----------
    y_true : array-like
        Reference labels. Any array-like (list, tuple, ndarray) is accepted.
    y_pred : array-like
        Predictions, same length as ``y_true``; only -1, 0 and 1 are allowed.

    Returns
    -------
    float
        The mean cost per sample.

    Raises
    ------
    ValueError
        If ``y_pred`` contains a value other than -1, 0 or 1.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Coerce to arrays so that plain Python sequences multiply element-wise
    # instead of failing on `list * list`.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    unexpected_values = set(np.unique(y_pred).tolist()) - {-1, 0, 1}
    if unexpected_values:
        raise ValueError('The predictions can contain only -1, 1, or 0!')

    y_comp = y_true * y_pred
    score = float(10 * np.sum(y_comp == -1) + np.sum(y_comp == 0))
    score /= y_comp.shape[0]
    return score

# Locations of the challenge files, relative to the notebook's working directory.
X_train_fname = 'training_templates.csv'
y_train_fname = 'training_labels.txt'
X_test_fname  = 'testing_templates.csv'

# The CSV files have no header row; keep only the raw value matrices.
X_train = pd.read_csv(X_train_fname, sep=',', header=None).values
X_test  = pd.read_csv(X_test_fname,  sep=',', header=None).values

# `np.int` was only an alias of the builtin `int` and no longer exists in
# recent NumPy releases, so the builtin is used directly.
y_train = np.loadtxt(y_train_fname, dtype=int)

In [None]:
# Random Forest attempt
# Improve the predictive power of the model by tuning the parameters /
# increasing the number of estimators (time/quality trade-off: the bigger the better).
n_estimators = 500

# Feature selection after using the random forest's `feature_importances_`
# (kept for reference, currently disabled):
#X_train = X_train[:, [19, 34, 44, 57, 69, 74, 77, 79, 86, 87, 102, 116]]
#X_test  = X_test[:, [19, 34, 44, 57, 69, 74, 77, 79, 86, 87, 102, 116]]

#X_train = X_train[:,[1, 6, 13, 14, 19, 25, 32, 34, 40, 44, 49, 52, 57, 65, 67, 69, 74, 76, 77, 79, 86, 87, 102, 105, 106, 108, 116, 121, 122, 126]
#]
#X_test = X_test[:,[1, 6, 13, 14, 19, 25, 32, 34, 40, 44, 49, 52, 57, 65, 67, 69, 74, 76, 77, 79, 86, 87, 102, 105, 106, 108, 116, 121, 122, 126]
#]

# RF fitting
model = RandomForestClassifier(n_estimators=n_estimators, max_features='log2',
                               max_depth=None, n_jobs=-1, min_samples_leaf=1)
clfRF = model.fit(X_train, y_train)

# Prediction on the training set
y_pred_train = clfRF.predict(X_train)
#print(clfRF.feature_importances_)
#plt.plot(clfRF.feature_importances_)

# Feature selection (disabled): keep the features whose importance exceeds 0.010
#importantFeatures = []
#for k in range(128):
#    if clfRF.feature_importances_[k]>0.010:
#        importantFeatures += [k]
#print(importantFeatures)

# Compute the score
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

# Prediction on the test set
y_pred = clfRF.predict(X_test)

proba = clfRF.predict_proba(X_test)
print(proba[:10])

# Predict 0 (abstain) when the probabilities of predicting -1 and 1 are close.
# A boolean mask covers every sample; the previous loop started at index 1 and
# therefore never examined the first test sample.
uncertain = (proba[:, 1] > 0.35) & (proba[:, 1] < 0.65)
y_pred[uncertain] = 0

np.savetxt('y_pred.txt', y_pred, fmt='%d')

In [None]:
# Grid search to find optimal random-forest parameters

# Find the best `max_features` / `min_samples_leaf` combination by 5-fold CV.
rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt', n_estimators=50, oob_score=True)

param_grid = {
    #'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2'],  # the comma here was missing (SyntaxError)
    'min_samples_leaf': [1, 10, 20, 50, 100],
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)

In [None]:
# Gradient boosting classifier - as seen in the lecture
n_estimators = 200

# GB fitting
model = GradientBoostingClassifier(n_estimators=n_estimators, max_features="log2", min_samples_leaf=1)
clfGB = model.fit(X_train, y_train)

# Prediction on the training set
y_pred_train = clfGB.predict(X_train)

# Compute the score
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

# Prediction on the test set
y_pred = clfGB.predict(X_test)

proba = clfGB.predict_proba(X_test)
print(proba[:10])

# Predict 0 (abstain) when the probabilities of predicting -1 and 1 are close.
# A boolean mask covers every sample; the previous loop started at index 1 and
# therefore never examined the first test sample.
uncertain = (proba[:, 1] > 0.4) & (proba[:, 1] < 0.6)
y_pred[uncertain] = 0

np.savetxt('y_pred.txt', y_pred, fmt='%d')

In [None]:
#score 0.498352165725
# SVC fitting
model = LinearSVC(C=25)
# next: SVC with polynomial and RBF kernels

# Feature subset (disabled, kept for reference):
#X_train = X_train[:,[1, 6, 13, 14, 19, 25, 32, 34, 40, 44, 49, 52, 57, 65, 67, 69, 74, 76, 77, 79, 86, 87, 102, 105, 106, 108, 116, 121, 122, 126]
#]
#X_test = X_test[:,[1, 6, 13, 14, 19, 25, 32, 34, 40, 44, 49, 52, 57, 65, 67, 69, 74, 76, 77, 79, 86, 87, 102, 105, 106, 108, 116, 121, 122, 126]
#]

clfSVM = model.fit(X_train, y_train)

# Prediction on the training set
y_pred_train = clfSVM.predict(X_train)

# Compute the score
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

# Prediction on the test set
y_pred = clfSVM.predict(X_test)

# Signed distance to the separating hyperplane (negative on the -1 side).
dist = clfSVM.decision_function(X_test)
print(dist[:10])

# Predict 0 (abstain) when the sample lies close to the decision boundary.
# The margin is tested on |dist|: comparing the signed value (`dist <= 0.25`)
# also zeroed every confident -1 prediction, so -1 was never emitted.
# The mask also covers index 0, which the previous `range(1, ...)` loop skipped.
uncertain = np.abs(dist) <= 0.25
y_pred[uncertain] = 0

np.savetxt('y_pred.txt', y_pred, fmt='%d')

In [1]:
# Neural network with Keras.
# NOTE: this cell relies on the import cell at the top of the notebook
# (Sequential, Dense, Dropout, maxnorm) and on the data-loading cell.
from keras import optimizers

# Layer sizes
input_num_units = 128
hidden1_num_units = 40
hidden2_num_units = 40
hidden3_num_units = 40
hidden4_num_units = 40
hidden5_num_units = 40
hidden6_num_units = 40  # defined for experimentation; not used by the model below
output_num_units = 1

#epochs = 5 #score 0.265301318267
#epochs = 10 #score  0.243644067797
epochs = 10
batch_size = 20
dropout_ratio = 0.2
weight_constraint = 3

# Create the model: five identical blocks (Dense + ReLU with a max-norm
# constraint, then Dropout) followed by a single sigmoid output unit.
hidden_layer_units = [
    hidden1_num_units,
    hidden2_num_units,
    hidden3_num_units,
    hidden4_num_units,
    hidden5_num_units,
]
model = Sequential()
for layer_index, num_units in enumerate(hidden_layer_units):
    # Only the first layer needs the input dimension; later layers infer it.
    first_layer_kwargs = {'input_dim': input_num_units} if layer_index == 0 else {}
    model.add(Dense(num_units, activation='relu',
                    kernel_constraint=maxnorm(weight_constraint),
                    **first_layer_kwargs))
    model.add(Dropout(dropout_ratio))
model.add(Dense(output_num_units, activation='sigmoid'))

# Compile the model with the necessary attributes.
#sgd = SGD(lr=0.001, momentum=0.8, decay=0.0, nesterov=False)
# Alternative optimizer kept for experimentation; `compile` below uses 'adam'.
rmsprop = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# binary_crossentropy needs {0, 1} targets. Build a separate array instead of
# rewriting `y_train` in place, so other cells still see the original {-1, 1}
# labels, and map every sample (the previous loop skipped index 0).
y_train_binary = np.where(y_train == -1, 0, y_train)

print("starting fitting")
# Fit the model
model.fit(X_train, y_train_binary, epochs=epochs, batch_size=batch_size, verbose=1)
print("fitting finished")

# Calculate predictions: a single forward pass gives the sigmoid outputs, from
# which both the hard classes and the abstention mask are derived.
print("calculate predictions xtest")
proba = model.predict(X_test)
y_pred = (proba > 0.5).astype('int32')

print(y_pred)

# Map the network's class 0 back to the challenge label -1 (all samples).
y_pred[y_pred == 0] = -1

print("calculate predictions probas")

# Predict 0 (abstain) when the probabilities of predicting -1 and 1 are close.
uncertain = (proba > 0.4) & (proba < 0.6)
y_pred[uncertain] = 0

np.savetxt('y_pred2.txt', y_pred, fmt='%d')

Using TensorFlow backend.


NameError: name 'Sequential' is not defined

In [3]:
# Trying bagging
# Multi-layer perceptron
#0.175376647834
ls = 200
#ls2 = 150 #100
#ls3 = 75 #50
#ls1(250) ls2(100) ls3 (50)

time1 = time()
# One hidden layer of `ls` units; `(ls,)` makes the one-element tuple explicit.
base_mlp = MLPClassifier(solver='adam', alpha=7e-8, hidden_layer_sizes=(ls,), random_state=1, max_iter=400)
#learning_rate_init = 1e-6
#warm_start =True, learning_rate ='adaptive'

# Bag 50 copies of the MLP, each fitted on 80% of the features.
clfMLP = BaggingClassifier(base_mlp, n_estimators=50, max_features=0.8, n_jobs=-1)

clfMLP.fit(X_train, y_train)

# Prediction on the test set
y_pred = clfMLP.predict(X_test)

# Prediction on the training set
y_pred_train = clfMLP.predict(X_train)

# Compute the score
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

proba = clfMLP.predict_proba(X_test)
print(proba[:10])

# Predict 0 (abstain) unless the ensemble is almost certain of the class.
# A boolean mask covers every sample; the previous loop started at index 1 and
# therefore never examined the first test sample.
uncertain = (proba[:, 1] > 0.001) & (proba[:, 1] < 0.999)
y_pred[uncertain] = 0

time2 = time()

print("execution time:", time2 - time1)
np.savetxt('y_pred.txt', y_pred, fmt='%d')

Score sur le train : 0.0013257575757575758
[[  2.55640227e-02   9.74435977e-01]
 [  9.99999353e-01   6.47292669e-07]
 [  2.88919041e-06   9.99997111e-01]
 [  1.42858500e-07   9.99999857e-01]
 [  9.99994155e-01   5.84519747e-06]
 [  5.19577260e-05   9.99948042e-01]
 [  9.67327701e-01   3.26722991e-02]
 [  9.99999828e-01   1.72037433e-07]
 [  1.66336829e-04   9.99833663e-01]
 [  5.19927051e-07   9.99999480e-01]]
execution time: 3499.8385169506073


In [None]:
# Single multi-layer perceptron (no bagging)
ls = 200
#ls2 = 150 #100
#ls3 = 75 #50
#ls1(250) ls2(100) ls3 (50)

time1 = time()
# One hidden layer of `ls` units; `(ls,)` makes the one-element tuple explicit.
clfMLP = MLPClassifier(solver='adam', alpha=7e-8, hidden_layer_sizes=(ls,), random_state=1, max_iter=400)
#learning_rate_init = 1e-6
#warm_start =True, learning_rate ='adaptive'

clfMLP.fit(X_train, y_train)

# Prediction on the test set
y_pred = clfMLP.predict(X_test)

# Prediction on the training set
y_pred_train = clfMLP.predict(X_train)

# Compute the score
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

proba = clfMLP.predict_proba(X_test)
print(proba[:10])

# Predict 0 (abstain) unless the network is almost certain of the class.
# A boolean mask covers every sample; the previous loop started at index 1 and
# therefore never examined the first test sample.
uncertain = (proba[:, 1] > 0.001) & (proba[:, 1] < 0.999)
y_pred[uncertain] = 0

time2 = time()

print("execution time:", time2 - time1)
np.savetxt('y_pred.txt', y_pred, fmt='%d')