<a href="https://colab.research.google.com/github/aminejarraya/CRDIO-challenge-of-aicrowd/blob/main/CRDIO_Challenge_on_AIcrowd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we train models for the CRDIO Challenge on [AIcrowd](https://www.aicrowd.com/challenges/ai-blitz-4/problems/crdio).

# Import packages

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib
from sklearn import svm

# Download Data

In [None]:
#The first step is to download out train test data. We will be training a model on the train data and make predictions on test data. We submit our predictions
!rm -rf data
!mkdir data 
!wget https://datasets.aicrowd.com/default/aicrowd-practice-challenges/public/crdio/v0.1/test.csv
!wget https://datasets.aicrowd.com/default/aicrowd-practice-challenges/public/crdio/v0.1/train.csv
!mv test.csv data/test.csv
!mv train.csv data/train.csv

--2020-11-04 17:25:08--  https://datasets.aicrowd.com/default/aicrowd-practice-challenges/public/crdio/v0.1/test.csv
Resolving datasets.aicrowd.com (datasets.aicrowd.com)... 35.189.208.115
Connecting to datasets.aicrowd.com (datasets.aicrowd.com)|35.189.208.115|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://s3.us-west-002.backblazeb2.com/aicrowd-practice-challenges/public/crdio/v0.1/test.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=002ae2491b744be0000000002%2F20201104%2Fus-west-002%2Fs3%2Faws4_request&X-Amz-Date=20201104T172509Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=f711348c0f6e4854fbac47f610d75225106a4262e58e184e47714c0aa52b8171 [following]
--2020-11-04 17:25:09--  https://s3.us-west-002.backblazeb2.com/aicrowd-practice-challenges/public/crdio/v0.1/test.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=002ae2491b744be0000000002%2F20201104%2Fus-west-002%2Fs3%2Faws4_request&X-Amz-Date=20201104T172509Z&X-Amz-Expir

# Load Data

In [None]:
#Load the training data with pandas.
#The path is relative so it points at the "data" folder created by the download
#cell instead of assuming the Colab-specific "/content" working directory.
data = pd.read_csv('data/train.csv')

In [None]:
#Shuffle the rows of the data.
#A fixed seed is used so the row order is the same on every fresh run; without it
#the later train/validation split (which is itself seeded) would still change from run to run.
rng = np.random.RandomState(42)
data = data.iloc[rng.permutation(len(data))]

In [None]:
#Preview the first five rows of the data
data.head(n=5)

Unnamed: 0,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,135.0,135.0,8.0,2.0,7.0,60.0,1.0,0.0,5.2,0.0,0.0,0.0,0.0,40.0,123.0,163.0,0.0,0.0,148.0,142.0,145.0,7.0,0.0,1
1,133.0,133.0,0.0,4.0,6.0,60.0,2.5,0.0,0.0,4.0,0.0,0.0,0.0,65.0,83.0,148.0,2.0,0.0,131.0,122.0,130.0,14.0,1.0,1
2,135.0,135.0,10.0,2.0,4.0,60.0,0.9,0.0,0.5,0.0,0.0,0.0,0.0,39.0,124.0,163.0,1.0,0.0,148.0,143.0,146.0,6.0,0.0,1
3,120.0,120.0,3.0,1.0,2.0,56.0,0.5,0.0,8.8,0.0,0.0,0.0,0.0,35.0,106.0,141.0,1.0,0.0,125.0,125.0,126.0,3.0,0.0,1
4,132.0,132.0,6.0,0.0,6.0,27.0,1.7,0.0,11.3,0.0,0.0,0.0,0.0,122.0,54.0,176.0,8.0,0.0,150.0,146.0,149.0,18.0,1.0,1


# Split Data into Train and Validation

In [None]:
#Separate the inputs from the target by position using the "iloc" indexer:
#the first 23 columns are the input features (X), the column at position 23 is the output (Y).
features = data.iloc[:, :23]
labels = data.iloc[:, 23]
X = features.values
Y = labels.values
#Show the shape of X (input) and of Y (output), one per line
print(X.shape, Y.shape, sep='\n')

(1700, 23)
(1700,)


In [None]:
#Normalize X in order to obtain better results.
#Per the scikit-learn documentation, Normalizer rescales each sample (row)
#individually to unit norm; it does not scale the feature columns.
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
X = scaler.fit_transform(X)

In [None]:
#Split the given training data into a training part (80%) and a validation part (20%).
#The split is seeded (random_state=42) and stratified on the labels Y.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)
X_train, X_test, y_train, y_test = split_parts

# Training Phase

In [None]:
#The data is prepared, so a model can now be trained.
#There are many classifiers to choose from, e.g. Logistic Regression, SVM,
#Random Forests and Decision Trees.
#This first approach uses an SVM with C=10,000,000 and gamma=0.3.
svc_settings = {'C': 10_000_000, 'gamma': 0.3}
model = svm.SVC(**svc_settings)

In [None]:
#Train the selected classifier by fitting it on the training part of the split
model.fit(X=X_train, y=y_train)

SVC(C=10000000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.3, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Testing Phase

In [None]:
#Use the trained model to predict labels for the validation set created above,
#i.e. data that was not passed to fit, so the model can be evaluated on it.
pred = model.predict(X=X_test)

# Evaluate the Performance

In [None]:
#We have used basic metric to quantify the performance of our model.
from sklearn import metrics

In [None]:
#Report the accuracy of the SVM on the validation set
#(the recorded run of this notebook reached about 91%).
#accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
#labels are passed first and the predictions second.
validation_accuracy = metrics.accuracy_score(y_test, pred)
print('The accuracy of the SVM is:', validation_accuracy)

The accuracy of the SVM is: 0.9147058823529411


# Prepare submission file

In [None]:
#Load the unseen test data.
#The path is relative so it points at the "data" folder created by the download
#cell instead of assuming the Colab-specific "/content" working directory.
test = pd.read_csv("data/test.csv")

In [None]:
#Normalize the test inputs with a Normalizer, the same transformer that was applied to X
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
test = scaler.fit_transform(test)

In [None]:
#Make predictions for the test set with the trained model.
#The test inputs live in the variable `test`; the name `t` that was passed here
#before is never defined anywhere in the notebook and raised a NameError.
submission = model.predict(test)

In [None]:
#Display the predicted labels for the test set
submission

array([1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 2, 2,
       1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1,
       2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2,
       1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
#Finally, write the submission file needed for the competition:
#the predictions go into a single column named "NSP", without the index.
submission = pd.DataFrame(data=submission)
submission.to_csv(path_or_buf='submission.csv', index=False, header=['NSP'])

# 2nd Approach

GridSearchCV with KNeighborsClassifier

In [None]:
#GridSearchCV is part of sklearn's model_selection package.
#It loops through the predefined hyperparameter combinations and fits the
#estimator (model) on the training set for each of them.
from sklearn.model_selection import GridSearchCV
#KNeighborsClassifier was used below without being imported anywhere in the
#notebook, which raised a NameError on a fresh kernel.
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=8)
#Hyperparameter grid: 2 * 8 * 4 * 11 = 704 candidate combinations
parameters = {
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3, 4, 5, 6, 7, 8],
    'algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto'],
    'leaf_size': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120],
}
grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)
#After fitting, display the best score found by the search
print("***********************************")
grid_search.best_score_

Fitting 5 folds for each of 704 candidates, totalling 3520 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


***********************************


[Parallel(n_jobs=1)]: Done 3520 out of 3520 | elapsed: 13.8min finished


0.9095588235294118

In [None]:
#We can also know the best parameters that belong to that score using the function below.
grid_search.best_params_

{'algorithm': 'ball_tree', 'leaf_size': 10, 'p': 1, 'weights': 'distance'}

In [None]:
#Normalize the test inputs before predicting with the grid-searched model.
#NOTE(review): `test` already went through a Normalizer in the first approach,
#so this repeat looks redundant - confirm whether it is still needed.
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
test = scaler.fit_transform(test)

In [None]:
#We will apply the same process that we did before in order to prepare the needed submission file
#Predict the test labels with the fitted grid search, as the first step of
#preparing the submission file for this second approach.
preds = grid_search.predict(X=test)

In [None]:
#Save the second approach's predictions as a one-column ("NSP") CSV without the index
submission = pd.DataFrame(data=preds)
submission.to_csv(path_or_buf='data/submission.csv', index=False, header=['NSP'])

Finally, download the submission file that was just written to your Colab content folder and submit it.