# 1. Data and Library Imports 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/uomcse2021/cse_DS_Intro3TRAIN.csv
/kaggle/input/uomcse2021/cse_DS_Intro3TEST.csv


In [3]:
dataset           = pd.read_csv('/kaggle/input/uomcse2021/cse_DS_Intro3TRAIN.csv')
dataset           = dataset.set_index('ID')
dataset_eval      = pd.read_csv('/kaggle/input/uomcse2021/cse_DS_Intro3TEST.csv')
dataset_eval      = dataset_eval.set_index('ID')

# 2. Preparing dataset

In [4]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# 3. Splitting Dataset

In [5]:
X_train = X
y_train = y

In [6]:
#Appling feature scaling as this involves knn
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

# 4. Training

## 4.1 Random Forest

### 4.1.1 Train Set Cross Validation Score

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
accuracies = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10)


In [8]:
print(accuracies)
print('mean: ' + str(accuracies.mean())+' std: '+ str(accuracies.std()) )

[0.924 0.932 0.874 0.828 0.892 0.884 0.892 0.902 0.892 0.832]
mean: 0.8852 std: 0.032164576788759426


## 4.2 Catboost

### 4.2.1 Install

In [9]:
!pip install catboost



### 4.2.2 Train Set Cross Validation Score

In [10]:
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
catB = CatBoostClassifier(verbose=0)
accuracies = cross_val_score(estimator = catB, X = X_train, y = y_train, cv = 10)

In [11]:
print('mean: ' + str(accuracies.mean())+' std: '+ str(accuracies.std()) )
print(accuracies)

mean: 0.8896000000000001 std: 0.03197248817342813
[0.93  0.924 0.888 0.836 0.912 0.876 0.892 0.912 0.894 0.832]


## 4.3 KNN 

### 4.3.1 Crossvalidation

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='distance')
accuracies = cross_val_score(estimator = knn, X = X_train, y = y_train, cv = 10)


In [13]:
print(accuracies)
print('mean: ' + str(accuracies.mean())+' std: '+ str(accuracies.std()) )

[0.912 0.91  0.86  0.828 0.894 0.876 0.86  0.892 0.896 0.854]
mean: 0.8782 std: 0.02585265943766717


## 4.4 VotingClassifier

### 4.4.1 CrossValidation

In [14]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

eclf1 = VotingClassifier(estimators=[('KNN', knn),  ('RandomForest',rf), ('CatBoost', catB)], voting='soft')
accuracies = cross_val_score(estimator = eclf1, X = X_train, y = y_train, cv = 10)


In [15]:
print(accuracies)
print('mean: ' + str(accuracies.mean())+' std: '+ str(accuracies.std()) )

[0.93  0.93  0.878 0.828 0.91  0.878 0.874 0.922 0.898 0.854]
mean: 0.8901999999999999 std: 0.03220496856076717
