insert 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn import model_selection


create a data frame

In [2]:
df = pd.read_csv("dna.csv")
classes = df.loc[:, 'class']

create X and Y datasets for training


In [3]:
X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

split the data into training and testing datasets

In [4]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=1)

In [5]:
scoring = 'accuracy'

For this task we choose 2 models Nearest Neighbors and Decision Tree.

In [8]:
names = ["Nearest Neighbors",
         "Decision Tree", "Random Forest"]

models = [

    KNeighborsClassifier(),
    DecisionTreeClassifier(),
]

models = zip(names, models)
result = []

Fiting the models and evaluating

In [9]:
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, scoring=scoring)
    result.append(cv_results)
    print(name + ': ' + str(cv_results.mean()) + ' ' + str(cv_results.std()))
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print('Validation Accuracy', round(accuracy_score(y_test, predict), 2))
    print("Confusion Matrix/n", classification_report(y_test, predict))

Nearest Neighbors: 0.7789891494083490.014315324981288886
Validation Accuracy 0.77
Confusion Matrix/n               precision    recall  f1-score   support

           1       0.69      0.81      0.74       198
           2       0.65      0.91      0.76       188
           3       0.94      0.68      0.79       411

    accuracy                           0.77       797
   macro avg       0.76      0.80      0.76       797
weighted avg       0.81      0.77      0.77       797

Decision Tree: 0.90037981456628320.01109152288720721
Validation Accuracy 0.9
Confusion Matrix/n               precision    recall  f1-score   support

           1       0.87      0.89      0.88       198
           2       0.83      0.90      0.86       188
           3       0.96      0.91      0.94       411

    accuracy                           0.90       797
   macro avg       0.89      0.90      0.89       797
weighted avg       0.91      0.90      0.91       797



let's choose parameters for KNN classifier (for my machine it was not taking more than a minute)

In [14]:
parameters = {
     "n_neighbors": range(1, 50),
     "weights": ["uniform", "distance"],
}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)


GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': range(1, 50),
                         'weights': ['uniform', 'distance']})

In [13]:
print("KNN parameters", gridsearch.best_params_)


KNN parameters {'n_neighbors': 10, 'weights': 'distance'}


Substitute parameters to the model and measure the performance.

In [19]:
KNN = KNeighborsClassifier(n_neighbors=14, weights='distance')
KNN_cv_results = model_selection.cross_val_score(KNN, X_train, y_train, scoring=scoring)
print("KNeighborsClassifier" + ': ' + str(KNN_cv_results.mean()) + ' '+ str(KNN_cv_results.std()))
KNN.fit(X_train, y_train)
KNN_predict = KNN.predict(X_test)
print('Validation Accuracy', round(accuracy_score(y_test, KNN_predict), 2))
print("Confusion Matrix/n", classification_report(y_test, KNN_predict))

KNeighborsClassifier: 0.8518161802759577 0.009378235793339263
Validation Accuracy 0.84
Confusion Matrix/n               precision    recall  f1-score   support

           1       0.80      0.84      0.82       198
           2       0.73      0.96      0.83       188
           3       0.96      0.79      0.87       411

    accuracy                           0.84       797
   macro avg       0.83      0.87      0.84       797
weighted avg       0.86      0.84      0.85       797



It could be summarized the classifier performed more accurate after tuning parameters. 

Let's choose parameters for DT classifier (for some reason it was not compiling with min_samples_split and min_samples_leaf)

In [20]:
DT_parameters = {"criterion": ['gini', 'entropy'],
                 "max_depth": range(1, 10),
                 # "min_samples_split": range(1, 10),
                 # "min_samples_leaf": range(1, 5),
}
DT_grid = GridSearchCV(DecisionTreeClassifier(), DT_parameters)
DT_grid.fit(X_train, y_train)
print(DT_grid.best_params_)


{'criterion': 'entropy', 'max_depth': 9}


Substitute parameters to the model and measure the performance. 

In [21]:
DT = DecisionTreeClassifier(max_depth=7, criterion='entropy')
DT_cv_results = model_selection.cross_val_score(DT, X_train, y_train, scoring=scoring)
print("DecisionTreeClassifier" + ': ' + str(cv_results.mean()) + ' ' + str(cv_results.std()))
DT.fit(X_train, y_train)
DT_predict = DT.predict(X_test)
print('Validation Accuracy', round(accuracy_score(y_test, DT_predict), 2))
print("Confusion Matrix/n", classification_report(y_test, DT_predict))

DecisionTreeClassifier: 0.9003798145662832 0.01109152288720721
Validation Accuracy 0.93
Confusion Matrix/n               precision    recall  f1-score   support

           1       0.90      0.92      0.91       198
           2       0.87      0.95      0.91       188
           3       0.98      0.92      0.95       411

    accuracy                           0.93       797
   macro avg       0.91      0.93      0.92       797
weighted avg       0.93      0.93      0.93       797



We can conclude that DecisionTreeClassifier is more accurate for this task. However, tuning of parameters did not affect the accuracy much but, it did increase precision. 