### Machine Learning for Codon Usage Classification

In [18]:
import pandas as pd
import numpy as np 

# Classification Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingRegressor

# Metric tools, and utility methods
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report,accuracy_score,recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV

### Preparing the Data

In [11]:
df = pd.read_csv('../raw_data/codon_usage.csv')

df['Kingdom'] = df['Kingdom'].astype('category')
df['Kingdom_Code'] = df['Kingdom'].cat.codes

codon_df = df.loc[:,df.columns[6:]]

# There is a string data on position 5063, so we need to drop it:
codon_df = codon_df.drop([5063])
codon_df = codon_df.apply(pd.to_numeric)
print(codon_df.head())

       UUC      UUA      UUG      CUU      CUC      CUA      CUG      AUU  \
0  0.01203  0.00050  0.00351  0.01203  0.03208  0.00100  0.04010  0.00551   
1  0.01357  0.00068  0.00678  0.00407  0.02849  0.00204  0.04410  0.01153   
2  0.02180  0.01357  0.01543  0.00782  0.01111  0.01028  0.01193  0.02283   
3  0.02245  0.01619  0.00992  0.01567  0.01358  0.00940  0.01723  0.02402   
4  0.01371  0.00767  0.03679  0.01380  0.00548  0.00473  0.02076  0.02716   

       AUC      AUA  ...      AGA      AGG      GAU      GAC      GAA  \
0  0.02005  0.00752  ...  0.01303  0.03559  0.01003  0.04612  0.01203   
1  0.02510  0.00882  ...  0.01696  0.03596  0.01221  0.04545  0.01560   
2  0.01604  0.01316  ...  0.01974  0.02489  0.03126  0.02036  0.02242   
3  0.02245  0.02507  ...  0.01410  0.01671  0.03760  0.01932  0.03029   
4  0.00867  0.01310  ...  0.01494  0.01734  0.04148  0.02483  0.03359   

       GAG      UAA      UAG      UGA  Kingdom_Code  
0  0.04361  0.00251  0.00050  0.00000       

  df = pd.read_csv('../raw_data/codon_usage.csv')


### Predicting Kingdom Category Based on Codon Frequency

In [12]:
X = codon_df.iloc[:,:-1]
y = codon_df['Kingdom_Code']

## Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, stratify = y)

### K Nearest Neighbor Model

In [14]:
knn = KNeighborsClassifier(n_neighbors=6)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

display(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))
print(knn.score(X_test, y_test))

array([[ 21,   3,   0,   0,   0,   0,   0,   0,   0,   1,   0],
       [  3, 564,   1,   0,   4,   0,   7,   0,   0,   5,   0],
       [  0,  12, 203,   0,   2,   0,  30,   0,   0,   9,  13],
       [  0,   0,   0, 101,   0,   0,   0,   4,   3,   0,   6],
       [  1,   8,   0,   0,  32,   0,   0,   0,   0,   3,   0],
       [  0,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   7,  12,   0,   1,   0, 479,   0,   0,   5,   1],
       [  0,   0,   0,  13,   0,   0,   0,  21,   2,   0,   0],
       [  0,   0,   0,  19,   0,   0,   0,   0,  24,   0,   0],
       [  0,  10,   4,   0,   0,   0,  21,   0,   0, 532,   0],
       [  0,   1,   5,   7,   0,   0,   1,   0,   2,   5, 394]])

              precision    recall  f1-score   support

           0       0.84      0.84      0.84        25
           1       0.93      0.97      0.95       584
           2       0.90      0.75      0.82       269
           3       0.72      0.89      0.80       114
           4       0.82      0.73      0.77        44
           5       0.00      0.00      0.00         4
           6       0.89      0.95      0.92       505
           7       0.84      0.58      0.69        36
           8       0.77      0.56      0.65        43
           9       0.95      0.94      0.94       567
          10       0.95      0.95      0.95       415

    accuracy                           0.91      2606
   macro avg       0.78      0.74      0.76      2606
weighted avg       0.91      0.91      0.91      2606

0.9098234842670759


In [21]:
# Test Evaluation
test_df = pd.DataFrame([y_test.tolist(), y_pred.tolist()])
test_df = test_df.transpose()

test_df = test_df.rename(columns = { 0: "y_test", 1: "y_predict"})
test_df['Check'] = np.where(test_df['y_test'] == test_df['y_predict'], 1, 0)


total_test_case = test_df.Check.count()
correct_test_predict = test_df.value_counts('Check')[1]
incorrect_test_predict = test_df.value_counts('Check')[0]

print("Number of All Test Cases: {}".format(total_test_case))

print("Number of Correct Prediction: {}".format(correct_test_predict))

print("Number of Incorrect Prediction: {}".format(incorrect_test_predict))

NameError: name 'test' is not defined