In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load one table per species with pandas' read_table defaults.
# Later cells read a 'sequence' and a 'class' column from these frames.
human_data, chimp_data, dog_data = (
    pd.read_table(path)
    for path in (
        '../data/human_data.txt',
        '../data/chimp_data.txt',
        '../data/dog_data.txt',
    )
)

In [None]:
# A notebook cell renders only its final bare expression, so three consecutive
# .head() calls would show the dog table alone. display() renders all three.
display(human_data.head(), chimp_data.head(), dog_data.head())

In [None]:
# Convert a sequence string into overlapping k-mer words; default size = 6 (hexamer words).
def getKmers(sequence, size = 6):
    """Split a sequence string into its overlapping, lower-cased k-mer "words".

    A window of ``size`` characters slides over ``sequence`` one position at a
    time, so a sequence of length ``n`` yields ``n - size + 1`` words.

    Parameters
    ----------
    sequence : str
        The sequence to split, e.g. ``"ATGCATGCA"``.
    size : int, default 6
        Length of every k-mer (6 gives hexamers). Must be at least 1.

    Returns
    -------
    list of str
        The k-mers in positional order. Empty when ``sequence`` is shorter
        than ``size``.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1. Such a window would otherwise silently
        produce empty or wrapped-around slices instead of k-mers.
    """
    if size < 1:
        raise ValueError("size must be a positive integer, got %r" % (size,))
    # Lower-case once up front instead of once per window.
    lowered = sequence.lower()
    return [lowered[start:start + size] for start in range(len(lowered) - size + 1)]

In [None]:
def _sequences_to_words(frame):
    """Return ``frame`` with its 'sequence' column replaced by a 'words' column.

    'words' holds, for every row, the list produced by ``getKmers`` (default
    k-mer size) for that row's sequence. A frame that was already converted
    (no 'sequence' column, 'words' present) is returned unchanged, so
    re-running this cell is harmless instead of raising ``KeyError``.
    """
    if 'sequence' not in frame.columns and 'words' in frame.columns:
        return frame
    # Series.map calls getKmers once per sequence value; a row-wise
    # DataFrame.apply(axis=1) would build a full row object for each record first.
    return frame.assign(words=frame['sequence'].map(getKmers)).drop(columns='sequence')


human_data = _sequences_to_words(human_data)
chimp_data = _sequences_to_words(chimp_data)
dog_data = _sequences_to_words(dog_data)

In [None]:
# A notebook cell renders only its final bare expression, so three consecutive
# .head() calls would show the dog table alone. display() renders all three.
display(human_data.head(), chimp_data.head(), dog_data.head())

In [None]:
# Turn each row's list of k-mer words into one space-separated "sentence".
human_texts = [' '.join(kmers) for kmers in human_data['words']]
# Labels are taken from the first column by position.
# NOTE(review): presumably this is the 'class' column - confirm against the data files.
y_data = human_data.iloc[:, 0].values
# Spot-check one joined sentence.
print(human_texts[2])

In [None]:
# Turn each row's list of k-mer words into one space-separated "sentence".
chimp_texts = [' '.join(kmers) for kmers in chimp_data['words']]
# Labels are taken from the first column by position.
# NOTE(review): presumably this is the 'class' column - confirm against the data files.
y_chimp = chimp_data.iloc[:, 0].values

In [None]:
# Turn each row's list of k-mer words into one space-separated "sentence".
dog_texts = [' '.join(kmers) for kmers in dog_data['words']]
# Labels are taken from the first column by position.
# NOTE(review): presumably this is the 'class' column - confirm against the data files.
y_dog = dog_data.iloc[:, 0].values

In [None]:
# Bag-of-words encoding of the k-mer sentences. With ngram_range=(4, 4) every
# feature is a run of exactly four consecutive words.
cv = CountVectorizer(ngram_range=(4, 4))

# The vocabulary is learned from the human sentences only. The chimp and dog
# sentences are then encoded against that same fitted vocabulary.
X_human = cv.fit_transform(human_texts)
X_chimp = cv.transform(chimp_texts)
X_dog = cv.transform(dog_texts)

# Report (n_samples, n_features) for each encoded dataset.
for encoded in (X_human, X_chimp, X_dog):
    print(encoded.shape)

In [None]:
# Class balance of the human dataset: number of rows per 'class' label,
# with the bars ordered by label.
fig, ax = plt.subplots()
human_data['class'].value_counts().sort_index().plot.bar(ax=ax)
# Title and axis labels let the figure stand on its own; the trailing ';'
# keeps the return value of ax.set() out of the cell output.
ax.set(title='Human dataset: samples per class', xlabel='class', ylabel='count');

In [None]:
# Hold out 20% of the encoded human samples for evaluation; the fixed
# random_state makes the split reproducible.
split = train_test_split(X_human, y_data, test_size = 0.20, random_state = 42)
X_train, X_test, y_train, y_test = split

# Report the shape of the training and test feature matrices.
for part in (X_train, X_test):
    print(part.shape)

In [None]:
# The alpha value comes from the previous GridSearch.
# fit() returns the estimator itself, so construction and training can be chained.
classifier = MultinomialNB(alpha = 0.1).fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is still displayed.
classifier

In [None]:
# Predict a label for every sample of the held-out test split with the fitted classifier.
y_pred = classifier.predict(X_test)

In [None]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
# Both arrays are wrapped in Series so the table axes are named.
print("Confusion matrix\n")
confusion = pd.crosstab(
    index=pd.Series(y_test, name='Actual'),
    columns=pd.Series(y_pred, name='Predicted'),
)
print(confusion)
def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predicted : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are
        computed with ``average='weighted'``.
    """
    averaging = {'average': 'weighted'}
    return (
        accuracy_score(y_test, y_predicted),
        precision_score(y_test, y_predicted, **averaging),
        recall_score(y_test, y_predicted, **averaging),
        f1_score(y_test, y_predicted, **averaging),
    )
# Score the test-set predictions and print each metric rounded to three decimals.
scores = get_metrics(y_test, y_pred)
accuracy, precision, recall, f1 = scores
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))