### IMPORTATIONS

In [42]:
from time import time
import os
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from itertools import product

from kmerlib.running_window import *
from kmerlib.spectrum import *
from kmerlib.tools import *
from utils.term_colors import *

# Plot configuration: seaborn default theme, with element sizes scaled for the "talk" context.
sns.set(context="talk")

In [2]:
# Select the interactive "notebook" matplotlib backend for every figure drawn below.
%matplotlib notebook
# Alternative backend ("tk"), left disabled.
#%matplotlib tk

In [3]:
# Colour palette for the plots, as hexadecimal RGB codes.
blue, green, orange, red = "#089FB2", "#C0C94F", "#FC933E", "#F23957"

### CHARGEMENT DES DONNÉES

In [11]:
# Read the two halves of the gene dataset and stack them into a single frame
# whose rows are renumbered 0..n-1.
csv_parts = (
    "data/genes_dataset/10orgs_500genes_01.csv",
    "data/genes_dataset/10orgs_500genes_02.csv",
)
df = pd.concat([pd.read_csv(part) for part in csv_parts], ignore_index=True)

In [12]:
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Organism,Clade,AA,AT,AG,AC,TA,TT,TG,TC,...,CCCTG,CCCTC,CCCGA,CCCGT,CCCGG,CCCGC,CCCCA,CCCCT,CCCCG,CCCCC
0,Pyrococcus_horikoshii,archaea,0.159445,0.086655,0.103986,0.031196,0.081456,0.062392,0.041594,0.046794,...,0.000000,0.000000,0.000000,0.001742,0.000000,0.000000,0.001742,0.000000,0.000000,0.000000
1,Pyrococcus_horikoshii,archaea,0.060209,0.089005,0.028796,0.044503,0.060209,0.104712,0.052356,0.133508,...,0.002639,0.002639,0.000000,0.000000,0.002639,0.000000,0.000000,0.000000,0.002639,0.000000
2,Pyrococcus_horikoshii,archaea,0.100102,0.070480,0.096016,0.060266,0.072523,0.057201,0.049030,0.048008,...,0.000000,0.000000,0.000000,0.000000,0.001025,0.000000,0.000000,0.000000,0.000000,0.000000
3,Pyrococcus_horikoshii,archaea,0.058577,0.080335,0.048536,0.041841,0.083682,0.123013,0.055230,0.100418,...,0.000839,0.000839,0.000000,0.000000,0.000000,0.000000,0.000839,0.001678,0.000000,0.000839
4,Pyrococcus_horikoshii,archaea,0.100763,0.087023,0.077863,0.029008,0.080916,0.103817,0.054962,0.070229,...,0.001534,0.000000,0.001534,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Pyrococcus_horikoshii,archaea,0.079787,0.100000,0.047872,0.035106,0.079787,0.124468,0.069149,0.062766,...,0.001067,0.000000,0.000000,0.000000,0.000000,0.000000,0.002134,0.000000,0.000000,0.000000
6,Pyrococcus_horikoshii,archaea,0.079780,0.042641,0.129298,0.042641,0.053645,0.050894,0.048143,0.053645,...,0.000000,0.001381,0.001381,0.000000,0.002762,0.000000,0.001381,0.000000,0.001381,0.000000
7,Pyrococcus_horikoshii,archaea,0.118608,0.080256,0.107955,0.046165,0.083807,0.058949,0.049006,0.037642,...,0.000000,0.000712,0.000712,0.000712,0.002135,0.000712,0.000000,0.000712,0.000712,0.000712
8,Pyrococcus_horikoshii,archaea,0.121780,0.074941,0.114754,0.039813,0.070258,0.074941,0.058548,0.028103,...,0.002358,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,Pyrococcus_horikoshii,archaea,0.123218,0.082485,0.083503,0.041752,0.069246,0.086558,0.061100,0.064155,...,0.000000,0.001021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# Distinct organism names found in the "Organism" column, and how many there are.
org_list = {organism for organism in df["Organism"]}
print("Nombre d'organismes :", len(org_list))

Nombre d'organismes : 20


In [20]:
# k-mer sizes studied in this notebook.
Ks = (2, 3, 4, 5)
# For each size k, the list of every word of length k over the alphabet "ATCG".
words = {}
for k in Ks:
    # product() yields tuples of letters; glue each tuple into a single string.
    words[k] = list(map("".join, product("ATCG", repeat=k)))

### RÉGRESSION LOGISTIQUE

In [43]:
def log_reg(df, Ks, C=1.0, solver="newton-cg", pca_components=None, random_state=None):
    """Score a one-vs-rest logistic regression on k-mer features with 10-fold CV.

    The feature matrix is built from the columns of ``df`` named after the
    words of every size in ``Ks`` (looked up in the notebook-level ``words``
    dict); the labels are the values of the "Organism" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "Organism" column and one column per word in
        ``words[k]`` for each ``k`` in ``Ks``.
    Ks : iterable of int
        k-mer sizes whose word columns are concatenated as features.
    C : float, default 1.0
        Forwarded to ``LogisticRegression`` (inverse regularisation strength).
    solver : str, default "newton-cg"
        Forwarded to ``LogisticRegression``.
    pca_components : int or None, default None
        When not None, features are projected onto that many principal
        components. The PCA is fitted on the training fold only and then
        applied to the test fold, so no test data leaks into the projection.
    random_state : int or None, default None
        Seed for the shuffled K-fold split. ``None`` keeps the historical
        behaviour (a different split on every call); pass an int for
        reproducible scores.

    Returns
    -------
    tuple of (float, float)
        Mean test accuracy over the 10 folds, and the elapsed wall-clock
        time in seconds.
    """
    t_start = time()

    feature_columns = [word for k in Ks for word in words[k]]
    X = df[feature_columns].values
    # Plain array rather than a Series: the fold indices are positions, and
    # indexing a Series with them would be a label lookup that is only correct
    # when the frame carries a pristine 0..n-1 index.
    y = df["Organism"].values

    scores = []
    kf = KFold(10, shuffle=True, random_state=random_state)
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if pca_components is not None:
            # Learn the projection from the training rows only.
            pca = PCA(n_components=pca_components)
            pca.fit(X_train)
            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases (OneVsRestClassifier is the documented replacement); kept
        # as-is so the fitted model does not change.
        lg = LogisticRegression(solver=solver, C=C, multi_class="ovr", n_jobs=3)
        lg.fit(X_train, y_train)

        scores.append(lg.score(X_test, y_test))

    mean_score = np.mean(scores)
    print("For K={}, mean accuracy = {:.3f}".format(Ks, mean_score))
    t_stop = time()
    return mean_score, t_stop - t_start

In [None]:
# Accumulators filled by the runs below: one for accuracies, one for run times.
scores, t = {}, {}

In [32]:
# 2-mers only: record the mean accuracy and the run time returned by log_reg.
scores[2], t[2] = log_reg(df, Ks=(2,))

For K=(2,), mean accuracy = 0.580


In [40]:
# 3-mers only: record the mean accuracy and the run time instead of discarding them.
scores[3], t[3] = log_reg(df, (3,))

C = 1e-06 | Solver = newton-cg
For K=(3,), mean accuracy = 0.641
C = 1e-06 | Solver = lbfgs
For K=(3,), mean accuracy = 0.638
C = 1e-06 | Solver = sag
For K=(3,), mean accuracy = 0.643
C = 0.0001 | Solver = newton-cg
For K=(3,), mean accuracy = 0.646
C = 0.0001 | Solver = lbfgs
For K=(3,), mean accuracy = 0.642
C = 0.0001 | Solver = sag
For K=(3,), mean accuracy = 0.639
C = 0.001 | Solver = newton-cg
For K=(3,), mean accuracy = 0.643
C = 0.001 | Solver = lbfgs
For K=(3,), mean accuracy = 0.646
C = 0.001 | Solver = sag
For K=(3,), mean accuracy = 0.643
C = 0.01 | Solver = newton-cg
For K=(3,), mean accuracy = 0.643
C = 0.01 | Solver = lbfgs
For K=(3,), mean accuracy = 0.644
C = 0.01 | Solver = sag
For K=(3,), mean accuracy = 0.641
C = 0.1 | Solver = newton-cg
For K=(3,), mean accuracy = 0.642
C = 0.1 | Solver = lbfgs
For K=(3,), mean accuracy = 0.641
C = 0.1 | Solver = sag
For K=(3,), mean accuracy = 0.645
C = 1 | Solver = newton-cg
For K=(3,), mean accuracy = 0.644
C = 1 | Solver = lbf

In [34]:
# 4-mers only: record the mean accuracy and the run time instead of discarding them.
scores[4], t[4] = log_reg(df, (4,))

For K=(4,), mean accuracy = 0.602


In [35]:
# 5-mers only: record the mean accuracy and the run time instead of discarding them.
scores[5], t[5] = log_reg(df, (5,))

For K=(5,), mean accuracy = 0.543


In [36]:
# All word sizes together: record the result under the tuple of sizes used.
scores[(2, 3, 4, 5)], t[(2, 3, 4, 5)] = log_reg(df, (2, 3, 4, 5))

For K=(2, 3, 4, 5), mean accuracy = 0.748


In [44]:
# All word sizes, with the features reduced to 2 principal components;
# the returned (mean accuracy, elapsed seconds) pair is displayed as the cell output.
log_reg(df, Ks=(2, 3, 4, 5), pca_components=2)

(10000, 1360)
(10000, 2)
For K=(2, 3, 4, 5), mean accuracy = 0.176


(0.1756, 4.652177810668945)