# Forward selection timing estimation

## Load libraries

In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector

## Load dataset

In [110]:
# Read the microbiota composition table and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV — verify against the file).
combined_df = (
    pd.read_csv("./data/Microbiota_composition.csv")
    .drop(columns=['Unnamed: 0'])
)

# Preview the first rows
combined_df.head()

Unnamed: 0,condition,Streptococcus_anginosus___ref_mOTU_v2_0004__,Enterobacteriaceae_sp.___ref_mOTU_v2_0036__,Citrobacter_sp.___ref_mOTU_v2_0076__,Klebsiella_michiganensis/oxytoca___ref_mOTU_v2_0079__,Enterococcus_faecalis___ref_mOTU_v2_0116__,Lactobacillus_salivarius___ref_mOTU_v2_0125__,Dielma_fastidiosa___ref_mOTU_v2_0138__,Streptococcus_constellatus/intermedius___ref_mOTU_v2_0143__,Streptococcus_parasanguinis___ref_mOTU_v2_0144__,...,unknown_Porphyromonas___meta_mOTU_v2_7777__,unknown_Clostridiales___meta_mOTU_v2_7778__,unknown_Clostridiales___meta_mOTU_v2_7781__,unknown_Clostridiales___meta_mOTU_v2_7782__,unknown_Clostridiales___meta_mOTU_v2_7784__,Clostridium_sp._CAG__230___meta_mOTU_v2_7788__,Clostridium_sp._CAG__1193___meta_mOTU_v2_7789__,unknown_Erysipelotrichaceae___meta_mOTU_v2_7790__,unknown_Clostridiales___meta_mOTU_v2_7795__,unknown_Clostridiales___meta_mOTU_v2_7800__
0,control,0.0,0.0,0.0,0.0,0.0,0.0,8.3e-05,0.0,0.000249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.3e-05,8.3e-05
1,control,0.000591,6.6e-05,0.0,0.0,0.0,0.0,0.0,6.6e-05,0.002102,...,0.0,0.0,0.0,0.000788,0.0,0.0,0.0,0.0,0.0,0.0
2,control,0.00084,6.5e-05,0.002454,0.0,6.5e-05,0.013111,0.0,6.5e-05,0.012013,...,0.0,0.0,0.000194,6.5e-05,0.0,0.00084,0.0,0.0,0.001808,0.0
3,control,0.0,7.1e-05,0.0,0.0,0.0,0.0,0.000213,7.1e-05,0.000142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,control,8.5e-05,0.019236,0.001111,0.000256,8.5e-05,8.5e-05,0.0,0.0,0.000513,...,0.0,0.000171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
# Split the table into the label column (Y) and the feature matrix (X).
print("Separating labels from features...")
target_variable = "condition"

# Y is the single label column; X is every remaining column.
Y = combined_df[target_variable]
X = combined_df.drop(columns=[target_variable])

print("...Done.")
print()

# Quick look at both pieces: Y as plain text, X as the cell's rich output.
print('Y : ')
print(Y.head())
print()
print('X :')
X.head()

Separating labels from features...
...Done.

Y : 
0    control
1    control
2    control
3    control
4    control
Name: condition, dtype: object

X :


Unnamed: 0,Streptococcus_anginosus___ref_mOTU_v2_0004__,Enterobacteriaceae_sp.___ref_mOTU_v2_0036__,Citrobacter_sp.___ref_mOTU_v2_0076__,Klebsiella_michiganensis/oxytoca___ref_mOTU_v2_0079__,Enterococcus_faecalis___ref_mOTU_v2_0116__,Lactobacillus_salivarius___ref_mOTU_v2_0125__,Dielma_fastidiosa___ref_mOTU_v2_0138__,Streptococcus_constellatus/intermedius___ref_mOTU_v2_0143__,Streptococcus_parasanguinis___ref_mOTU_v2_0144__,Streptococcus_sp._HSISM1___ref_mOTU_v2_0145__,...,unknown_Porphyromonas___meta_mOTU_v2_7777__,unknown_Clostridiales___meta_mOTU_v2_7778__,unknown_Clostridiales___meta_mOTU_v2_7781__,unknown_Clostridiales___meta_mOTU_v2_7782__,unknown_Clostridiales___meta_mOTU_v2_7784__,Clostridium_sp._CAG__230___meta_mOTU_v2_7788__,Clostridium_sp._CAG__1193___meta_mOTU_v2_7789__,unknown_Erysipelotrichaceae___meta_mOTU_v2_7790__,unknown_Clostridiales___meta_mOTU_v2_7795__,unknown_Clostridiales___meta_mOTU_v2_7800__
0,0.0,0.0,0.0,0.0,0.0,0.0,8.3e-05,0.0,0.000249,0.000249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.3e-05,8.3e-05
1,0.000591,6.6e-05,0.0,0.0,0.0,0.0,0.0,6.6e-05,0.002102,0.000197,...,0.0,0.0,0.0,0.000788,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00084,6.5e-05,0.002454,0.0,6.5e-05,0.013111,0.0,6.5e-05,0.012013,0.002777,...,0.0,0.0,0.000194,6.5e-05,0.0,0.00084,0.0,0.0,0.001808,0.0
3,0.0,7.1e-05,0.0,0.0,0.0,0.0,0.000213,7.1e-05,0.000142,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.5e-05,0.019236,0.001111,0.000256,8.5e-05,8.5e-05,0.0,0.0,0.000513,0.0,...,0.0,0.000171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
# Encode the labels as 0/1 and divide the dataset into a train set and a test set.
print("Dividing into train and test sets...")

# Binary target: 1 for "CRC" samples, 0 for every other condition.
# The encoding is derived from the raw `combined_df` column rather than from Y
# itself, so re-running this cell gives the same labels every time. (Encoding
# Y from Y would compare already-encoded integers to "CRC" on a second run and
# silently turn every label into 0.)
Y = (combined_df[target_variable] == "CRC").astype("int64")

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [113]:
# Preprocessing: standardize the training features.
# NOTE(review): X_train is overwritten with the scaler's output below, so
# re-running this cell without first re-running the train/test split will call
# .head() on that output instead of on the original DataFrame.

print("Preprocessing X_train...")
print(X_train.head())
print()

# Learn the scaling statistics from the training split, then apply them to it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

print("...Done!")
print(X_train[:5, :])  # X_train is now a NumPy array, not a DataFrame

Preprocessing X_train...
     Streptococcus_anginosus___ref_mOTU_v2_0004__  \
60                                       0.000000   
618                                      0.000000   
346                                      0.000000   
294                                      0.000036   
231                                      0.000000   

     Enterobacteriaceae_sp.___ref_mOTU_v2_0036__  \
60                                      0.000815   
618                                     0.000191   
346                                     0.000138   
294                                     0.000036   
231                                     0.322280   

     Citrobacter_sp.___ref_mOTU_v2_0076__  \
60                                    0.0   
618                                   0.0   
346                                   0.0   
294                                   0.0   
231                                   0.0   

     Klebsiella_michiganensis/oxytoca___ref_mOTU_v2_0079__  \
60        

In [114]:
# Apply the scaler that was fitted on the training split to the test features.
print("Preprocessing X_test...")
print(X_test.head())
print()

# transform only (no fit): the test split must reuse the statistics learned from X_train
X_test = scaler.transform(X_test)

print("...Done!")
print(X_test[:5, :])  # X_test is now a NumPy array, not a DataFrame

Preprocessing X_test...
     Streptococcus_anginosus___ref_mOTU_v2_0004__  \
668                                      0.000043   
324                                      0.000000   
624                                      0.000236   
690                                      0.000000   
473                                      0.000000   

     Enterobacteriaceae_sp.___ref_mOTU_v2_0036__  \
668                                     0.008927   
324                                     0.000326   
624                                     0.351001   
690                                     0.000078   
473                                     0.000331   

     Citrobacter_sp.___ref_mOTU_v2_0076__  \
668                              0.000000   
324                              0.000000   
624                              0.001649   
690                              0.000000   
473                              0.000000   

     Klebsiella_michiganensis/oxytoca___ref_mOTU_v2_0079__  \
668        

In [115]:
# Import PCA
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 makes PCA keep the smallest number of
# principal components whose cumulative explained variance reaches that
# fraction — here 90% of the variance.
pca = PCA(n_components=0.90)

# Fit the projection on the training split only, then project both splits
# with the same fitted components.
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

# Show the projected test set as the cell output
X_pca_test


array([[ 0.97695161, -0.98599676,  3.13204582, ...,  0.37344478,
         0.22760776,  1.36469473],
       [ 4.73201152, -1.68548266,  2.06920646, ..., -1.5737351 ,
        -1.23504276, -0.8780786 ],
       [-4.45984367, -0.20272415,  2.09180743, ..., -0.7204473 ,
         0.83410545,  0.62586677],
       ...,
       [ 0.17951678, -2.56858947, -0.63284493, ...,  1.18353966,
         0.08585856, -0.7183375 ],
       [ 1.54858019, -1.48711769,  3.08193113, ..., -0.14099999,
         0.11538098,  0.33070451],
       [ 0.80541983, -3.51102265,  3.27259883, ...,  0.57739862,
        -0.65234629, -0.92323869]])

## Fit before PCA

#### Random Forest

In [116]:
# Baseline: a 100-tree Random Forest (fixed seed) trained on the scaled,
# pre-PCA training features. fit() returns the estimator itself, so the
# fitted model is bound to `classifier` directly.
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, Y_train)

# Mean accuracy on the held-out test split
classifier.score(X_test, Y_test)

0.7662337662337663

#### SVM

In [117]:
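# TODO: the SVM baseline is not implemented yet — the lines below are only a
# commented-out copy of the Random Forest cell, kept as a placeholder.
# classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# classifier.fit(X_train, Y_train)
# classifier.score(X_test, Y_test)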

## After PCA

In [118]:
# Refit the existing `classifier` on the PCA-projected training data and report
# its mean accuracy on the PCA-projected test data.
# NOTE: `classifier` is whatever the last executed model cell bound — with the
# SVM cell commented out, that is the Random Forest, not an SVM. Refitting in
# place also replaces the model that was trained on the pre-PCA features.
classifier.fit(X_pca_train, Y_train).score(X_pca_test, Y_test)

0.6493506493506493