In [13]:
import os
# Move to the repository root so the relative "data/..." and "predictions/..."
# paths used by the later cells resolve. The location can be overridden with
# the AI_LAB_REPO_ROOT environment variable; when it is unset, the original
# machine-specific path is used, so existing behaviour is unchanged.
REPO_ROOT = os.environ.get(
    "AI_LAB_REPO_ROOT",
    "C:/Users/anton/Desktop/University/AI_Lab/Git_repo",
)
os.chdir(REPO_ROOT)

In [2]:
import pandas as pd
import numpy as np
from fix_data import add_label_T
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In this section we fit a classifier for the SmartSeq MCF7 dataset. Our preliminary analysis suggested that a linear model could do a very good job, so we try Logistic Regression. Because linear models are not very complex, they tend to generalize well to new data with a lower risk of overfitting; moreover, their results are highly interpretable.

In [14]:
# Read the space-separated SmartSeq MCF7 training matrix and pass it through
# add_label_T. Judging by the displayed result, the helper returns one row per
# sample with a trailing "label" column -- confirm the details in fix_data.
train_path = "data/SmartSeq/MCF7_SmartS_Filtered_Normalised_3000_Data_train.txt"
df = add_label_T(pd.read_csv(train_path, sep=" "), dropname=True)
df

Unnamed: 0,CYP1B1,CYP1B1-AS1,CYP1A1,NDRG1,DDIT4,PFKFB3,HK2,AREG,MYBL2,ADM,...,DNAI7,MAFG,LZTR1,BCO2,GRIK5,SLC25A27,DENND5A,CDK5R1,FAM13A-AS1,label
0,343,140,0,0,386,75,0,0,476,0,...,0,17,59,0,0,0,51,0,0,1
1,131,59,0,1,289,42,0,856,1586,0,...,0,1,25,0,0,0,34,0,0,1
2,452,203,0,0,0,0,0,0,775,0,...,0,23,0,0,0,0,0,0,0,1
3,27,7,0,0,288,214,15,242,1191,0,...,0,3,10,0,0,0,4,0,0,1
4,5817,2669,0,654,2484,1603,748,436,0,0,...,0,63,0,0,0,0,60,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,196,102,1,243,266,278,78,1,199,0,...,0,1,0,0,0,0,45,19,0,1
246,504,238,0,62,417,932,275,0,33,0,...,0,0,0,0,0,0,0,0,0,1
247,34565,13717,11274,1263,4256,5663,701,0,0,0,...,0,45,0,0,0,0,140,0,32,0
248,20024,7835,563,925,12733,2187,1141,0,20,81,...,0,57,8,0,0,0,7,0,11,0


Given our high-dimensional setting, we use the L1 penalty in our logistic regression to induce sparsity in the model and thereby identify the genes responsible for the label.

In [15]:
# Separate the predictors from the target, then hold out 20% of the rows for
# testing. The fixed random_state keeps the split reproducible across runs.
features = df.drop("label", axis=1)
target = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

We try C=0.1

In [16]:
# L1-penalised logistic regression; liblinear is the solver chosen here for
# the L1 penalty. C=0.1 is the regularisation setting being tried.
log_reg = LogisticRegression(
    penalty="l1",
    C=0.1,
    solver="liblinear",
    random_state=42,
)
# Cross-validate on the training split only and report the mean fold score.
cv_scores = cross_val_score(log_reg, X_train, y_train)
cv_scores.mean()

1.0

It scores perfectly in cross-validation, so we now evaluate it on the held-out test set.

In [17]:
# Fit on the full training split, then report the score on the held-out
# test split. fit() returns the estimator itself, so the calls can be chained.
log_reg.fit(X_train, y_train).score(X_test, y_test)

1.0

Again a perfect score, so we can conclude that our model is very good. Now let's see how many, and which, genes influence the label.

In [18]:
# Map every fitted coefficient back to the gene it belongs to. The names come
# from X_train (the exact matrix passed to fit) instead of df, because df also
# holds the non-feature "label" column and only lines up with coef_ as long as
# that column happens to be last.
feature_names = X_train.columns.tolist()
# Take the first row of coef_ as a flat list (for a binary 0/1 label this is
# expected to be the only row).
coef = log_reg.coef_.tolist()[0]
# Keep the genes whose weight is not exactly zero, i.e. the ones the L1
# penalty did not eliminate.
non_zero_coef = [name for name, c in zip(feature_names, coef) if abs(c) > 0]
print("Number of non zero coeffiecients:", len(non_zero_coef))
print("Relevant genes:", non_zero_coef)

Number of non zero coeffiecients: 36
Relevant genes: ['CYP1B1', 'DDIT4', 'NR4A1', 'CEACAM5', 'TUBA1B', 'MT-CYB', 'SLC9A3R1', 'XBP1', 'MT-CO3', 'GPI', 'EMP2', 'MT-CO2', 'PGK1', 'LDHA', 'MT-CO1', 'FLNA', 'TMSB10', 'BEST1', 'DSP', 'FTH1', 'HSPB1', 'DHCR7', 'SULF2', 'ATP1A1', 'ACTG1', 'FTL', 'MT-RNR2', 'UBB', 'GAPDH', 'GATA3', 'LMNA', 'DDX5', 'TMSB4X', 'MT-ND4', 'NME1-NME2', 'ALDOA']


## Predictions

In [19]:
# Read the anonymised, unlabelled samples that we need to classify
# (same space-separated format as the training file).
pred_path = "data/SmartSeq/MCF7_SmartS_Filtered_Normalised_3000_Data_test_anonim.txt"
X_pred = pd.read_csv(pred_path, sep=" ")

In [20]:
# Transpose the loaded matrix; after this step the gene names are the columns
# (see the displayed frame), which is the layout the model was fitted on.
# NOTE: this cell is not idempotent -- running it a second time without
# re-running the loading cell transposes the frame back again.
X_pred = X_pred.T
X_pred

Unnamed: 0,CYP1B1,CYP1B1-AS1,CYP1A1,NDRG1,DDIT4,PFKFB3,HK2,AREG,MYBL2,ADM,...,CD27-AS1,DNAI7,MAFG,LZTR1,BCO2,GRIK5,SLC25A27,DENND5A,CDK5R1,FAM13A-AS1
1,492,253,0,1157,6805,5447,2662,2448,0,58,...,0,0,193,0,0,0,0,285,0,1
2,7199,3245,7181,1857,20731,4374,5490,0,0,2150,...,0,0,187,0,0,0,0,55,0,0
3,12,11,1,5,147,301,9,647,1890,1,...,0,0,28,0,0,0,0,1,0,0
4,373,187,0,0,43,37,0,0,580,0,...,21,0,131,0,0,0,0,3,0,0
5,31,13,0,0,0,44,8,0,554,0,...,43,0,30,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,287,126,666,3270,25776,12144,1119,0,0,678,...,0,0,473,0,0,0,0,315,0,26
60,15220,6316,1991,750,12176,7488,3172,2030,0,64,...,0,0,36,0,0,0,34,78,0,0
61,21998,8898,21329,1498,5144,1146,914,0,0,0,...,69,0,0,0,0,0,0,0,0,35
62,39,17,1,29,20,0,0,88,1399,0,...,12,0,0,0,0,0,0,0,0,0


In [21]:
# Predict a class (0 or 1) for every anonymised sample with the fitted model,
# and display the resulting array.
y_pred = log_reg.predict(X_pred)
y_pred

array([0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1],
      dtype=int64)

In [12]:
# Write one human-readable label per line, in prediction order.
# Class 1 is written as "Normoxia" and anything else as "Hypoxia"; this
# assumes the 0/1 encoding produced by add_label_T -- confirm in fix_data.
with open("predictions/smartseq_mcf7_predictions.txt", 'w') as out_file:
    out_file.writelines(
        ("Normoxia" if pred == 1 else "Hypoxia") + '\n'
        for pred in y_pred
    )