In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

### Data preparation

In [2]:
data = pd.read_csv("./data/data_t_sort_all.txt", sep = '\t')
X = data.drop(columns='...54043')
y = data['...54043'].astype(int)
y.to_csv('./data/y_DF.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.25,
                                                    random_state=42)
data_frame = pd.DataFrame(X_train.index)
data_frame.to_csv('train_id.csv', encoding='utf-8', index=False, header=False)

In [4]:
print(X_train.shape[0], X_test.shape[0], y_train.shape[0], y_test.shape[0])

435 146 435 146


In [72]:
y = y.replace([1, 2, 3, 4, 5], [1, 2, 1, 2, 2])

In [73]:
Degup = pd.read_csv("./data/DEG_up.csv", sep = ' ', header=None)
Degdown = pd.read_csv("./data/DEG_down.csv", sep = ' ', header=None)

In [74]:
columns_to_keep = Degup[0].tolist()
columns_to_keep.extend(Degdown[0].tolist())
X_sort = X[columns_to_keep]
len(columns_to_keep)

500

### Count tpm

In [75]:
def read_counts2tpm(df):
    """
    convert read counts to TPM (transcripts per million)
    :df: a dataFrame that contains the read count with its gene length. 
    :sample_reads: read count values for all transcripts
    :gene_len: Gene length values
    :return: TPM
    """
    result = df
    sample_reads = result.loc[:, result.columns != 'length'].copy()
    gene_len = result.loc[:, ['length']]
    normalize_by_genelength = sample_reads.values / gene_len.values
    scaling_factor = (np.sum(normalize_by_genelength, axis=0).reshape(1, -1))/1e6
    normalize_sequencingdepth = normalize_by_genelength / scaling_factor
    tpm = normalize_sequencingdepth
    return tpm

In [76]:
Gene_Length = pd.read_csv(r'./data/gene_length_1.txt', delimiter = '\t', header=None)
Gene_Length.columns =["Ensembl_gene_identifier", "length"]
Gene_Length.head()

Unnamed: 0,Ensembl_gene_identifier,length
0,SNX18P15,954
1,SNX18P16,396
2,ANKRD20A12P,1744
3,ANKRD20A15P,1512
4,LOC105379272,972


In [77]:
Gene_Length.shape[0]

54042

In [78]:
data_test = X_sort.T
data_test.insert(0, 'feature', X_sort.T.index)
data_test

Unnamed: 0,feature,1a5663fb-1d6a-466b-9e9b-935ad1262408,3cae08cc-e9c7-49e7-8537-8fff5ecef617,e716f762-f1ed-4298-b23d-9e53bec6d318,c51ab8c2-6a1b-46e3-8c5f-42752582911c,f8c9b9c1-a660-4e36-a78e-45961c276721,4b77d1d3-f2f4-4d77-ae22-42a41dc03721,2b43d1e1-2be7-4c16-9b83-bcd37e9f0129,14f4df36-ccb5-41e5-8253-83b811faea2a,60e167cc-fc54-4587-947d-0d972ad0cf2e,...,440b3cc3-0635-4a54-be2b-cc019ac85ef6,f59229d8-3570-4d7a-aed9-4ad6f128c6f9,82f42146-d66d-488e-a7c8-bb338bf5d5fb,10ee4308-732e-4a38-9f76-e779973316fb,08d83eb6-c83c-409f-869d-da303acca1fb,62c55da6-8041-42c4-bea8-62f24c4537fd,640cb992-2f3d-480b-8676-9661a5fd4503,ab3b48ed-cff9-4d66-bf3a-e9267d57ba14,02d75fdc-87ae-4726-977b-b4537036b94e,766d5820-4c4a-4a9e-8059-97b66f19fb9f
INS,INS,0,0,0,0,0,0,0,81,0,...,0,96,0,0,0,0,0,0,0,501301
CGA,CGA,755,0,192,0,0,96,473,815,473,...,1403,1777,1037,189,615528,9263,2164,280,0,0
DPPA2,DPPA2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CTAG1B,CTAG1B,192,0,0,0,0,0,0,0,0,...,0,192,47467,0,0,11,0,0,0,0
GAGE12G,GAGE12G,0,0,0,0,0,0,0,0,0,...,0,0,2758,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NAV2.AS5,NAV2.AS5,0,0,0,0,0,0,0,0,96,...,0,0,96,288,0,0,30,0,0,0
OR5P2,OR5P2,0,0,0,96,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RNU6.9,RNU6.9,71,0,0,0,68,62,148,151,48,...,0,153,0,153,286,142,0,146,0,0
SNORA94,SNORA94,0,0,89,48,0,281,96,374,0,...,48,43,96,94,288,0,252,205,79,104


In [79]:
data_test.shape[0]

500

In [80]:
mergedDf = data_test.merge(Gene_Length, left_on='feature', right_on='Ensembl_gene_identifier')
del mergedDf['Ensembl_gene_identifier']
mergedDf.head()

Unnamed: 0,feature,1a5663fb-1d6a-466b-9e9b-935ad1262408,3cae08cc-e9c7-49e7-8537-8fff5ecef617,e716f762-f1ed-4298-b23d-9e53bec6d318,c51ab8c2-6a1b-46e3-8c5f-42752582911c,f8c9b9c1-a660-4e36-a78e-45961c276721,4b77d1d3-f2f4-4d77-ae22-42a41dc03721,2b43d1e1-2be7-4c16-9b83-bcd37e9f0129,14f4df36-ccb5-41e5-8253-83b811faea2a,60e167cc-fc54-4587-947d-0d972ad0cf2e,...,f59229d8-3570-4d7a-aed9-4ad6f128c6f9,82f42146-d66d-488e-a7c8-bb338bf5d5fb,10ee4308-732e-4a38-9f76-e779973316fb,08d83eb6-c83c-409f-869d-da303acca1fb,62c55da6-8041-42c4-bea8-62f24c4537fd,640cb992-2f3d-480b-8676-9661a5fd4503,ab3b48ed-cff9-4d66-bf3a-e9267d57ba14,02d75fdc-87ae-4726-977b-b4537036b94e,766d5820-4c4a-4a9e-8059-97b66f19fb9f,length
0,INS,0,0,0,0,0,0,0,81,0,...,96,0,0,0,0,0,0,0,501301,644
1,CGA,755,0,192,0,0,96,473,815,473,...,1777,1037,189,615528,9263,2164,280,0,0,844
2,DPPA2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1630
3,CTAG1B,192,0,0,0,0,0,0,0,0,...,192,47467,0,0,11,0,0,0,0,748
4,GAGE12G,0,0,0,0,0,0,0,0,0,...,0,2758,0,0,0,0,0,0,0,567


In [81]:
mergedDf = mergedDf.drop_duplicates(subset=['feature'])
ser = data_test['feature'].isin(mergedDf['feature'])
print(ser.index[~ser])
# print(data_test['feature'].isin(['TMEM132D']).any
mergedDf.shape[0]

Index([], dtype='object')


500

In [82]:
Columns = list(mergedDf.columns) 
Columns.remove('feature')
Columns.remove('length')
feature = list(mergedDf['feature'])

In [84]:
newmergedDf = mergedDf.set_index('feature')
newmergedDf.head()

Unnamed: 0_level_0,1a5663fb-1d6a-466b-9e9b-935ad1262408,3cae08cc-e9c7-49e7-8537-8fff5ecef617,e716f762-f1ed-4298-b23d-9e53bec6d318,c51ab8c2-6a1b-46e3-8c5f-42752582911c,f8c9b9c1-a660-4e36-a78e-45961c276721,4b77d1d3-f2f4-4d77-ae22-42a41dc03721,2b43d1e1-2be7-4c16-9b83-bcd37e9f0129,14f4df36-ccb5-41e5-8253-83b811faea2a,60e167cc-fc54-4587-947d-0d972ad0cf2e,f899311a-9ba9-45eb-a8f4-da11b9418e32,...,f59229d8-3570-4d7a-aed9-4ad6f128c6f9,82f42146-d66d-488e-a7c8-bb338bf5d5fb,10ee4308-732e-4a38-9f76-e779973316fb,08d83eb6-c83c-409f-869d-da303acca1fb,62c55da6-8041-42c4-bea8-62f24c4537fd,640cb992-2f3d-480b-8676-9661a5fd4503,ab3b48ed-cff9-4d66-bf3a-e9267d57ba14,02d75fdc-87ae-4726-977b-b4537036b94e,766d5820-4c4a-4a9e-8059-97b66f19fb9f,length
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INS,0,0,0,0,0,0,0,81,0,0,...,96,0,0,0,0,0,0,0,501301,644
CGA,755,0,192,0,0,96,473,815,473,96,...,1777,1037,189,615528,9263,2164,280,0,0,844
DPPA2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1630
CTAG1B,192,0,0,0,0,0,0,0,0,0,...,192,47467,0,0,11,0,0,0,0,748
GAGE12G,0,0,0,0,0,0,0,0,0,0,...,0,2758,0,0,0,0,0,0,0,567


In [85]:
newmergedDf.shape[0]

500

In [86]:
TPM_MasterCount = pd.DataFrame(read_counts2tpm(newmergedDf))

In [87]:
TPM_MasterCount.columns = Columns
TPM_MasterCount.insert(0, 'feature', feature, True)
TPM_MasterCount.set_index('feature')

Unnamed: 0_level_0,1a5663fb-1d6a-466b-9e9b-935ad1262408,3cae08cc-e9c7-49e7-8537-8fff5ecef617,e716f762-f1ed-4298-b23d-9e53bec6d318,c51ab8c2-6a1b-46e3-8c5f-42752582911c,f8c9b9c1-a660-4e36-a78e-45961c276721,4b77d1d3-f2f4-4d77-ae22-42a41dc03721,2b43d1e1-2be7-4c16-9b83-bcd37e9f0129,14f4df36-ccb5-41e5-8253-83b811faea2a,60e167cc-fc54-4587-947d-0d972ad0cf2e,f899311a-9ba9-45eb-a8f4-da11b9418e32,...,440b3cc3-0635-4a54-be2b-cc019ac85ef6,f59229d8-3570-4d7a-aed9-4ad6f128c6f9,82f42146-d66d-488e-a7c8-bb338bf5d5fb,10ee4308-732e-4a38-9f76-e779973316fb,08d83eb6-c83c-409f-869d-da303acca1fb,62c55da6-8041-42c4-bea8-62f24c4537fd,640cb992-2f3d-480b-8676-9661a5fd4503,ab3b48ed-cff9-4d66-bf3a-e9267d57ba14,02d75fdc-87ae-4726-977b-b4537036b94e,766d5820-4c4a-4a9e-8059-97b66f19fb9f
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INS,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,38.408411,0.000000,0.000000,...,0.000000,1.964940,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,68260.663292
CGA,856.488289,0.000000,184.548491,0.000000,0.000000,6.117815,498.821171,294.877988,221.177105,194.387795,...,2265.560866,27.752935,231.337237,69.672345,163837.123346,3926.214111,264.044193,16.210176,0.000000,0.000000
DPPA2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CTAG1B,245.763031,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,3.383480,11948.115768,0.000000,0.000000,5.260849,0.000000,0.000000,0.000000,0.000000
GAGE12G,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,915.841752,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NAV2.AS5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11.456673,49.610916,...,0.000000,0.000000,5.465706,27.095637,0.000000,0.000000,0.934219,0.000000,0.000000,0.000000
OR5P2,0.000000,0.000000,0.000000,406.641178,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RNU6.9,635.318490,0.000000,0.000000,0.000000,171.413733,31.165598,1231.130587,430.943527,177.043071,479.156830,...,0.000000,18.848274,0.000000,444.885981,600.467080,474.754761,0.000000,66.671655,0.000000,0.000000
SNORA94,0.000000,0.000000,312.557370,852.890263,0.000000,65.427737,369.900942,494.409509,0.000000,0.000000,...,283.197730,2.453694,78.247143,126.606773,280.083455,0.000000,112.344127,43.362473,1626.839515,39.480183


In [88]:
TPM_MasterCount.shape[0]

500

### Create model

In [89]:
print(TPM_MasterCount.sum(axis = 0, skipna = True))
X_tpm = TPM_MasterCount.T
X_tpm.columns = X_tpm.iloc[0]
X_tpm = X_tpm[1:]

feature                                 INSCGADPPA2CTAG1BGAGE12GLINC01419VGLL2LOC10537...
1a5663fb-1d6a-466b-9e9b-935ad1262408                                            1000000.0
3cae08cc-e9c7-49e7-8537-8fff5ecef617                                            1000000.0
e716f762-f1ed-4298-b23d-9e53bec6d318                                            1000000.0
c51ab8c2-6a1b-46e3-8c5f-42752582911c                                            1000000.0
                                                              ...                        
62c55da6-8041-42c4-bea8-62f24c4537fd                                            1000000.0
640cb992-2f3d-480b-8676-9661a5fd4503                                            1000000.0
ab3b48ed-cff9-4d66-bf3a-e9267d57ba14                                            1000000.0
02d75fdc-87ae-4726-977b-b4537036b94e                                            1000000.0
766d5820-4c4a-4a9e-8059-97b66f19fb9f                                            1000000.0
Length: 52

In [90]:
X_tpm
X_tpm.to_csv('./data/X_tpm_DF.csv')

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_tpm, 
                                                    y, 
                                                    test_size=0.25,
                                                    random_state=42)

#### LogisticRegression mogel on counts value

In [24]:
logreg = LogisticRegression(solver='newton-cg', penalty='l2', max_iter=500)
logreg.fit(X_train, y_train)

In [25]:
y_pred = logreg.predict(X_test)

In [26]:
print(f1_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='micro'), f1_score(y_test, y_pred, average='weighted'))

0.6370242876266973 0.6438356164383562 0.645197882200688


#### LogisticRegression mogel on log10 counts value

In [27]:
X_tpm_log = X_tpm.applymap(lambda x: np.log10(x + 1))
X_train, X_test, y_train, y_test = train_test_split(X_tpm_log, 
                                                    y, 
                                                    test_size=0.25,
                                                    random_state=42)

  X_tpm_log = X_tpm.applymap(lambda x: np.log10(x + 1))


In [28]:
X_tpm_log

feature,INS,CGA,DPPA2,CTAG1B,GAGE12G,LINC01419,VGLL2,LOC105370470,TSPY2,LINC01896,...,LOC105376072,TAS2R50,TAS2R50.1,TAS2R50.2,LOC105372787,OR5P2,OR5P2.1,OR5P2.2,SNORA94,IHH
X1a5663fb.1d6a.466b.9e9b.935ad1262408,0.000000,3.017671,0.0,2.476502,0.0,3.463339,1.681218,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,2.065794,0.000000,0.000000,0.000000,0.000000,1.924718
X9b145f65.ca1d.4746.9672.db8ee767c80f,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.597809,0.000000,0.000000,0.000000,...,1.454384,1.464515,1.464515,1.464515,1.385931,0.665605,0.665605,0.665605,0.000000,2.711352
X3cae08cc.e9c7.49e7.8537.8fff5ecef617,0.000000,0.000000,0.0,0.000000,0.0,1.527382,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,2.092268,0.000000,0.000000,0.000000,0.000000,3.336614
e716f762.f1ed.4298.b23d.9e53bec6d318,0.000000,1.831204,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.057391,2.045195
c51ab8c2.6a1b.46e3.8c5f.42752582911c,0.000000,0.000000,0.0,0.000000,0.0,0.000000,2.204016,0.000000,0.000000,0.000000,...,0.000000,2.296040,2.296040,2.296040,2.819166,2.591928,2.591928,2.591928,2.913028,4.550819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X62c55da6.8041.42c4.bea8.62f24c4537fd,0.000000,3.614908,0.0,0.814201,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.494672
X640cb992.2f3d.480b.8676.9661a5fd4503,0.000000,2.445428,0.0,0.000000,0.0,0.000000,1.496015,1.067107,0.000000,0.000000,...,0.000000,1.629597,1.629597,1.629597,1.069984,0.000000,0.000000,0.000000,2.076400,1.772455
ab3b48ed.cff9.4d66.bf3a.e9267d57ba14,0.000000,1.216938,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.885824,0.470471,...,0.000000,1.273628,1.273628,1.273628,0.750303,0.000000,0.000000,0.000000,1.627441,1.535932
X02d75fdc.87ae.4726.977b.b4537036b94e,0.000000,0.000000,0.0,0.000000,0.0,0.000000,2.241491,0.000000,0.000000,0.000000,...,2.446721,0.000000,0.000000,0.000000,2.329971,0.000000,0.000000,0.000000,3.166880,0.000000


In [29]:
logreg = LogisticRegression(solver='newton-cg', penalty='l2', max_iter=500)
logreg.fit(X_train, y_train)

In [30]:
y_pred = logreg.predict(X_test)

In [31]:
print(f1_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='micro'), f1_score(y_test, y_pred, average='weighted'))

0.6789061005928476 0.684931506849315 0.6861365881006085


#### ARCHS4 TFs Coexp

In [32]:
res = pd.read_csv("./data/DESeq_res.csv", sep = ' ')

In [33]:
res = res.dropna()
res['log2FoldChange'] = res['log2FoldChange'].apply(lambda x: abs(x))
gene_list = res.sort_values(['log2FoldChange', 'padj'], ascending=[False, True]).index[:300]
with open(r'gene_list.txt', 'w') as fp:
    for item in gene_list:
        fp.write("%s\n" % item)
    print('Done')

Done


#### Enrichr result:

1	RHOXF2 human tf ARCHS4 coexpression, p_value = 0.00001267


2	GATA1 human tf ARCHS4 coexpression, p_value = 0.005699


3	CTCFL human tf ARCHS4 coexpression, p_value = 0.0056699


4	MAEL human tf ARCHS4 coexpression, p_value = 0.01523


5	TFDP3 human tf ARCHS4 coexpression, p_value = 0.01523


6	ZNF595 human tf ARCHS4 coexpression, p_value = 0.03717

In [34]:
X_sort_gene = X[["RHOXF2", "GATA1", "CTCFL", "MAEL", "TFDP3", "ZNF595"]]
X_sort_gene

Unnamed: 0,RHOXF2,GATA1,CTCFL,MAEL,TFDP3,ZNF595
X1a5663fb.1d6a.466b.9e9b.935ad1262408,0,250,0,942,0,13066
X9b145f65.ca1d.4746.9672.db8ee767c80f,576,1123,95,48,0,19970
X3cae08cc.e9c7.49e7.8537.8fff5ecef617,0,1091,0,884,0,22624
e716f762.f1ed.4298.b23d.9e53bec6d318,0,0,0,672,0,24600
c51ab8c2.6a1b.46e3.8c5f.42752582911c,288,382,96,912,0,24749
...,...,...,...,...,...,...
X62c55da6.8041.42c4.bea8.62f24c4537fd,0,44,0,0,1605,14418
X640cb992.2f3d.480b.8676.9661a5fd4503,191,1131,286,1012,96,34910
ab3b48ed.cff9.4d66.bf3a.e9267d57ba14,0,663,90,379,0,44656
X02d75fdc.87ae.4726.977b.b4537036b94e,96,955,0,475,0,17124


In [35]:
data_test = X_sort_gene.T
data_test.insert(0, 'feature', X_sort_gene.T.index)
mergedDf = data_test.merge(Gene_Length, left_on='feature', right_on='Ensembl_gene_identifier')
del mergedDf['Ensembl_gene_identifier']
Columns = list(mergedDf.columns) 
Columns.remove('feature')
Columns.remove('length')
feature = list(mergedDf['feature'])
newmergedDf = mergedDf.set_index('feature')
TPM_MasterCount = pd.DataFrame(read_counts2tpm(newmergedDf))
TPM_MasterCount.columns = Columns
TPM_MasterCount.insert(0, 'feature', feature, True)
TPM_MasterCount.set_index('feature')
X_tpm = TPM_MasterCount.T
X_tpm.columns = X_tpm.iloc[0]
X_tpm = X_tpm[1:]
X_tpm

feature,RHOXF2,GATA1,CTCFL,MAEL,TFDP3,ZNF595
X1a5663fb.1d6a.466b.9e9b.935ad1262408,0.0,39770.373147,0.0,84956.595196,0.0,875273.031657
X9b145f65.ca1d.4746.9672.db8ee767c80f,30509.069966,113673.430845,1849.452466,2754.52671,0.0,851213.520013
X3cae08cc.e9c7.49e7.8537.8fff5ecef617,0.0,98119.969939,0.0,45072.480537,0.0,856807.549524
e716f762.f1ed.4298.b23d.9e53bec6d318,0.0,0.0,0.0,35472.680056,0.0,964527.319944
c51ab8c2.6a1b.46e3.8c5f.42752582911c,13116.051969,33246.561096,1606.922593,44999.194982,0.0,907031.26936
...,...,...,...,...,...,...
X62c55da6.8041.42c4.bea8.62f24c4537fd,0.0,5807.733747,0.0,0.0,192808.982185,801383.284068
X640cb992.2f3d.480b.8676.9661a5fd4503,6003.592186,67938.060984,3304.128527,34463.358209,5248.316041,883042.544053
ab3b48ed.cff9.4d66.bf3a.e9267d57ba14,0.0,33655.432814,878.667408,10907.052424,0.0,954558.847353
X02d75fdc.87ae.4726.977b.b4537036b94e,5920.079255,112546.601548,0.0,31735.778959,0.0,849797.540237


In [36]:
X_tpm.to_csv('X_tpm_TF.csv')
y.to_csv('y_TF.csv')

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_tpm, 
                                                    y, 
                                                    test_size=0.25,
                                                    random_state=42)

In [38]:
logreg = LogisticRegression(solver='newton-cg', penalty='l2', max_iter=500)
logreg.fit(X_train, y_train)

In [45]:
y_pred = logreg.predict(X_test)
print(y_pred)
print(y_test.values)

[1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[2 1 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 2 2 2 1 2 1 2 1 2 1 1 2 1 2 2 1 1 1 2 2
 1 2 2 2 2 1 1 2 2 1 2 2 2 1 2 2 2 1 1 2 1 1 1 1 1 1 1 2 2 2 1 1 1 2 2 1 1
 2 1 2 2 2 1 1 1 2 1 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 1 1 2 2 1 1 2 2 1 2
 2 1 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 1 2 2 2 2 2 1 2 1 1 2 2 2 2 2 1]


In [47]:
print(f1_score(y_test.values, y_pred, average='macro'), f1_score(y_test.values, y_pred, average='micro'), f1_score(y_test, y_pred, average='weighted'))

0.3865546218487395 0.5616438356164384 0.4404282260849545


In [48]:
X_tpm_log = X_tpm.applymap(lambda x: np.log2(x + 1))
X_train, X_test, y_train, y_test = train_test_split(X_tpm_log, 
                                                    y, 
                                                    test_size=0.25,
                                                    random_state=42)

  X_tpm_log = X_tpm.applymap(lambda x: np.log2(x + 1))


In [49]:
logreg = LogisticRegression(solver='newton-cg', penalty='l2', max_iter=500)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(f1_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='micro'), f1_score(y_test, y_pred, average='weighted'))

0.4872210514940293 0.5547945205479452 0.5178203582354252
