In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import scipy as sp
import matplotlib.pyplot as plt
import re
from os import listdir
from os.path import isfile, join
from scipy import stats
from matplotlib.ticker import MaxNLocator

import warnings
warnings.filterwarnings('ignore')

# Import Female rules Datasets 

In [2]:
df = pd.read_csv("./2022_results/rules_females_full.csv")

Formatting the dataset properly

In [3]:
#Split the string by reading the left hand and the right hand sides 
df['lhs'] = df['rules'].apply(lambda x : re.search(r"\{(.+)\} ", x).group(1))
df['rhs'] = df['rules'].apply(lambda x : re.search(r" \{(.+)\}", x).group(1))

#drop the rules column
df.drop(columns='rules',inplace = True)
#sort the columns differently 
df = df[['lhs','rhs','support','confidence','lift','count','coverage','pvalue_test']]

df.reset_index(inplace=True)
df.drop(columns='index', inplace=True)
df['lhs'] = df['lhs'].str.replace('=1','')
df['count'] = df['count'].astype('int32')
df = df.sort_values(by=['support'], ascending=False)

Selecting only the severity rules with the chosen minsup and minconf

In [4]:
minconf_1 = 0.8
minsup_1 = 0.08

df[(df['confidence']>=minconf_1) 
                   & (df['support']>=minsup_1)
                   & (df['rhs']=='grading=1')
                  ]

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test
566,"PDE11A_2,PTPRU_1,PPP1R9A_1,MTHFD1_1_homo",grading=1,0.126984,0.864865,1.716110,32,0.146825,8.414474e-09
567,"PDE11A_2,PTPRU_1,PPP1R9A_1,HEXD_1",grading=1,0.126984,0.820513,1.628104,32,0.154762,8.414474e-09
621,"PDE11A_2,PPP1R9A_1,CCDC61_1_homo,MTHFD1_1_homo",grading=1,0.119048,0.810811,1.608853,30,0.146825,4.467429e-11
622,"IRX5_1,PPP1R9A_1,C1orf109_1,MTHFD1_1_homo",grading=1,0.119048,0.810811,1.608853,30,0.146825,3.794387e-08
565,"PDE11A_2,IRX5_1,PTPRU_1,PPP1R9A_1",grading=1,0.115079,0.878788,1.743737,29,0.130952,8.414474e-09
...,...,...,...,...,...,...,...,...
554,"PDE11A_2,IRX5_1,PTPRU_1,FABP1_1",grading=1,0.083333,0.875000,1.736220,21,0.095238,4.467429e-11
1984,"IRX5_1,PPP1R9A_1,C1orf109_1,APOBEC1_1_homo,A2M...",grading=1,0.083333,0.807692,1.602665,21,0.103175,3.794387e-08
600,"PPP1R9A_1,FABP1_1,MTHFD1_1_homo,RMI1_1_homo",grading=1,0.083333,0.913043,1.811708,21,0.091270,6.344291e-10
561,"PTPRU_1,HEXD_1,SLC24A3_1,FABP1_1",grading=1,0.083333,0.840000,1.666772,21,0.099206,2.926688e-12


Selecting only the mildness rules with the chosen minconf and minsup

In [5]:
minconf_0 = 0.8
minsup_0 = 0.08

df[(df['confidence']>=minconf_0) 
                   & (df['support']>=minsup_0)
                   & (df['rhs']=='grading=0')
                  ]

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test
393,"LSR_1,MYO1H_1,POLI_1_homo,APOBEC1_1_homo",grading=0,0.115079,0.805556,1.624,29,0.142857,5.212896e-09
597,"LSR_1,GSTM3_1,CHPT1_1,RBM11_1_homo",grading=0,0.107143,0.818182,1.649455,27,0.130952,1.087337e-08
1924,"LSR_1,TMEM40_2,RBM11_1_homo,CFHR4_1_homo,APOBE...",grading=0,0.107143,0.818182,1.649455,27,0.130952,2.376547e-10
1874,"RNF149_1,TMEM40_2,RBM11_1_homo,POLI_1_homo,APO...",grading=0,0.107143,0.818182,1.649455,27,0.130952,1.087337e-08
312,"FNDC9_1,GSTM3_1,TMEM40_2,APOBEC1_1_homo",grading=0,0.099206,0.833333,1.68,25,0.119048,1.036637e-10
369,"FHOD3_3,CHPT1_1,TMEM40_2,APOBEC1_1_homo",grading=0,0.095238,0.8,1.6128,24,0.119048,1.036637e-10
596,"LSR_1,GSTM3_1,IGFBP2_1,CHPT1_1",grading=0,0.095238,0.827586,1.668414,24,0.115079,6.318788e-09
1862,"LSR_1,IGFBP2_1,RBM11_1_homo,APOBEC1_1_homo,COQ...",grading=0,0.095238,0.857143,1.728,24,0.111111,1.087337e-08
267,"LSR_1,GJE1_1,RBM11_1_homo,APOBEC1_1_homo",grading=0,0.095238,0.8,1.6128,24,0.119048,1.087337e-08
1623,"LSR_1,NCOA3_1,TMEM40_2,RBM11_1_homo,APOBEC1_1_...",grading=0,0.095238,0.827586,1.668414,24,0.115079,6.318788e-09


# Importing severity and mildness genes

In [6]:
feature_importance = pd.read_csv('./Data/relevant genes in females.csv', sep=';')

In [7]:
feature_importance['complete'] = feature_importance['complete'].astype('str')
feature_importance[feature_importance['Type']=='Common Homozygous']['complete'] = feature_importance[feature_importance['Type']=='Common Homozygous']['complete'].apply(lambda x : "{}{}".format(x,'_homo'))
feature_importance['Feature Importance'] = feature_importance['Feature Importance'].str.replace(',','.')
feature_importance['Feature Importance'] = pd.to_numeric(feature_importance['Feature Importance'],errors='coerce')

In [8]:
feature_importance.loc[feature_importance['Type']=='Common Homozygous']['complete'] = feature_importance[feature_importance['Type']=='Common Homozygous']['complete'].apply(lambda x :  x +'_homo')

In [9]:
feature_importance.complete = feature_importance.apply(lambda row: row['complete'] + '_homo' if row['Type'] == 'Common Homozygous' else row['complete'], axis=1)

In [10]:
sus = feature_importance[feature_importance['Feature Importance']>=0]['complete'].tolist()
#sus = sus +['IRX5_1']
prot = feature_importance[feature_importance['Feature Importance']<0]['complete']

In [11]:
df80_1 = df[(df['confidence']>=minconf_1) 
                                          & (df['rhs']=='grading=1') 
                                          & (df['support']>=minsup_1)]

df80_1['mixed'] = df80_1.lhs.apply(lambda x : np.setdiff1d(x.split(','),sus))

Checking which severity rules have mixed genes

In [12]:
count= 0
for i in range(len(df80_1)):
    if df80_1.iloc[i]['mixed'].size > 0:
        print(df80_1.iloc[i]['mixed'])
        count+= 1
        
print("\n")
print(count, "mixed rules have been found in",len(df80_1),"severity rules.")

['APOBEC1_1_homo']
['APOBEC1_1_homo' 'SLC24A3_1']
['APOBEC1_1_homo']
['SLC24A3_1']
['SLC24A3_1']
['APOBEC1_1_homo']
['APOBEC1_1_homo']
['APOBEC1_1_homo']
['APOBEC1_1_homo']
['APOBEC1_1_homo']
['APOBEC1_1_homo']
['SLC24A3_1']


12 mixed rules have been found in 68 severity rules.


Checking which mildness rules have mixed genes

In [13]:
df80_0 = df[(df['confidence']>=minconf_0) 
                                          & (df['rhs']=='grading=0') 
                                          & (df['support']>=minsup_0)]

df80_0['mixed'] = df80_0.lhs.apply(lambda x : np.setdiff1d(x.split(','),prot))

In [14]:
count= 0
for i in range(len(df80_0)):
    if df80_0.iloc[i]['mixed'].size > 0:
        print(df80_0.iloc[i]['mixed'])
        count+= 1
        
print("\n")
print(count, "mixed rules have been found in",len(df80_0),"mildness rules.")

['PRSS55_1_homo']
['PRSS55_1_homo']


2 mixed rules have been found in 36 mildness rules.


# TABELLA CON RISPOSTA AL COVID E SODDISFAZIONE DELLE REGOLE

Considero sempre come regole quelle con i valori di min_sup e min_conf stabiliti

In [15]:
pat = pd.read_csv('./Data/All_Female_bool.csv')

In [16]:
pat0 = pat[pat['grading']==0]

In [18]:
grading0 = df[((df['confidence']>=minconf_0) 
                                          & (df['rhs']=='grading=0') 
                                          & (df['support']>=minsup_0))]

grading0['mixed'] = grading0.lhs.apply(lambda x : np.setdiff1d(x.split(','),prot))

grading0['lhs'] = grading0['lhs'].str.replace('.','-')
#grading_adj_age0['lhs'] = grading_adj_age0['lhs'].str.replace('KRTAP5;10','KRTAP5-10')

In [19]:
pat = pd.read_csv('./Data/All_Female_bool.csv')

In [20]:
pat1 = pat[pat['grading']==1]

In [21]:
grading1 = df[((df['confidence']>=minconf_1) 
                                          & (df['rhs']=='grading=1') 
                                          & (df['support']>=minsup_1))]

grading1['mixed'] = grading1.lhs.apply(lambda x : np.setdiff1d(x.split(','),sus))

grading1['lhs'] = grading1['lhs'].str.replace('.','-')
#grading_adj_age0['lhs'] = grading_adj_age0['lhs'].str.replace('KRTAP5;10','KRTAP5-10')

In [22]:
pat_index = pd.read_csv('./Data/All_Female_bool_index.csv')

In [23]:
#nella tabella riporto i soli pazienti con un grading maggiore o minore di covid
sel_pat = pat_index[pat_index['grading']!='none']['PatientID'].tolist()

### Building a dataframe with all the selected female patients along the rows and all the found rules (protection and severity) along the columns.

Each cell will contain a 1 if a patient satisfies the corresponding rule

In [24]:
col_names = grading0.lhs
col_names.append(grading1.lhs)
col_names = col_names.tolist()

data = np.zeros((len(sel_pat),len(col_names)), dtype=int)

dataframe = pd.DataFrame(data, columns=col_names)
dataframe['patientID'] = sel_pat
dataframe['grading'] = pat_index[pat_index['grading']!='none']['grading'].tolist()
dataframe

Unnamed: 0,"LSR_1,MYO1H_1,POLI_1_homo,APOBEC1_1_homo","LSR_1,GSTM3_1,CHPT1_1,RBM11_1_homo","LSR_1,TMEM40_2,RBM11_1_homo,CFHR4_1_homo,APOBEC1_1_homo","RNF149_1,TMEM40_2,RBM11_1_homo,POLI_1_homo,APOBEC1_1_homo","FNDC9_1,GSTM3_1,TMEM40_2,APOBEC1_1_homo","FHOD3_3,CHPT1_1,TMEM40_2,APOBEC1_1_homo","LSR_1,GSTM3_1,IGFBP2_1,CHPT1_1","LSR_1,IGFBP2_1,RBM11_1_homo,APOBEC1_1_homo,COQ7_1_homo","LSR_1,GJE1_1,RBM11_1_homo,APOBEC1_1_homo","LSR_1,NCOA3_1,TMEM40_2,RBM11_1_homo,APOBEC1_1_homo",...,"LSR_1,TMEM40_2,RBM11_1_homo,POLI_1_homo,CFHR4_1_homo","FHOD3_3,LSR_1,CHPT1_1,RHBDF2_1_homo","LSR_1,MYO1H_1,TMEM40_2,CFHR4_1_homo","GSTM3_1,CFHR4_1_homo,COQ7_1_homo,RHBDF2_1_homo","LSR_1,GSTM3_1,ARHGAP25_1,COQ7_1_homo","LSR_1,MYO1H_1,TMEM40_2,CFHR4_1_homo,APOBEC1_1_homo","IGFBP2_1,ALCAM_2,ARHGAP25_1","LSR_1,MYO1H_1,CFHR4_1_homo,APOBEC1_1_homo,PRSS55_1_homo",patientID,grading
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-11,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-15,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-20,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-23,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,TV-COV-64,0
311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,TV-COV-94,1
312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,TV-COV-95,0
313,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,USCASI0003,1


In [25]:
for i in range(len(sel_pat)):
    patient = pat_index[pat_index['PatientID']== sel_pat[i]]

    for j in range(len(col_names)):
        genes = col_names[j].split(',')
        if (patient[genes].values == 1).all(axis=1):
            dataframe.at[i, col_names[j]] = 1

In [26]:
dataframe

Unnamed: 0,"LSR_1,MYO1H_1,POLI_1_homo,APOBEC1_1_homo","LSR_1,GSTM3_1,CHPT1_1,RBM11_1_homo","LSR_1,TMEM40_2,RBM11_1_homo,CFHR4_1_homo,APOBEC1_1_homo","RNF149_1,TMEM40_2,RBM11_1_homo,POLI_1_homo,APOBEC1_1_homo","FNDC9_1,GSTM3_1,TMEM40_2,APOBEC1_1_homo","FHOD3_3,CHPT1_1,TMEM40_2,APOBEC1_1_homo","LSR_1,GSTM3_1,IGFBP2_1,CHPT1_1","LSR_1,IGFBP2_1,RBM11_1_homo,APOBEC1_1_homo,COQ7_1_homo","LSR_1,GJE1_1,RBM11_1_homo,APOBEC1_1_homo","LSR_1,NCOA3_1,TMEM40_2,RBM11_1_homo,APOBEC1_1_homo",...,"LSR_1,TMEM40_2,RBM11_1_homo,POLI_1_homo,CFHR4_1_homo","FHOD3_3,LSR_1,CHPT1_1,RHBDF2_1_homo","LSR_1,MYO1H_1,TMEM40_2,CFHR4_1_homo","GSTM3_1,CFHR4_1_homo,COQ7_1_homo,RHBDF2_1_homo","LSR_1,GSTM3_1,ARHGAP25_1,COQ7_1_homo","LSR_1,MYO1H_1,TMEM40_2,CFHR4_1_homo,APOBEC1_1_homo","IGFBP2_1,ALCAM_2,ARHGAP25_1","LSR_1,MYO1H_1,CFHR4_1_homo,APOBEC1_1_homo,PRSS55_1_homo",patientID,grading
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-11,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-15,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-20,1
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,AR-COV-23,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,0,1,1,1,1,0,1,0,0,1,...,1,0,0,0,0,0,0,0,TV-COV-64,0
311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,TV-COV-94,1
312,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,TV-COV-95,0
313,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,USCASI0003,1


In [27]:
dataframe.to_csv('./2022_results/corrispondenza_paziente_regole_F.csv')

# Rules Summary

## Severity rules summary

In [28]:
df80_1.describe(include="all")

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test,mixed
count,68,68,68.0,68.0,68.0,68.0,68.0,68.0,68
unique,68,1,,,,,,,60
top,"PDE11A_2,FABP1_1,APOBEC1_1_homo,CCDC61_1_homo,...",grading=1,,,,,,,[APOBEC1_1_homo]
freq,1,68,,,,,,,8
mean,,,0.095296,0.837323,1.661459,24.014706,0.114087,9.162909e-09,
std,,,0.011442,0.033474,0.066422,2.883264,0.015167,1.192715e-08,
min,,,0.083333,0.8,1.587402,21.0,0.09127,1.77629e-13,
25%,,,0.087302,0.812078,1.611367,22.0,0.103175,6.344291e-10,
50%,,,0.09127,0.827586,1.64214,23.0,0.111111,8.414474e-09,
75%,,,0.103175,0.847578,1.681809,26.0,0.123016,8.414474e-09,


## Mildness rules summary

In [29]:
df80_0.describe(include="all")

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test,mixed
count,36,36,36.0,36.0,36.0,36.0,36.0,36.0,36
unique,36,1,,,,,,,35
top,"LSR_1,CHPT1_1,TMEM40_2,CFHR4_1_homo,APOBEC1_1_...",grading=0,,,,,,,[PRSS55_1_homo]
freq,1,36,,,,,,,2
mean,,,0.090057,0.834511,1.682375,22.694444,0.108135,5.1418e-09,
std,,,0.00826,0.029877,0.060231,2.081475,0.011594,4.594837e-09,
min,,,0.083333,0.8,1.6128,21.0,0.087302,6.249586e-14,
25%,,,0.083333,0.814815,1.642667,21.0,0.099206,1.036637e-10,
50%,,,0.087302,0.83046,1.674207,22.0,0.103175,6.318788e-09,
75%,,,0.095238,0.841538,1.696542,24.0,0.112103,1.087337e-08,


# Count of positive/negative features

Count of positive features

In [30]:
feature_importance[feature_importance['Feature Importance'] > 0].size

858

Count of negative features

In [31]:
feature_importance[feature_importance['Feature Importance'] < 0].size

484