In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import scipy as sp
import matplotlib.pyplot as plt
import re
from os import listdir
from os.path import isfile, join
from scipy import stats
from matplotlib.ticker import MaxNLocator

import warnings
warnings.filterwarnings('ignore')

# Import Male rules Datasets 

In [2]:
df = pd.read_csv("./2022_results/rules_males_full.csv")

Formatting the dataset properly

In [3]:
#Split the string by reading the left hand and the right hand sides 
df['lhs'] = df['rules'].apply(lambda x : re.search(r"\{(.+)\} ", x).group(1))
df['rhs'] = df['rules'].apply(lambda x : re.search(r" \{(.+)\}", x).group(1))

#drop the rules column
df.drop(columns='rules',inplace = True)
#sort the columns differently 
df = df[['lhs','rhs','support','confidence','lift','count','coverage','pvalue_test']]

df.reset_index(inplace=True)
df.drop(columns='index', inplace=True)
df['lhs'] = df['lhs'].str.replace('=1','')
df['lhs'] = df['lhs'].str.replace('KRTAP27.1_1','KRTAP27-1_1')
df['count'] = df['count'].astype('int32')
df = df.sort_values(by=['support'], ascending=False)

Selecting only the severity rules with the chosen minsup and minconf

In [4]:
minconf_1 = 0.8
minsup_1 = 0.08

df[(df['confidence']>=minconf_1) 
                   & (df['support']>=minsup_1)
                   & (df['rhs']=='grading=1')
                  ]

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test
7958,"TDRD5_3,OTOP2_1,FAM118A_1,LY6D_1",grading=1,0.112601,0.823529,1.861676,42,0.136729,4.103819e-09
7957,"TDRD5_3,TFRC_1,FAM118A_1,LY6D_1",grading=1,0.107239,0.816327,1.845393,40,0.131367,4.103819e-09
6833,"FGD2_1,TLR3_1,PSKH2_1,FOXR2_1_hemi",grading=1,0.107239,0.8,1.808485,40,0.134048,7.813064e-08
5498,"MUC5AC_1,TFRC_1,MTERF4_2,RBBP6_2",grading=1,0.099196,0.804348,1.818314,37,0.123324,3.209897e-10
7976,"MSH4_1,LRRC18_1,APOBR_1,IL17RC_1_homo",grading=1,0.096515,0.8,1.808485,36,0.120643,3.396862e-09
6555,"ADAM15_1,LRRC18_1,OTOP2_1,GCAT_2",grading=1,0.096515,0.818182,1.849587,36,0.117962,3.979115e-11
5496,"MUC5AC_1,GORAB_1,MTERF4_2,RBBP6_2",grading=1,0.093834,0.853659,1.929786,35,0.10992,3.396862e-09
6839,"MSH4_1,ADAM15_1,FGD2_1,IL17RC_1_homo",grading=1,0.093834,0.813953,1.840028,35,0.115282,8.702226e-16
6189,"MSH4_1,TBPL2_1,TFRC_1,PCDHB15_2",grading=1,0.093834,0.813953,1.840028,35,0.115282,6.502453e-09
3474,"COQ3_2,SELP_1_homo,IL17RC_1_homo,FIGLA_1_hemi",grading=1,0.091153,0.809524,1.830014,34,0.112601,6.502453e-09


Selecting only the mildness rules with the chosen minconf and minsup

In [5]:
minconf_0 = 0.8
minsup_0 = 0.12

df[(df['confidence']>=minconf_0) 
                   & (df['support']>=minsup_0)
                   & (df['rhs']=='grading=0')
                  ]

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test
9706,"PPM1E_1,KRTAP27-1_1,OTOGL_1,CPO_1",grading=0,0.179625,0.807229,1.447579,67,0.222520,3.329449e-09
9312,"PPM1E_1,N4BP2_1,SHANK3_1,KRTAP27-1_1",grading=0,0.174263,0.812500,1.457031,65,0.214477,3.358246e-10
9311,"PPM1E_1,N4BP2_1,KRTAP27-1_1,PNPT1_1",grading=0,0.171582,0.800000,1.434615,64,0.214477,5.778461e-10
9704,"PPM1E_1,OTOGL_1,CPO_1,PAX4_1_homo",grading=0,0.171582,0.810127,1.452775,64,0.211796,1.366436e-12
9705,"KRTAP27-1_1,OTOGL_1,CPO_1,PNPT1_1",grading=0,0.171582,0.831169,1.490509,64,0.206434,1.548974e-07
...,...,...,...,...,...,...,...,...
8840,"MIA3_1,DNAH6_1,TTLL4_1,NLRP6_1",grading=0,0.120643,0.818182,1.467220,45,0.147453,1.464687e-10
9512,"OR56A4_1,STON2_3,AKAP10_1,PAX4_1_homo",grading=0,0.120643,0.803571,1.441020,45,0.150134,6.878659e-08
7360,"TTLL4_1,ITIH2_1,FSIP2_2_homo,TENT5D_1_hemi",grading=0,0.120643,0.818182,1.467220,45,0.147453,1.859191e-08
6447,"PPM1E_1,ANO9_1,SLC25A5_1_hemi,IBTK_1_hemi",grading=0,0.120643,0.818182,1.467220,45,0.147453,3.325741e-09


# Importing severity and mildness genes

In [7]:
feature_importance = pd.read_csv('./Data/Male/relevant genes in males.csv', sep=';')

In [8]:
feature_importance['complete'] = feature_importance['complete'].astype('str')
feature_importance.loc[feature_importance['Type']=='Common Homozygous']['complete'] = feature_importance[feature_importance['Type']=='Common Homozygous']['complete'].apply(lambda x :  x +'_homo')
feature_importance.loc[feature_importance['Type']=='Common Hemizygous']['complete'] = feature_importance[feature_importance['Type']=='Common Hemizygous']['complete'].apply(lambda x :  x +'_hemi')
feature_importance['Feature Importance'] = feature_importance['Feature Importance'].str.replace(',','.')
feature_importance['Feature Importance'] = pd.to_numeric(feature_importance['Feature Importance'],errors='coerce')

In [9]:
feature_importance.complete = feature_importance.apply(lambda row: row['complete'] + '_hemi' if row['Type'] == 'Common Hemizygous' else row['complete'], axis=1)

In [10]:
sus = feature_importance[feature_importance['Feature Importance']>=0]['complete'].tolist()
#sus = sus +['IRX5_1']

In [11]:
prot = feature_importance[feature_importance['Feature Importance']<0]['complete']

In [12]:
df80_1 = df[(df['confidence']>=minconf_1) 
                                          & (df['rhs']=='grading=1') 
                                          & (df['support']>=minsup_1)]

df80_1['mixed'] = df80_1.lhs.apply(lambda x : np.setdiff1d(x.split(','),sus))

Checking which severity rules have mixed genes

In [13]:
count= 0
for i in range(len(df80_1)):
    if df80_1.iloc[i]['mixed'].size > 0:
        print(df80_1.iloc[i]['mixed'])
        count+= 1
        
print("\n")
print(count, "mixed rules have been found in",len(df80_1),"severity rules.")

['PNPT1_1']


1 mixed rules have been found in 39 severity rules.


Checking which mildness rules have mixed genes

In [14]:
df80_0 = df[(df['confidence']>=minconf_0) 
                                          & (df['rhs']=='grading=0') 
                                          & (df['support']>=minsup_0)]

df80_0['mixed'] = df80_0.lhs.apply(lambda x : np.setdiff1d(x.split(','),prot))

In [15]:
count= 0
for i in range(len(df80_0)):
    if df80_0.iloc[i]['mixed'].size > 0:
        print(df80_0.iloc[i]['mixed'])
        count+= 1
print("\n")
print(count, "mixed rules have been found in",len(df80_0),"mildness rules.")

['FOXR2_1_hemi']
['FOXR2_1_hemi']
['FOXR2_1_hemi']
['FOXR2_1_hemi']
['FOXR2_1_hemi']


5 mixed rules have been found in 156 mildness rules.


# TABELLA CON RISPOSTA AL COVID E SODDISFAZIONE DELLE REGOLE

Considero sempre come regole quelle con i valori di min_sup e min_conf stabiliti

In [24]:
pat = pd.read_csv('./Data/AssociationRules/All_Male_bool.csv')

In [25]:
pat0 = pat[pat['grading']==0]

In [26]:
grading0 = df[((df['confidence']>=minconf_0) 
                                          & (df['rhs']=='grading=0') 
                                          & (df['support']>=minsup_0))]

grading0['mixed'] = grading0.lhs.apply(lambda x : np.setdiff1d(x.split(','),prot))

grading0['lhs'] = grading0['lhs'].str.replace('.','-')
#grading_adj_age0['lhs'] = grading_adj_age0['lhs'].str.replace('KRTAP5;10','KRTAP5-10')

In [27]:
pat1 = pat[pat['grading']==1]

In [28]:
grading1 = df[((df['confidence']>=minconf_1) 
                                          & (df['rhs']=='grading=1') 
                                          & (df['support']>=minsup_1))]

grading1['mixed'] = grading1.lhs.apply(lambda x : np.setdiff1d(x.split(','),sus))

grading1['lhs'] = grading1['lhs'].str.replace('.','-')
#grading_adj_age0['lhs'] = grading_adj_age0['lhs'].str.replace('KRTAP5;10','KRTAP5-10')

In [30]:
pat_index = pd.read_csv('./Data/AssociationRules/All_Male_bool_index.csv')

In [31]:
#nella tabella riporto i soli pazienti con un grading maggiore o minore di covid
sel_pat = pat_index[pat_index['grading']!='none']['PatientID'].tolist()

### Building a dataframe with all the selected male patients along the rows and all the found rules (protection and severity) along the columns.

Each cell will contain a 1 if a patient satisfies the corresponding rule

In [32]:
col_names = grading0.lhs
col_names.append(grading1.lhs)
col_names = col_names.tolist()

data = np.zeros((len(sel_pat),len(col_names)), dtype=int)

dataframe = pd.DataFrame(data, columns=col_names)
dataframe['patientID'] = sel_pat
dataframe['grading'] = pat_index[pat_index['grading']!='none']['grading'].tolist()
dataframe

Unnamed: 0,"PPM1E_1,KRTAP27-1_1,OTOGL_1,CPO_1","PPM1E_1,N4BP2_1,SHANK3_1,KRTAP27-1_1","PPM1E_1,N4BP2_1,KRTAP27-1_1,PNPT1_1","PPM1E_1,OTOGL_1,CPO_1,PAX4_1_homo","KRTAP27-1_1,OTOGL_1,CPO_1,PNPT1_1","PPM1E_1,NLRP6_1,SWT1_2,TENT5D_1_hemi","MIA3_1,NLRP6_1,PNPT1_1,HOXA4_1_homo","NLRP6_1,ITIH2_1,LINS1_3,PNPT1_1","NLRP6_1,SWT1_2,SLC25A5_1_hemi,TENT5D_1_hemi","OTOGL_1,CPO_1,PNPT1_1,PAX4_1_homo",...,"TTLL4_1,SWT1_2,HOXA4_1,POF1B_1_hemi","PPM1E_1,OR56A4_1,TMEM221_2_homo,POF1B_1_hemi","CYP4F2_3,NLRP6_1,STON2_3,SLC25A5_1_hemi","MIA3_1,DNAH6_1,TTLL4_1,NLRP6_1","OR56A4_1,STON2_3,AKAP10_1,PAX4_1_homo","TTLL4_1,ITIH2_1,FSIP2_2_homo,TENT5D_1_hemi","PPM1E_1,ANO9_1,SLC25A5_1_hemi,IBTK_1_hemi","N4BP2_1,SYT16_1,CPO_1,ITIH2_1",patientID,grading
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-10,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-13,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-14,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-18,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,AR-COV-19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,USCASI0002,0
463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,USCASI0004,0
464,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,USCASI0005,0
465,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,VE10,0


In [33]:
for i in range(len(sel_pat)):
    patient = pat_index[pat_index['PatientID']== sel_pat[i]]

    for j in range(len(col_names)):
        genes = col_names[j].split(',')
        if (patient[genes].values == 1).all(axis=1):
            dataframe.at[i, col_names[j]] = 1

In [34]:
dataframe

Unnamed: 0,"PPM1E_1,KRTAP27-1_1,OTOGL_1,CPO_1","PPM1E_1,N4BP2_1,SHANK3_1,KRTAP27-1_1","PPM1E_1,N4BP2_1,KRTAP27-1_1,PNPT1_1","PPM1E_1,OTOGL_1,CPO_1,PAX4_1_homo","KRTAP27-1_1,OTOGL_1,CPO_1,PNPT1_1","PPM1E_1,NLRP6_1,SWT1_2,TENT5D_1_hemi","MIA3_1,NLRP6_1,PNPT1_1,HOXA4_1_homo","NLRP6_1,ITIH2_1,LINS1_3,PNPT1_1","NLRP6_1,SWT1_2,SLC25A5_1_hemi,TENT5D_1_hemi","OTOGL_1,CPO_1,PNPT1_1,PAX4_1_homo",...,"TTLL4_1,SWT1_2,HOXA4_1,POF1B_1_hemi","PPM1E_1,OR56A4_1,TMEM221_2_homo,POF1B_1_hemi","CYP4F2_3,NLRP6_1,STON2_3,SLC25A5_1_hemi","MIA3_1,DNAH6_1,TTLL4_1,NLRP6_1","OR56A4_1,STON2_3,AKAP10_1,PAX4_1_homo","TTLL4_1,ITIH2_1,FSIP2_2_homo,TENT5D_1_hemi","PPM1E_1,ANO9_1,SLC25A5_1_hemi,IBTK_1_hemi","N4BP2_1,SYT16_1,CPO_1,ITIH2_1",patientID,grading
0,1,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,AR-COV-10,1
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,AR-COV-13,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,AR-COV-14,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,AR-COV-18,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,AR-COV-19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,USCASI0002,0
463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,USCASI0004,0
464,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,USCASI0005,0
465,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,VE10,0


In [None]:
dataframe.to_csv('./2022_results/corrispondenza_paziente_regole_M.csv')

# Rules Summary

## Severity rules summary

In [35]:
df80_1.describe(include="all")

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test,mixed
count,39,39,39.0,39.0,39.0,39.0,39.0,39.0,39
unique,39,1,,,,,,,39
top,"LRRC18_1,APOBR_1,IL17RC_1_homo,TGIF2LX_1_hemi",grading=1,,,,,,,[]
freq,1,39,,,,,,,1
mean,,,0.089022,0.819282,1.852074,33.205128,0.108751,3.356786e-08,
std,,,0.007562,0.018657,0.042175,2.820783,0.010009,4.445283e-08,
min,,,0.080429,0.8,1.808485,30.0,0.096515,8.702226e-16,
25%,,,0.08311,0.807201,1.824763,31.0,0.100536,3.396862e-09,
50%,,,0.088472,0.815789,1.844179,33.0,0.107239,6.502453e-09,
75%,,,0.091153,0.824265,1.863338,34.0,0.112601,7.813064e-08,


## Mildness rules summary

In [36]:
df80_0.describe(include="all")

Unnamed: 0,lhs,rhs,support,confidence,lift,count,coverage,pvalue_test,mixed
count,156,156,156.0,156.0,156.0,156.0,156.0,156.0,156
unique,156,1,,,,,,,153
top,"PPM1E_1,N4BP2_1,TMEM221_2_homo,SLC25A5_1_hemi",grading=0,,,,,,,[FOXR2_1_hemi]
freq,1,156,,,,,,,3
mean,,,0.134272,0.815849,1.463036,50.083333,0.164673,5.152348e-08,
std,,,0.013063,0.014715,0.026388,4.872515,0.016727,6.519008e-08,
min,,,0.120643,0.8,1.434615,45.0,0.142091,7.723587e-20,
25%,,,0.123324,0.803571,1.44102,46.0,0.150134,5.778461e-10,
50%,,,0.131367,0.811203,1.454704,49.0,0.160858,1.086155e-08,
75%,,,0.13941,0.821429,1.473043,52.0,0.174263,1.173577e-07,


# Count of positive/negative features

Count of positive features

In [44]:
feature_importance[feature_importance['Feature Importance'] > 0].shape[0]

135

Count of negative features

In [43]:
feature_importance[feature_importance['Feature Importance'] < 0].shape[0]

158

In [47]:
feature_importance[feature_importance['Type'] == 'rare_AD']

array(['Common Haplotypes', 'Common Homozygous', 'rare_AD', 'rare_AR',
       'rare Hemizygous', 'Common Hemizygous'], dtype=object)

In [49]:
feature_importance[feature_importance['Type'] == 'rare_AD']['complete']

119     MICAL1_rare_AD
120      SGSM2_rare_AD
121    LRRC14B_rare_AD
122       NOD2_rare_AD
123      TICRR_rare_AD
124        PSD_rare_AD
125      ASXL1_rare_AD
126      RBM20_rare_AD
127    PLEKHA4_rare_AD
128      VPS16_rare_AD
129      DUOX2_rare_AD
Name: complete, dtype: object