In [54]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTENC
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from sklearn.ensemble import IsolationForest

In [55]:
virus_df = pd.read_csv('drive/MyDrive/ColabNotebooks/UMA/virus_df_full_only_age.csv').drop(['Country', 
                                                                                            'Age_Child', 
                                                                                            'Age_Adult',
                                                                                            'Age_Infant',
                                                                                            'Age_Elderly',
                                                                                            'Age_Mix',
                                                                                            'Health_Status', 
                                                                                            'Archaeal', 
                                                                                            'Bacteriophage', 
                                                                                            'Eukaryotic', 
                                                                                            'crassphage'], axis=1)


In [56]:
virus_df['Age'] = virus_df['Age'].map({'Child': 0, 'Infant': 1, 'Adult': 2, 'Elderly': 4, 'Mix': 3})

In [57]:
virus_df.head()

Unnamed: 0,Age,Healthy,Ackermannviridae,Adenoviridae,Alphaflexiviridae,Anelloviridae,Asfarviridae,Astroviridae,Bicaudaviridae,Caliciviridae,...,Papillomaviridae,Parvoviridae,Picornaviridae,Podoviridae,Polyomaviridae,Poxviridae,Rudiviridae,Siphoviridae,Smacoviridae,Virgaviridae
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.14,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1.0,0.0,0.0,0.0,3.84,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.84,0.0,0.0,0.0,3.84,0.0,0.0
4,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
def calculate_outliers(data):
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)
    iqr = q75 - q25
    boundaries = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
    return boundaries

In [59]:
def set_outliers_to_boundary(df):

    X = df.drop('Age', axis=1)

    isof = IsolationForest(random_state=42, contamination=0.05)
    isof.fit(X)

    outlier_preds = isof.predict(X)

    for i, pred in enumerate(outlier_preds):
        if pred == -1:
            upper_boundary = calculate_outliers(X.iloc[i])
            X.iloc[i] = X.iloc[i].clip(upper=upper_boundary[1])
    
    cleaned_df = pd.concat([X, df['Age']], axis=1)

    return cleaned_df

In [60]:
virus_df.head()

Unnamed: 0,Age,Healthy,Ackermannviridae,Adenoviridae,Alphaflexiviridae,Anelloviridae,Asfarviridae,Astroviridae,Bicaudaviridae,Caliciviridae,...,Papillomaviridae,Parvoviridae,Picornaviridae,Podoviridae,Polyomaviridae,Poxviridae,Rudiviridae,Siphoviridae,Smacoviridae,Virgaviridae
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.14,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1.0,0.0,0.0,0.0,3.84,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.84,0.0,0.0,0.0,3.84,0.0,0.0
4,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
target = 'Healthy'

X = virus_df.loc[:,virus_df.columns!=target]
y = virus_df.loc[:,virus_df.columns==target]

In [62]:
oversample = SMOTENC(categorical_features=[0], random_state=42)
X, y = oversample.fit_resample(X, y)

In [63]:
# imp_viruses = ['Myoviridae', 'Siphoviridae', 'Podoviridae', 'Microviridae']
X = set_outliers_to_boundary(X)



In [64]:
X.head()

Unnamed: 0,Ackermannviridae,Adenoviridae,Alphaflexiviridae,Anelloviridae,Asfarviridae,Astroviridae,Bicaudaviridae,Caliciviridae,Circoviridae,Cruliviridae,...,Parvoviridae,Picornaviridae,Podoviridae,Polyomaviridae,Poxviridae,Rudiviridae,Siphoviridae,Smacoviridae,Virgaviridae,Age
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,8.14,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,3.84,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.84,0.0,0.0,0.0,3.84,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [65]:
selector = SelectKBest(score_func=f_regression, k=10)
selector.fit(X, y)

selected_columns = X.columns[selector.get_support()]
print(selected_columns)

Index(['Adenoviridae', 'Circoviridae', 'Cruliviridae', 'Herpesviridae',
       'Microviridae', 'Myoviridae', 'Picornaviridae', 'Podoviridae',
       'Rudiviridae', 'Siphoviridae'],
      dtype='object')


  y = column_or_1d(y, warn=True)


In [66]:
X = X[selected_columns]

In [67]:
X.head()

Unnamed: 0,Adenoviridae,Circoviridae,Cruliviridae,Herpesviridae,Microviridae,Myoviridae,Picornaviridae,Podoviridae,Rudiviridae,Siphoviridae
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.14,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.84,0.0,3.84,0.0,3.84
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
virus_df = pd.concat([X, y], axis=1)

In [69]:
virus_df[virus_df > 0.0] = 1

In [70]:
virus_df.head()

Unnamed: 0,Adenoviridae,Circoviridae,Cruliviridae,Herpesviridae,Microviridae,Myoviridae,Picornaviridae,Podoviridae,Rudiviridae,Siphoviridae,Healthy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [71]:
virus_df.sum()

Adenoviridae        25.0
Circoviridae       184.0
Cruliviridae         8.0
Herpesviridae      146.0
Microviridae      1466.0
Myoviridae        1812.0
Picornaviridae      32.0
Podoviridae       2125.0
Rudiviridae          5.0
Siphoviridae      2227.0
Healthy           1338.0
dtype: float64

In [72]:
virus_df[virus_df['Healthy']==1.0].sum()

Adenoviridae         1.0
Circoviridae        51.0
Cruliviridae         8.0
Herpesviridae      110.0
Microviridae       647.0
Myoviridae         977.0
Picornaviridae      23.0
Podoviridae       1046.0
Rudiviridae          0.0
Siphoviridae      1142.0
Healthy           1338.0
dtype: float64

In [73]:
apriori_df = apriori(virus_df, min_support = 0.1, use_colnames = True)

In [74]:
apriori_df

Unnamed: 0,support,itemsets
0,0.547833,(Microviridae)
1,0.67713,(Myoviridae)
2,0.794096,(Podoviridae)
3,0.832212,(Siphoviridae)
4,0.5,(Healthy)
5,0.390508,"(Myoviridae, Microviridae)"
6,0.501121,"(Microviridae, Podoviridae)"
7,0.495142,"(Siphoviridae, Microviridae)"
8,0.241779,"(Healthy, Microviridae)"
9,0.620329,"(Myoviridae, Podoviridae)"


In [75]:
df_ar = association_rules(apriori_df, min_threshold = 0.1)

In [76]:
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Myoviridae),(Microviridae),0.677130,0.547833,0.390508,0.576711,1.052714,0.019554,1.068223
1,(Microviridae),(Myoviridae),0.547833,0.677130,0.390508,0.712824,1.052714,0.019554,1.124293
2,(Microviridae),(Podoviridae),0.547833,0.794096,0.501121,0.914734,1.151919,0.066090,2.414846
3,(Podoviridae),(Microviridae),0.794096,0.547833,0.501121,0.631059,1.151919,0.066090,1.225581
4,(Siphoviridae),(Microviridae),0.832212,0.547833,0.495142,0.594971,1.086045,0.039229,1.116382
...,...,...,...,...,...,...,...,...,...
175,(Healthy),"(Siphoviridae, Microviridae, Myoviridae, Podov...",0.500000,0.355007,0.168535,0.337070,0.949474,-0.008969,0.972943
176,(Siphoviridae),"(Healthy, Myoviridae, Microviridae, Podoviridae)",0.832212,0.171151,0.168535,0.202515,1.183251,0.026101,1.039328
177,(Podoviridae),"(Healthy, Siphoviridae, Microviridae, Myoviridae)",0.794096,0.182362,0.168535,0.212235,1.163815,0.023722,1.037922
178,(Myoviridae),"(Healthy, Siphoviridae, Microviridae, Podoviri...",0.677130,0.202541,0.168535,0.248896,1.228868,0.031388,1.061716


In [77]:
df_ar_healthy = df_ar[df_ar['consequents']==frozenset({'Healthy'})]
df_ar_healthy

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(Microviridae),(Healthy),0.547833,0.5,0.241779,0.441337,0.882674,-0.032138,0.894994
13,(Myoviridae),(Healthy),0.67713,0.5,0.365097,0.539183,1.078366,0.026532,1.08503
17,(Podoviridae),(Healthy),0.794096,0.5,0.390882,0.492235,0.984471,-0.006166,0.984708
19,(Siphoviridae),(Healthy),0.832212,0.5,0.426756,0.512797,1.025595,0.01065,1.026267
34,"(Myoviridae, Microviridae)",(Healthy),0.390508,0.5,0.185351,0.474641,0.949282,-0.009903,0.95173
46,"(Microviridae, Podoviridae)",(Healthy),0.501121,0.5,0.21562,0.430276,0.860552,-0.03494,0.877618
52,"(Siphoviridae, Microviridae)",(Healthy),0.495142,0.5,0.223094,0.450566,0.901132,-0.024477,0.910027
64,"(Myoviridae, Podoviridae)",(Healthy),0.620329,0.5,0.328849,0.53012,1.060241,0.018685,1.064103
70,"(Siphoviridae, Myoviridae)",(Healthy),0.660688,0.5,0.358744,0.542986,1.085973,0.028401,1.094059
76,"(Siphoviridae, Podoviridae)",(Healthy),0.744768,0.5,0.372571,0.500251,1.000502,0.000187,1.000502


In [78]:
df_ar_healthy['support'] = df_ar_healthy['support']/df_ar_healthy['consequent support']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ar_healthy['support'] = df_ar_healthy['support']/df_ar_healthy['consequent support']


In [79]:
df_ar_healthy

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(Microviridae),(Healthy),0.547833,0.5,0.483558,0.441337,0.882674,-0.032138,0.894994
13,(Myoviridae),(Healthy),0.67713,0.5,0.730194,0.539183,1.078366,0.026532,1.08503
17,(Podoviridae),(Healthy),0.794096,0.5,0.781764,0.492235,0.984471,-0.006166,0.984708
19,(Siphoviridae),(Healthy),0.832212,0.5,0.853513,0.512797,1.025595,0.01065,1.026267
34,"(Myoviridae, Microviridae)",(Healthy),0.390508,0.5,0.370703,0.474641,0.949282,-0.009903,0.95173
46,"(Microviridae, Podoviridae)",(Healthy),0.501121,0.5,0.431241,0.430276,0.860552,-0.03494,0.877618
52,"(Siphoviridae, Microviridae)",(Healthy),0.495142,0.5,0.446188,0.450566,0.901132,-0.024477,0.910027
64,"(Myoviridae, Podoviridae)",(Healthy),0.620329,0.5,0.657698,0.53012,1.060241,0.018685,1.064103
70,"(Siphoviridae, Myoviridae)",(Healthy),0.660688,0.5,0.717489,0.542986,1.085973,0.028401,1.094059
76,"(Siphoviridae, Podoviridae)",(Healthy),0.744768,0.5,0.745142,0.500251,1.000502,0.000187,1.000502


In [80]:
df_ar_healthy[df_ar_healthy['lift']>1.05]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
13,(Myoviridae),(Healthy),0.67713,0.5,0.730194,0.539183,1.078366,0.026532,1.08503
64,"(Myoviridae, Podoviridae)",(Healthy),0.620329,0.5,0.657698,0.53012,1.060241,0.018685,1.064103
70,"(Siphoviridae, Myoviridae)",(Healthy),0.660688,0.5,0.717489,0.542986,1.085973,0.028401,1.094059
139,"(Siphoviridae, Myoviridae, Podoviridae)",(Healthy),0.606502,0.5,0.649477,0.535428,1.070856,0.021487,1.07626
