In [7]:
import pandas as pd

# http://rasbt.github.io/mlxtend/
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

Ucitavanje podataka o ripitima

In [3]:
data = pd.read_csv('pravila/t_cell_ripiti_dn_3_pravila.csv', header=None)

In [4]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,P0DTC1.1,190,192,343,345,3,FCG,14,34,0,0,0,0,0
1,P0DTC1.1,1047,1049,3176,3178,3,EEA,376,1271,0,0,0,0,0
2,P0DTC1.1,3456,3459,4155,4158,4,AAGT,30,52,0,0,0,0,0
3,P0DTC1.1,565,567,2344,2346,3,AAI,974,3412,1,0,1,0,0
4,P0DTC1.1,565,567,2784,2786,3,AAI,974,3412,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3140224,SUJ59222.1,740,742,883,885,3,VWN,4,12,0,0,0,0,0
3140225,SUJ59222.1,740,742,751,753,3,VWN,4,12,0,0,0,0,0
3140226,SUJ59222.1,795,797,883,885,3,VWN,4,12,0,0,0,0,0
3140227,SUJ59222.1,751,753,795,797,3,VWN,4,12,0,0,0,0,0


Semplovanje i izdvajanje ciljnih atributa

In [5]:
subset = data.iloc[:,[6, 9]].sample(n=200000, replace=False).replace({ 0: 'IND=0', 1: 'IND=1' })
subset

Unnamed: 0,6,9
2816035,SLL,IND=0
2558522,AAAA,IND=1
1399528,SRASSR,IND=0
104382,DDK,IND=0
1716195,LAAD,IND=0
...,...,...
2196224,MTI,IND=0
613732,AGL,IND=0
266466,ITV,IND=0
1236600,PTG,IND=0


In [6]:
subset.values

array([['SLL', 'IND=0'],
       ['AAAA', 'IND=1'],
       ['SRASSR', 'IND=0'],
       ...,
       ['ITV', 'IND=0'],
       ['PTG', 'IND=0'],
       ['RERLYWELS', 'IND=0']], dtype=object)

Transformacija atributa u matricu čiji redovi odgovaraju epitopima a kolone pojedinačnim ripitima i indikatorima. Presek reda $i$ i kolone $j$ sadrži indikator da li ripit $j$ preseca (na bilo koji način) epitop $j$

In [8]:
te = TransactionEncoder()
item_set = te.fit_transform(subset.values)

Transformacija dobijene matrice u DataFrame objekat, radi bolje preglednosti

In [9]:
item_set_df = pd.DataFrame(data=item_set, columns=te.columns_)
item_set_df

Unnamed: 0,AAA,AAAA,AAAAA,AAAAAA,AAAAAAAA,AAAAAAAAA,AAAAAAAAAA,AAAAAAAAAAAA,AAAAAAAKAA,AAAAAKAA,...,YYC,YYCL,YYDP,YYFRV,YYFRVTA,YYG,YYL,YYP,YYR,YYS
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
199996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
199997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
199998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Izdvajanje čestih skupova pomoću FPGrowth algoritma

In [10]:
sets = fpgrowth(
    item_set_df, 
    min_support=0.0001, 
    use_colnames=True, 
    max_len=None, 
    verbose=0
)

In [11]:
sets

Unnamed: 0,support,itemsets
0,0.917405,(IND=0)
1,0.000815,(SLL)
2,0.082595,(IND=1)
3,0.000765,(AAAA)
4,0.000985,(PEV)
...,...,...
3626,0.000110,"(TSK, IND=0)"
3627,0.000110,"(GDS, IND=0)"
3628,0.000215,"(IND=0, GTL)"
3629,0.000105,"(SGQ, IND=0)"


Pronalaženje pravila pridruživanja na osnovu čestih skupova

In [12]:
rules = association_rules(sets, metric='confidence', min_threshold=0.5)

In [13]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(SLL),(IND=0),0.000815,0.917405,0.000750,0.920245,1.003096,0.000002,1.035614
1,(AAAA),(IND=0),0.000765,0.917405,0.000585,0.764706,0.833553,-0.000117,0.351029
2,(PEV),(IND=0),0.000985,0.917405,0.000980,0.994924,1.084498,0.000076,16.271215
3,(AGT),(IND=0),0.000495,0.917405,0.000435,0.878788,0.957906,-0.000019,0.681409
4,(GPPG),(IND=0),0.005430,0.917405,0.004995,0.919890,1.002708,0.000013,1.031013
...,...,...,...,...,...,...,...,...,...
1704,(TSK),(IND=0),0.000125,0.917405,0.000110,0.880000,0.959227,-0.000005,0.688292
1705,(GDS),(IND=0),0.000135,0.917405,0.000110,0.814815,0.888174,-0.000014,0.446013
1706,(GTL),(IND=0),0.000250,0.917405,0.000215,0.860000,0.937427,-0.000014,0.589964
1707,(SGQ),(IND=0),0.000105,0.917405,0.000105,1.000000,1.090031,0.000009,inf


Izdvajanje pravila koja sadrže indikator 1

In [14]:
for row in rules.values:
    if 'IND=1' in row[1]:
        print(row[:2], f'L={row[6]}')

[frozenset({'QPQ'}) frozenset({'IND=1'})] L=6.053635207942369
[frozenset({'RRRR'}) frozenset({'IND=1'})] L=7.387487033421197
[frozenset({'QPQQP'}) frozenset({'IND=1'})] L=6.787409172541444
