In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
import arff
import pandas as pd
import re

# Delimiter set used by weka_tokenizer, compiled once at definition time.
# Inside a character class `|` is a literal character, not alternation, so the
# members are listed back to back; otherwise the pipe itself would be treated
# as a delimiter.
_WEKA_DELIMITERS_REGEXP = re.compile("[ \n\f\r\t.,;:'\"()?!]")


def weka_tokenizer(doc):
    """Split ``doc`` into tokens on a fixed set of delimiter characters.

    The delimiters are space, newline, form feed, carriage return, tab and
    the punctuation characters ``. , ; : ' " ( ) ? !``.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        The non-empty pieces of ``doc`` found between delimiters, in order.
        Empty strings produced by consecutive or leading/trailing delimiters
        are discarded.
    """
    return [token for token in _WEKA_DELIMITERS_REGEXP.split(doc) if token]

In [2]:
# Load the dataset rows from the ARFF file (path is relative to the notebook).
data = arff.load("../../datasets/MSR4FlakinessOriginal.arff")

# Column names are assigned positionally: the raw token text first, then the
# numeric features, and the class label ('klass') last.
df = pd.DataFrame(data, columns=['tokens', 'loc', 'abstract_keyword', 'assert_keyword', 'boolean_keyword', 'break_keyword', 'byte_keyword', 'case_keyword', 'catch_keyword', 'char_keyword', 'class_keyword', 'continue_keyword', 'default_keyword', 'do_keyword', 'double_keyword', 'else_keyword', 'enum_keyword', 'exports_keyword', 'extends_keyword', 'final_keyword', 'finally_keyword', 'float_keyword', 'for_keyword', 'if_keyword', 'implements_keyword', 'import_keyword', 'instanceof_keyword', 'int_keyword', 'interface_keyword', 'long_keyword', 'modules_keyword', 'native_keyword', 'new_keyword', 'package_keyword', 'private_keyword', 'protected_keyword', 'public_keyword', 'requires_keyword', 'return_keyword', 'short_keyword', 'static_keyword', 'strictfp_keyword', 'super_keyword', 'switch_keyword', 'synchronized_keyword', 'this_keyword', 'throw_keyword', 'throws_keyword', 'transient_keyword', 'try_keyword', 'void_keyword', 'volatile_keyword', 'while_keyword', 'true_keyword', 'null_keyword', 'false_keyword', 'const_keyword', 'goto_keyword', 'keywordcount', 'klass'])

# Keep the class label aside; it is removed from the feature frame below and
# re-attached once the features have been scored.
y = df['klass']

# Bag-of-words over the 'tokens' column, tokenized with weka_tokenizer and
# limited to a vocabulary of at most 1500 terms.
vectorizer = CountVectorizer(analyzer='word', max_features=1500, tokenizer=weka_tokenizer)
bowToken = vectorizer.fit_transform(df['tokens'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()

bowData = pd.DataFrame(bowToken.toarray(), columns=featureNames)

# Replace the raw text and the label with the per-term count columns, leaving
# a purely numeric feature frame. The join aligns on the row index.
# NOTE(review): join is expected to fail if a vocabulary term has the same
# name as one of the existing feature columns - confirm if the data changes.
df = df.drop(['tokens', 'klass'], axis=1).join(bowData)

# Score every feature column against the class label, treating all features
# as discrete, and keep the scores keyed by column name.
informationGain = dict(zip(df.columns, mutual_info_classif(df, y, discrete_features=True)))

# Re-attach the label so later cells can filter rows by class.
df = df.join(y)

# Feature names ordered from highest to lowest score.
sortedInformationGain = sorted(informationGain, key=informationGain.get, reverse=True)

In [3]:
# Row masks for each class label. They do not depend on the feature being
# inspected, so they are computed once instead of once per feature.
isFlaky = df['klass'] == "Flakey"
isNonFlaky = df['klass'] == "Non_Flakey"

i = 0
sortedInformationGainPosition = []

# `i` is the 1-based rank that gets printed; the 'position' stored in the CSV
# is 0-based (i - 1).
for i, r in enumerate(sortedInformationGain, start=1):

    # Rows in which feature `r` has a value greater than zero.
    present = df[r] > 0

    infGain = {
        'position': i - 1,
        'token': r,
        'information_gain': informationGain[r],
        # Counts are numbers of rows, obtained by summing the boolean masks.
        'total_ocurences': int(present.sum()),
        'total_flaky_occurences': int((present & isFlaky).sum()),
        'total_nonflaky_occurences': int((present & isNonFlaky).sum())
    }

    sortedInformationGainPosition.append(infGain)

    print(i, r, informationGain[r])


# One row per feature, in descending score order, written next to the notebook.
infGainCSV = pd.DataFrame(sortedInformationGainPosition, columns=['position', 'token', 'information_gain', 'total_ocurences', 'total_flaky_occurences', 'total_nonflaky_occurences'])
infGainCSV.to_csv('information_gain_rq_13.csv')

1 job 0.14491500049749653
2 table 0.1029796587876006
3 id 0.1004319891651803
4 services 0.09776092568455451
5 action 0.09721660905697455
6 oozie 0.09424476112449412
7 loc 0.08793936976295982
8 coord 0.08263782720878801
9 xml 0.07528721680908017
10 getid 0.07462750408637384
11 coordinator 0.07418817836510425
12 get 0.06913188837222863
13 workflow 0.06333959802931788
14 throws_keyword 0.06156855531411635
15 getstatus 0.0613196196267442
16 record 0.05969417540133884
17 service 0.059092626070019125
18 jpa 0.05411552613636775
19 jpaservice 0.05218170354304141
20 wf 0.04998120143983045
21 coordinatorjob 0.047790824376541696
22 call 0.04777561167742202
23 getfilesystem 0.04728787446811885
24 case 0.04693691237083236
25 coordinatoraction 0.04289899445863557
26 app 0.04237018377976755
27 execute 0.04031521206350207
28 system 0.038775062036519474
29 addrecordtocoordjobtable 0.03859244700863951
30 keywordcount 0.03852093132358067
31 assert 0.03560658332123451
32 node 0.035532751866200724
33 evalu