In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
import arff
import pandas as pd
import re

# Delimiters meant to mirror Weka's WordTokenizer defaults
# (space, CR, LF, tab and . , ; : ' " ( ) ? !) plus form feed.
# Inside a character class every member is already an alternative, so the
# members must NOT be separated by "|": doing so would turn the pipe
# character itself into a delimiter and silently drop tokens such as "||".
_WEKA_DELIMITERS = re.compile("[ \n\f\r\t.,;:'\"()?!]")


def weka_tokenizer(doc):
    """Split ``doc`` into tokens on the Weka-style delimiter characters.

    Parameters
    ----------
    doc : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The non-empty fragments of ``doc`` found between delimiters, in
        their original order. Runs of consecutive delimiters yield no empty
        tokens, and an empty ``doc`` yields an empty list.
    """
    # re.split produces '' between adjacent delimiters; discard those.
    return [token for token in _WEKA_DELIMITERS.split(doc) if token]

In [2]:
# Load the raw dataset (path is relative to the notebook's working directory).
data = pd.read_csv('../../datasets/flakies_rq_21.csv')

# Keep the raw token text, the size/keyword count columns and the class label.
df = pd.DataFrame(data, columns=[
    'tokens', 'loc',
    'abstract_keyword', 'assert_keyword', 'boolean_keyword', 'break_keyword',
    'byte_keyword', 'case_keyword', 'catch_keyword', 'char_keyword',
    'class_keyword', 'continue_keyword', 'default_keyword', 'do_keyword',
    'double_keyword', 'else_keyword', 'enum_keyword', 'exports_keyword',
    'extends_keyword', 'final_keyword', 'finally_keyword', 'float_keyword',
    'for_keyword', 'if_keyword', 'implements_keyword', 'import_keyword',
    'instanceof_keyword', 'int_keyword', 'interface_keyword', 'long_keyword',
    'modules_keyword', 'native_keyword', 'new_keyword', 'package_keyword',
    'private_keyword', 'protected_keyword', 'public_keyword',
    'requires_keyword', 'return_keyword', 'short_keyword', 'static_keyword',
    'strictfp_keyword', 'super_keyword', 'switch_keyword',
    'synchronized_keyword', 'this_keyword', 'throw_keyword', 'throws_keyword',
    'transient_keyword', 'try_keyword', 'void_keyword', 'volatile_keyword',
    'while_keyword', 'true_keyword', 'null_keyword', 'false_keyword',
    'const_keyword', 'goto_keyword', 'keywordcount', 'klass',
])

# Class label, set aside before the label column is removed from the features.
y = df['klass']

# Bag-of-words over the token text, capped at the 1551 most frequent terms.
# token_pattern=None: the default pattern is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn about it.
vectorizer = CountVectorizer(analyzer='word', max_features=1551,
                             tokenizer=weka_tokenizer, token_pattern=None)
bowToken = vectorizer.fit_transform(df['tokens'])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()

bowData = pd.DataFrame(bowToken.toarray(), columns=featureNames)

# Feature matrix = numeric columns + bag-of-words counts (joined on the index).
# A non-mutating drop keeps the cell's data flow explicit.
df = df.drop(columns=['tokens', 'klass']).join(bowData)

# Mutual information between every (discrete) feature column and the label.
informationGain = dict(zip(df.columns, mutual_info_classif(df, y, discrete_features=True)))

# Re-attach the label so later cells can filter rows by class.
df = df.join(y)

# Feature names ordered from highest to lowest information gain.
sortedInformationGain = sorted(informationGain, key=informationGain.get, reverse=True)

In [3]:
# Per-feature summary: rank, information gain, and how many rows contain the
# feature at least once (overall and per class).

# The class masks do not depend on the feature, so build them once instead of
# re-evaluating the comparison on every iteration.
isFlaky = df['klass'] == "Flakey"
isNonFlaky = df['klass'] == "Non_Flakey"

sortedInformationGainPosition = []

for position, token in enumerate(sortedInformationGain):
    gain = informationGain[token]

    # Rows where the feature occurs at least once. Summing boolean masks
    # counts the rows without copying the whole (very wide) frame, which
    # len(df[mask]) would do three times per feature.
    present = df[token] > 0

    sortedInformationGainPosition.append({
        'position': position,  # 0-based rank, as written to the CSV
        'token': token,
        'information_gain': gain,
        # Key spellings are kept as-is: they are the CSV column headers.
        'total_ocurences': int(present.sum()),
        'total_flaky_occurences': int((present & isFlaky).sum()),
        'total_nonflaky_occurences': int((present & isNonFlaky).sum()),
    })

    # Progress line uses a 1-based rank.
    print(position + 1, token, gain)


infGainCSV = pd.DataFrame(sortedInformationGainPosition, columns=['position', 'token', 'information_gain', 'total_ocurences', 'total_flaky_occurences', 'total_nonflaky_occurences'])
infGainCSV.to_csv('information_gain_rq_21.csv')

1 public_keyword 8.187894806610529e-16
2 acl 8.187894806610529e-16
3 created 8.187894806610529e-16
4 reader 8.187894806610529e-16
5 createdirwithhttp 7.91033905045424e-16
6 createsnapshot 7.91033905045424e-16
7 directory 7.91033905045424e-16
8 folder 7.91033905045424e-16
9 gethadoopusers 7.91033905045424e-16
10 hadoopusersconftesthelper 7.91033905045424e-16
11 init 7.91033905045424e-16
12 no 7.91033905045424e-16
13 touri 7.91033905045424e-16
14 write 7.91033905045424e-16
15 assertnotnull 1.1102230246251565e-16
16 loc 0.0
17 abstract_keyword 0.0
18 assert_keyword 0.0
19 boolean_keyword 0.0
20 break_keyword 0.0
21 byte_keyword 0.0
22 case_keyword 0.0
23 catch_keyword 0.0
24 char_keyword 0.0
25 class_keyword 0.0
26 continue_keyword 0.0
27 default_keyword 0.0
28 do_keyword 0.0
29 double_keyword 0.0
30 else_keyword 0.0
31 enum_keyword 0.0
32 exports_keyword 0.0
33 extends_keyword 0.0
34 final_keyword 0.0
35 finally_keyword 0.0
36 float_keyword 0.0
37 for_keyword 0.0
38 if_keyword 0.0
39 imp