In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
import arff
import pandas as pd
import re

# Delimiter characters: space, newline, form feed, carriage return, tab and
# the punctuation . , ; : ' " ( ) ? !
# Inside a character class `|` is a literal pipe, not alternation, so it must
# not be used as a separator between the members (doing so made `|` itself a
# delimiter and dropped tokens such as `||`).
_WEKA_DELIMITERS = re.compile("[ \n\f\r\t.,;:'\"()?!]")


def weka_tokenizer(doc):
    """Split ``doc`` into tokens on a fixed set of delimiter characters.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        The non-empty substrings of ``doc`` found between delimiters, in
        their original order. Consecutive delimiters produce no empty tokens.
    """
    # re.split yields '' between adjacent delimiters; keep only real tokens.
    return [token for token in _WEKA_DELIMITERS.split(doc) if token]

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../../datasets/flakies_rq_22.csv')

# Columns used by the analysis: the raw token text, the numeric metrics and
# the class label. Building the frame with `columns=` reindexes `data`, so a
# name that is absent from the CSV becomes an all-NaN column instead of
# raising.
df = pd.DataFrame(data, columns=[
    'tokens', 'loc',
    'abstract_keyword', 'assert_keyword', 'boolean_keyword', 'break_keyword',
    'byte_keyword', 'case_keyword', 'catch_keyword', 'char_keyword',
    'class_keyword', 'continue_keyword', 'default_keyword', 'do_keyword',
    'double_keyword', 'else_keyword', 'enum_keyword', 'exports_keyword',
    'extends_keyword', 'final_keyword', 'finally_keyword', 'float_keyword',
    'for_keyword', 'if_keyword', 'implements_keyword', 'import_keyword',
    'instanceof_keyword', 'int_keyword', 'interface_keyword', 'long_keyword',
    'modules_keyword', 'native_keyword', 'new_keyword', 'package_keyword',
    'private_keyword', 'protected_keyword', 'public_keyword',
    'requires_keyword', 'return_keyword', 'short_keyword', 'static_keyword',
    'strictfp_keyword', 'super_keyword', 'switch_keyword',
    'synchronized_keyword', 'this_keyword', 'throw_keyword', 'throws_keyword',
    'transient_keyword', 'try_keyword', 'void_keyword', 'volatile_keyword',
    'while_keyword', 'true_keyword', 'null_keyword', 'false_keyword',
    'const_keyword', 'goto_keyword',
    'keywordcount', 'klass',
])

# Class label, kept aside while the feature matrix is assembled.
y = df['klass']

# Bag-of-words counts over the token text, capped at 1551 vocabulary entries.
vectorizer = CountVectorizer(analyzer='word', max_features=1551, tokenizer=weka_tokenizer)
bowToken = vectorizer.fit_transform(df['tokens'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    bowFeatureNames = vectorizer.get_feature_names_out()
else:
    bowFeatureNames = vectorizer.get_feature_names()

# Reuse df's index so the join below is strictly row-for-row.
bowData = pd.DataFrame(bowToken.toarray(), columns=bowFeatureNames, index=df.index)

# Feature matrix = numeric metrics + token counts (text and label removed).
df = df.drop(['tokens', 'klass'], axis=1)
df = df.join(bowData)

# Mutual information between every (discrete) feature and the class label.
# NOTE(review): the recorded output of this notebook shows gains around 1e-16
# for the top-ranked features, which is what mutual information yields when
# the label is constant -- confirm that `klass` contains both classes.
informationGain = dict(zip(df.columns, mutual_info_classif(df, y, discrete_features=True)))

# Re-attach the label so later cells can filter rows by class.
df = df.join(y)

# Feature names ordered from highest to lowest information gain.
sortedInformationGain = sorted(informationGain, key=informationGain.get, reverse=True)

In [3]:
# One summary record per feature, in descending information-gain order.
sortedInformationGainPosition = []

# Row masks for the two class labels; they do not depend on the feature.
isFlaky = df['klass'] == "Flakey"
isNonFlaky = df['klass'] == "Non_Flakey"

for position, token in enumerate(sortedInformationGain):
    gain = informationGain[token]
    # Rows in which this feature occurs at least once.
    present = df[token] > 0

    # 'position' stored in the record is zero-based.
    sortedInformationGainPosition.append({
        'position': position,
        'token': token,
        'information_gain': gain,
        'total_ocurences': int(present.sum()),
        'total_flaky_occurences': int((present & isFlaky).sum()),
        'total_nonflaky_occurences': int((present & isNonFlaky).sum()),
    })

    # The printed rank is one-based.
    print(position + 1, token, gain)


# Persist the ranking; the column list fixes the order of fields in the CSV.
summaryColumns = [
    'position',
    'token',
    'information_gain',
    'total_ocurences',
    'total_flaky_occurences',
    'total_nonflaky_occurences',
]
infGainCSV = pd.DataFrame(sortedInformationGainPosition, columns=summaryColumns)
infGainCSV.to_csv('information_gain_rq_22.csv')

1 getname 8.881784197001252e-16
2 namingcontext 8.881784197001252e-16
3 same 8.881784197001252e-16
4 await 8.673617379884035e-16
5 export 8.673617379884035e-16
6 return_keyword 8.465450562766819e-16
7 be 8.465450562766819e-16
8 handler 8.465450562766819e-16
9 is 8.465450562766819e-16
10 this_keyword 8.396061623727746e-16
11 protocol 8.396061623727746e-16
12 } 8.396061623727746e-16
13 collections 8.187894806610529e-16
14 context 8.187894806610529e-16
15 server 8.118505867571457e-16
16 task 8.049116928532385e-16
17 of 7.632783294297951e-16
18 taskpayloadbuilder 7.632783294297951e-16
19 url 6.938893903907228e-16
20 try_keyword 6.661338147750939e-16
21 class_keyword 5.412337245047638e-16
22 assertequals 3.885780586188048e-16
23 add 8.326672684688674e-17
24 result 8.326672684688674e-17
25 false_keyword 5.551115123125783e-17
26 assert 5.551115123125783e-17
27 build 5.551115123125783e-17
28 fail 5.551115123125783e-17
29 lookup 5.551115123125783e-17
30 ok 5.551115123125783e-17
31 with 5.551115