In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [2]:
# Load the MSR dataset and separate the label column from the remaining columns.
# 'ANSI' is only accepted as a codec name on Windows (it is an alias of the
# locale-dependent 'mbcs' codec) and raises LookupError on other platforms.
# 'cp1252' names the Western-European Windows code page explicitly, so the cell
# behaves the same on every operating system.
# NOTE(review): assumes the CSV was saved on a Western-locale Windows machine - confirm.
msr_data = pd.read_csv('data/msr/raw_msr_dataset.csv', encoding='cp1252')
y_msr = msr_data['class']
msr = msr_data.drop(columns=['class'])

In [3]:
# Load the new dataset, then split it into the feature columns (`new`)
# and the label column (`y_new`).
new_data = pd.read_csv('data/new/raw_new_dataset.csv')
new = new_data.drop(columns='class')
y_new = new_data['class']

### Merging datasets

This experiment investigates whether the features that matter most to the model change when the two datasets (msr and new) are merged.

In [4]:
# Stack the two datasets row-wise, keeping only the columns used in this experiment.
# A single concat replaces the column-by-column build on an empty frame, and
# ignore_index=True gives every row a unique label; without it the row labels of
# both source frames are kept and repeat, which makes any later label-based
# lookup or alignment on `merged` ambiguous.
shared_columns = ['token', 'loc', 'class']
merged = pd.concat(
    [msr_data[shared_columns], new_data[shared_columns]],
    axis=0,
    ignore_index=True,
)

merged_dropped = merged.drop(columns='class')
y_merge = merged['class']

# 80/20 hold-out split on the raw token text; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(merged_dropped['token'], y_merge,
                                                    train_size=0.8, random_state=33,
                                                    shuffle=True)

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words token counts (English stop words removed) feeding a multinomial Naive Bayes model.
text_clf = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('model', MultinomialNB()),
])

# Re-attach the labels so that each row is shuffled together with its label.
merged_dropped['class'] = y_merge
# Seeded shuffle: without random_state the row order differs on every run, so any
# score computed from `merged_shuffled` could not be reproduced. 33 matches the
# seed already used for the train/test split.
merged_shuffled = merged_dropped.sample(frac=1, random_state=33)

In [12]:
# 5-fold cross-validation of the whole pipeline on the shuffled, merged data.
scores = cross_val_score(
    text_clf,
    merged_shuffled['token'],
    merged_shuffled['class'],
    cv=5,
)
# Report the mean score with a two-standard-deviation spread, then the per-fold values.
mean_score = scores.mean()
two_std = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_std))
print(scores)

Accuracy: 0.90 (+/- 0.03)
[0.88745604 0.8898007  0.9144197  0.88511137 0.91324736]


In [13]:
# Fit the pipeline on the training split and predict the held-out split;
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
preds = text_clf.fit(X_train, y_train).predict(X_test)
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.91      0.89       443
           1       0.89      0.86      0.88       410

    accuracy                           0.88       853
   macro avg       0.88      0.88      0.88       853
weighted avg       0.88      0.88      0.88       853



In [19]:
def show_most_informative_features(vectorizer, clf, n=100):
    """Print the ``n`` highest-weighted vocabulary terms of a fitted classifier.

    One line is printed per term, as ``<weight> <term>``, from the largest
    weight downwards. Terms with equal weight appear in reverse alphabetical
    order.

    Parameters
    ----------
    vectorizer : fitted text vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    clf : fitted classifier
        Either a naive Bayes model exposing ``feature_log_prob_`` or any
        estimator exposing ``coef_``.
    n : int, default 100
        Maximum number of terms to print. Values below 1 print nothing.

    Notes
    -----
    For a naive Bayes model the weight is the log-probability of the term
    within one class (the second class of a binary model, otherwise the first
    class). It therefore ranks the terms that are most frequent in that class,
    not the terms that best separate the classes.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # The naive Bayes `coef_` attribute was deprecated in scikit-learn 0.24 and
    # removed in 1.1. The row chosen here is the one `coef_[0]` used to expose:
    # the second class of a binary model, otherwise the first class.
    log_probs = getattr(clf, 'feature_log_prob_', None)
    if log_probs is not None:
        weights = log_probs[1] if len(log_probs) == 2 else log_probs[0]
    else:
        weights = clf.coef_[0]

    # Sorting (weight, term) pairs in descending order breaks ties on the term.
    ranked = sorted(zip(weights, feature_names), reverse=True)
    for weight, term in ranked[:max(n, 0)]:
        print(weight, term)
        
# Rank the vocabulary using the vectorizer and model steps of the pipeline.
show_most_informative_features(
    vectorizer=text_clf['vectorizer'],
    clf=text_clf['model'],
)

-4.117594170779464 job
-4.171218639895159 test
-4.240872549235573 action
-4.409403254347518 assert
-4.427169535912808 coord
-4.578788568818287 file
-4.7976744459564316 set
-4.989641108749589 workflow
-5.007919635922204 add
-5.061771130732794 assertequals
-5.065877912685447 xml
-5.089824918535958 conf
-5.14840999887103 id
-5.198821588572511 equals
-5.204714357539663 create
-5.220200074979043 table
-5.220200074979043 2009
-5.3014439839982135 oozie
-5.30535788331935 services
-5.327830739171408 wf
-5.37715580573391 app
-5.381378190613758 coordinator
-5.411444879212542 start
-5.459072928201797 fail
-5.504287176512965 status
-5.52850143463356 time
-5.568506769247259 value
-5.592604320826319 data
-5.619084243990787 path
-5.644447922482642 bundle
-5.674245285287586 record
-5.678033168604523 asserttrue
-5.679932504408176 service
-5.720675287870526 execute
-5.738657790420958 close
-5.744724543103196 fs
-5.750828326041214 dir
-5.7610848262084025 size
-5.788254731560331 true
-5.790375623129468 end