In [24]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
import preprocessing as pp
import pandas as pd

### Training a Naive Bayes model on the MSR data

In [25]:
# Load the MSR dataset and split the label column ('class') from the rest.
# 'ANSI' is only registered as a codec alias (for 'mbcs') on Windows, so the
# original call raised LookupError on Linux/macOS. 'cp1252' is the usual
# Windows "ANSI" code page and is available on every platform.
# NOTE(review): assumes the CSV was saved under a Western-European Windows
# locale -- confirm against the file's real encoding.
msr_data = pd.read_csv('data/msr_dataset.csv', encoding='cp1252')
y_msr = msr_data['class']
msr = msr_data.drop(columns=['class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    msr_data['token'], y_msr,
    train_size=0.8, random_state=33, shuffle=True)

In [26]:
# Bag-of-words counts (vocabulary capped at 1000 terms) feeding a multinomial
# naive Bayes classifier.
nb_steps = [
    ('vectorizer', CountVectorizer(max_features=1000)),
    ('model', MultinomialNB()),
]
text_clf1 = Pipeline(steps=nb_steps)

# `fit` returns the pipeline itself, so training and scoring the held-out
# MSR split can be chained.
preds = text_clf1.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       283
           1       0.93      0.79      0.86       278

    accuracy                           0.87       561
   macro avg       0.88      0.87      0.87       561
weighted avg       0.88      0.87      0.87       561



### Evaluating on new data 

In [27]:
# Second dataset, used only for evaluation of the model trained on MSR data.
new_data = pd.read_csv(filepath_or_buffer='data/new_dataset.csv')
# NOTE(review): token preprocessing is currently disabled, so the raw `token`
# column is what the classifiers see.
#new_data['processed_token'] = pp.preprocess_tokens(new_data)
y_new = new_data['class']

In [28]:
# Score the naive Bayes pipeline (trained on MSR) against the new dataset.
new_tokens = new_data['token']
new_preds = text_clf1.predict(new_tokens)

nb_new_f1 = f1_score(y_new, new_preds)
print(classification_report(y_new, new_preds))
print('f1', nb_new_f1)

              precision    recall  f1-score   support

           0       0.60      0.70      0.65       724
           1       0.65      0.53      0.59       737

    accuracy                           0.62      1461
   macro avg       0.62      0.62      0.62      1461
weighted avg       0.62      0.62      0.61      1461

f1 0.585003711952487


In [29]:
def show_most_informative_features(vectorizer, clf, n=50):
    """Print the `n` highest-weighted vocabulary terms of a fitted text model.

    One line is printed per term, formatted as ``<weight> <term>``, starting
    with the largest weight.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or, on older scikit-learn
        releases, ``get_feature_names()``.
    clf : fitted classifier
        Either a naive Bayes model exposing ``feature_log_prob_`` or a linear
        model exposing ``coef_``.
    n : int, default=50
        Maximum number of terms to print. Values <= 0 print nothing.
    """
    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement but keep the old name working for older installs.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    log_probs = getattr(clf, "feature_log_prob_", None)
    if log_probs is not None:
        # Naive Bayes lost its `coef_` attribute in scikit-learn 1.1.
        # Reproduce what `coef_[0]` used to return: the second class's row for
        # a binary problem, otherwise the first class's row.
        weights = log_probs[1] if len(log_probs) == 2 else log_probs[0]
    else:
        weights = clf.coef_[0]

    # Largest weight first; ties fall back to ordering by term.
    ranked = sorted(zip(weights, feature_names), reverse=True)
    for weight, term in ranked[:max(n, 0)]:
        print(weight, term)
        
# Inspect the vocabulary terms the naive Bayes pipeline weights most heavily.
show_most_informative_features(
    vectorizer=text_clf1.named_steps['vectorizer'],
    clf=text_clf1.named_steps['model'],
)

-3.130079605284166 get
-3.8983653086868344 job
-3.91694692439897 test
-4.017770999047708 action
-4.157179397137763 coord
-4.21405090809237 assert
-4.307813233085981 file
-4.314134366649352 to
-4.53458655245077 name
-4.57656204466834 set
-4.649212521554386 the
-4.784772312440301 not
-4.804250119399873 xml
-4.810480669150508 workflow
-4.857397588938261 conf
-4.8617739635380595 add
-4.873908696004053 2009
-4.88844221062022 should
-4.912374976831849 id
-4.957193798477095 create
-4.984090285342441 equals
-4.999071838958058 for
-5.0440955203320135 oozie
-5.057340747082034 services
-5.073470129011918 table
-5.078904924997875 app
-5.130628542851866 coordinator
-5.133502108047598 is
-5.134941993105586 wf
-5.143625115679047 assertequals
-5.179131804135957 start
-5.231694134226812 time
-5.234873787144192 in
-5.257419025465248 status
-5.268885162552892 data
-5.321306292916499 path
-5.3602492668651065 fail
-5.372965151190409 system
-5.374794977867485 value
-5.41211274087468 have
-5.427438711352908 

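The smaller the better: these values are the log-probabilities of each token under class 1, so a smaller magnitude (closer to zero) means the token occurs more often in that class.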

### Testing SGDClassifier

In [30]:
# Same 1000-term bag-of-words features, but with a hinge-loss, L2-regularised
# linear classifier trained by stochastic gradient descent.
sgd_model = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
sgd_steps = [
    ('vectorizer', CountVectorizer(max_features=1000)),
    ('model', sgd_model),
]
text_clf2 = Pipeline(steps=sgd_steps)

# Train on the MSR split and evaluate on the new dataset.
# Note: this rebinds `new_preds`, replacing the naive Bayes predictions.
new_preds = text_clf2.fit(X_train, y_train).predict(new_data['token'])
print(classification_report(y_new, new_preds))
print('f1', f1_score(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.55      0.48      0.51       724
           1       0.54      0.61      0.58       737

    accuracy                           0.54      1461
   macro avg       0.54      0.54      0.54      1461
weighted avg       0.54      0.54      0.54      1461

f1 0.5752551020408163
