In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
import preprocessing as pp
import pandas as pd

### Training a Naive Bayes model on the MSR data

In [6]:
# Load the raw MSR dataset and split it into train / test partitions.
#
# 'ANSI' is a Windows-only codec alias (for 'mbcs'); on other platforms it
# raises LookupError. 'cp1252' names a concrete code page that every Python
# build ships, so the cell runs everywhere.
# NOTE(review): assumes the CSV was saved under Windows code page 1252 -- confirm.
msr_data = pd.read_csv('data/msr/raw_msr_dataset.csv', encoding='cp1252')
y_msr = msr_data['class']
msr = msr_data.drop(columns=['class'])

# 80% of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    msr_data['token'], y_msr,
    train_size=0.8, random_state=33, shuffle=True)

In [7]:
# Token counts (bag of words) feeding a multinomial Naive Bayes classifier.
text_clf1 = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('model', MultinomialNB()),
])
text_clf1.fit(X_train, y_train)

# Score the fitted pipeline on the held-out part of the MSR split.
preds = text_clf1.predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       283
           1       0.96      0.94      0.95       278

    accuracy                           0.95       561
   macro avg       0.95      0.95      0.95       561
weighted avg       0.95      0.95      0.95       561



### Evaluating on new data 

In [9]:
# Second dataset, used only for evaluation in this notebook.
new_data = pd.read_csv(filepath_or_buffer='data/new/raw_new_dataset.csv')
# Preprocessing step currently disabled:
#new_data['processed_token'] = pp.preprocess_tokens(new_data)
y_new = new_data['class']

In [10]:
# Apply the MSR-trained Naive Bayes pipeline to the raw tokens of the new dataset.
new_preds = text_clf1.predict(new_data['token'])
print(classification_report(y_true=y_new, y_pred=new_preds))
print('f1', f1_score(y_true=y_new, y_pred=new_preds))

              precision    recall  f1-score   support

           0       0.63      0.53      0.58       724
           1       0.60      0.70      0.65       737

    accuracy                           0.62      1461
   macro avg       0.62      0.62      0.61      1461
weighted avg       0.62      0.62      0.61      1461

f1 0.647834274952919


In [11]:
def show_most_informative_features(vectorizer, clf, n=50):
    """Print the ``n`` highest-weighted features, largest weight first.

    One line is printed per feature, in the form ``<weight> <feature name>``.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    clf
        Fitted classifier exposing ``coef_`` or, failing that,
        ``feature_log_prob_`` (e.g. ``MultinomialNB`` on releases where its
        ``coef_`` attribute no longer exists).
    n : int, default 50
        Maximum number of features to print.

    Returns
    -------
    None
    """
    # get_feature_names() was removed from scikit-learn vectorizers;
    # prefer the replacement and fall back only for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    weights = getattr(clf, 'coef_', None)
    if weights is None:
        # Naive Bayes models no longer expose coef_. The legacy attribute was
        # feature_log_prob_ with the first row dropped in the two-class case,
        # so reproduce that selection to keep the printed values unchanged.
        log_probs = clf.feature_log_prob_
        weights = log_probs[1:] if len(log_probs) == 2 else log_probs

    # Sort (weight, name) pairs in descending order and keep the first n.
    ranked = sorted(zip(weights[0], feature_names), reverse=True)[:n]
    for weight, name in ranked:
        print(weight, name)
        
# Inspect the top-weighted tokens of the Naive Bayes pipeline.
show_most_informative_features(
    text_clf1.named_steps['vectorizer'],
    text_clf1.named_steps['model'],
)

-3.446652136990842 get
-4.21493784039351 job
-4.233519456105646 test
-4.334343530754384 action
-4.473751928844439 coord
-4.5306234397990455 assert
-4.624385764792657 file
-4.630706898356028 to
-4.8511590841574455 name
-4.893134576375016 set
-4.965785053261062 the
-5.101344844146977 not
-5.120822651106549 xml
-5.127053200857184 workflow
-5.173970120644936 conf
-5.178346495244735 add
-5.190481227710729 2009
-5.205014742326896 should
-5.228947508538525 id
-5.273766330183771 create
-5.300662817049117 equals
-5.315644370664734 for
-5.360668052038689 oozie
-5.37391327878871 services
-5.390042660718594 table
-5.395477456704551 app
-5.447201074558542 coordinator
-5.450074639754274 is
-5.451514524812262 wf
-5.460197647385723 assertequals
-5.495704335842633 start
-5.548266665933488 time
-5.551446318850868 in
-5.573991557171924 status
-5.585457694259568 data
-5.637878824623175 path
-5.676821798571782 fail
-5.689537682897085 system
-5.691367509574161 value
-5.728685272581356 have
-5.74401124305958

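The values above are log-probabilities of each token given class 1, so they are always negative. The smaller the magnitude (i.e. the closer to zero), the better: the token is more frequent in that class.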

### Testing SGDClassifier

In [12]:
# Same bag-of-words front end, capped at 1000 vocabulary entries, feeding a
# hinge-loss SGD classifier with an L2 penalty.
text_clf2 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000)),
    ('model', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
text_clf2.fit(X_train, y_train)

# Evaluate on the new dataset. This rebinds new_preds, replacing the
# Naive Bayes predictions computed in the earlier cell.
new_preds = text_clf2.predict(new_data['token'])
print(classification_report(y_true=y_new, y_pred=new_preds))
print('f1', f1_score(y_true=y_new, y_pred=new_preds))

              precision    recall  f1-score   support

           0       0.55      0.48      0.51       724
           1       0.54      0.61      0.58       737

    accuracy                           0.54      1461
   macro avg       0.54      0.54      0.54      1461
weighted avg       0.54      0.54      0.54      1461

f1 0.5752551020408163
