In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
import preprocessing as pp
import pandas as pd

### Training a Naive Bayes model on MSR data

In [3]:
# Load the MSR dataset and hold out 20% of it for in-domain evaluation.
#
# The encoding used to be 'ANSI', which Python only recognises on Windows
# (it is an alias of the 'mbcs' codec); on Linux/macOS it raises LookupError.
# 'cp1252' is the platform-independent name of the Western-European Windows
# code page.
# NOTE(review): this assumes the CSV was saved on a Western-locale Windows
# machine -- confirm if the file contains non-Latin text.
msr_data = pd.read_csv('data/msr_dataset.csv', encoding='cp1252')
y_msr = msr_data['class']
# Feature columns without the label; kept for any later cell that needs it.
msr = msr_data.drop(columns=['class'])

# Fixed random_state keeps the split reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    msr_data['token'],
    y_msr,
    train_size=0.8,
    random_state=33,
    shuffle=True,
)

In [10]:
# Bag-of-words counts feeding a multinomial Naive Bayes classifier. The custom
# stop-word list removes the listed tokens from the vocabulary before counting.
text_clf1 = Pipeline(steps=[
    (
        'vectorizer',
        CountVectorizer(
            stop_words=[
                'job', 'oozie', 'id', 'services', 'action', 'table', 'xml',
                'get', 'workflow', 'getid', 'cluster', 'wf', 'service',
                'coord',
            ],
        ),
    ),
    ('model', MultinomialNB()),
])

# Fit on the MSR training split (fit returns the pipeline itself), then score
# the held-out MSR test split.
preds = text_clf1.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95       283
           1       0.95      0.94      0.95       278

    accuracy                           0.95       561
   macro avg       0.95      0.95      0.95       561
weighted avg       0.95      0.95      0.95       561



### Evaluating on new data 

In [11]:
# Second dataset, used only for evaluation of the MSR-trained models.
new_data = pd.read_csv(filepath_or_buffer='data/new_dataset.csv')
# Preprocessing through pp.preprocess_tokens is currently switched off, so the
# raw 'token' column is what gets classified:
#   new_data['processed_token'] = pp.preprocess_tokens(new_data)
y_new = new_data.loc[:, 'class']

In [12]:
# Apply the MSR-trained Naive Bayes pipeline to the new dataset and report
# per-class metrics followed by the F1 score returned by f1_score.
new_preds = text_clf1.predict(new_data['token'])
print(classification_report(y_true=y_new, y_pred=new_preds))
print('f1', f1_score(y_true=y_new, y_pred=new_preds))

              precision    recall  f1-score   support

           0       0.64      0.50      0.56       724
           1       0.60      0.72      0.65       737

    accuracy                           0.61      1461
   macro avg       0.62      0.61      0.61      1461
weighted avg       0.62      0.61      0.61      1461

f1 0.6531862745098039


In [13]:
def show_most_informative_features(vectorizer, clf, n=50):
    """Print the ``n`` highest-weighted features of a fitted classifier.

    One line is printed per feature, formatted as ``<weight> <feature name>``
    and ordered from the largest weight downwards.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` (or the older
        ``get_feature_names()``).
    clf
        Fitted classifier exposing either ``feature_log_prob_`` (naive Bayes
        models) or ``coef_`` (linear models).
    n : int, default 50
        Maximum number of features to print.

    Returns
    -------
    None
    """
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); keep the old call as a fallback so the
    # notebook still runs on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Naive Bayes models no longer expose coef_. It used to mirror
    # feature_log_prob_: the second class's row for binary problems, otherwise
    # the first row. Reproduce that here; linear models still provide coef_.
    if hasattr(clf, 'feature_log_prob_'):
        log_probs = clf.feature_log_prob_
        weights = log_probs[1] if len(log_probs) == 2 else log_probs[0]
    else:
        weights = clf.coef_[0]

    # Sort ascending by (weight, name) and walk the last n entries backwards,
    # i.e. the n largest weights in descending order.
    ranked = sorted(zip(weights, feature_names))
    for weight, name in ranked[:-(n + 1):-1]:
        print(weight, name)
        
# Inspect the fitted Naive Bayes pipeline: pass its vectorizer and model steps.
show_most_informative_features(
    text_clf1.named_steps['vectorizer'],
    text_clf1.named_steps['model'],
)

-4.113309646938952 test
-4.410413630632352 assert
-4.504175955625963 file
-4.510497089189334 to
-4.730949274990752 name
-4.772924767208322 set
-4.845575244094368 the
-4.981135034980284 not
-5.053760311478243 conf
-5.058136686078042 add
-5.070271418544035 2009
-5.0848049331602025 should
-5.1535565210170775 create
-5.180453007882424 equals
-5.195434561498041 for
-5.2752676475378575 app
-5.3269912653918485 coordinator
-5.329864830587581 is
-5.339987838219029 assertequals
-5.375494526675939 start
-5.428056856766794 time
-5.431236509684174 in
-5.45378174800523 status
-5.465247885092874 data
-5.517669015456481 path
-5.556611989405089 fail
-5.5693278737303915 system
-5.5711577004074675 value
-5.6084754634146625 have
-5.62380143389289 and
-5.655176556460644 record
-5.655176556460644 bundle
-5.697909406046612 dir
-5.708358821920952 true
-5.7253083802347255 fs
-5.729591042026727 02
-5.7360496220661386 been
-5.738211785070634 call
-5.742550186669232 execute
-5.746907492038187 end
-5.7578845506693

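The values above are log-probabilities, $\log P(\text{token} \mid \text{class } 1)$, so they are always negative. The smaller the magnitude (the closer to zero), the more frequent the token is in class 1.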

### Testing SGDClassifier

In [30]:
# Linear SVM (hinge loss, L2 penalty) trained with SGD on the 1000 most
# frequent tokens. tol=None disables early stopping, so exactly max_iter
# epochs are run.
text_clf2 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000)),
    (
        'model',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

# Train on the MSR split (fit returns the pipeline itself) and evaluate
# directly on the new dataset.
new_preds = text_clf2.fit(X_train, y_train).predict(new_data['token'])
print(classification_report(y_true=y_new, y_pred=new_preds))
print('f1', f1_score(y_true=y_new, y_pred=new_preds))

              precision    recall  f1-score   support

           0       0.55      0.48      0.51       724
           1       0.54      0.61      0.58       737

    accuracy                           0.54      1461
   macro avg       0.54      0.54      0.54      1461
weighted avg       0.54      0.54      0.54      1461

f1 0.5752551020408163
