In [68]:
import pandas as pd
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
import preprocessing as pp
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [69]:
# Relative locations of the two CSV datasets used in this notebook.
NEW_DATASET_PATH = 'data/new_dataset.csv'
MSR_DATASET_PATH = 'data/msr_dataset.csv'

new_data = pd.read_csv(NEW_DATASET_PATH)

# 'ANSI' is only a registered codec name on Windows (an alias of 'mbcs', i.e.
# the machine's active ANSI code page). On any other platform the lookup fails
# with LookupError, so fall back to an explicit code page there.
# NOTE(review): cp1252 assumes the file was written on a Western-European
# Windows locale - confirm against the machine that produced the CSV.
try:
    msr_data = pd.read_csv(MSR_DATASET_PATH, encoding='ANSI')
except LookupError:
    msr_data = pd.read_csv(MSR_DATASET_PATH, encoding='cp1252')

In [70]:
# Peek at the first five rows of the new dataset.
new_data.head(n=5)

Unnamed: 0,token,loc,class
0,subject logger HEADER SYNCHRONOUS setLayout er...,18,1
1,subject logger HEADER SYNCHRONOUS setLayout er...,13,1
2,msg getClass debug getName mp getBody logger s...,11,1
3,"(html)getClass(font color=""red"")(b) (/b)(/font...",11,1
4,msg (\d{2})\w{3} \d{2} \d{2}(:\d{2}){2} [\w.-]...,14,1


In [71]:
# Peek at the first five rows of the MSR dataset.
msr_data.head(n=5)

Unnamed: 0,token,loc,abstract_keyword,assert_keyword,boolean_keyword,break_keyword,byte_keyword,case_keyword,catch_keyword,char_keyword,...,void_keyword,volatile_keyword,while_keyword,true_keyword,null_keyword,false_keyword,const_keyword,goto_keyword,keywordcount,class
0,'test partial frame write assert false i...,14,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,9,0
1,'inherit wf false parent libs2 parent li...,14,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,3,1
2,'nn_host/ get default uri set tet get d...,5,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,4,0
3,'end_ points is_ security_ enabled get c...,13,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,9,1
4,'p1 p2 should fail test client retries ...,24,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,10,0


In [87]:
def preprocess(token):
    """Normalise a raw token string into a lower-case, alphanumeric-only text.

    The input is coerced with ``str()`` and lower-cased, then the following
    substitutions are applied in this order:

    1. the whole words ``row``, ``table``, ``insert`` and ``id`` -> ``dbms``
    2. http(s)/www URLs                                           -> ``urllink``
    3. dates written ``YYYY/MM/DD`` or ``YYYY-MM-DD``             -> ``datetime``
    4. dotted-quad IPv4 addresses                                 -> `` ipaddrezz ``
    5. stand-alone runs of digits                                 -> ``numeric``
    6. every run of non-alphanumeric characters                   -> one space

    Parameters
    ----------
    token : object
        Value to clean; anything that is not already a string is converted
        with ``str()`` first.

    Returns
    -------
    str
        The cleaned text. Leading/trailing separators are not stripped, so the
        result may start or end with a single space.
    """
    text = str(token).lower()

    text = re.sub(r'(\brow\b)|(\btable\b)|(\binsert\b)|(\bid\b)', 'dbms', text)

    # Raw string: the pattern contains \/ \. \s, which are invalid escape
    # sequences in an ordinary string literal.
    url_pattern = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    text = re.sub(url_pattern, 'urllink', text)

    # The backreference \1 forces both separators to be the same character,
    # which covers exactly the '/' and '-' forms.
    date_pattern = r'[12]\d{3}([/-])(?:0[1-9]|1[0-2])\1(?:0[1-9]|[12]\d|3[01])'
    text = re.sub(date_pattern, 'datetime', text)

    # Alternatives are ordered longest-first: regex alternation takes the first
    # branch that matches, so listing the single-digit branch first would cut
    # the final octet to one digit (e.g. '192.168.1.100' -> '192.168.1.1' + '00').
    octet = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
    ip_pattern = r'(?:' + octet + r'\.){3}' + octet
    text = re.sub(ip_pattern, ' ipaddrezz ', text)

    text = re.sub(r'\b[0-9]+\b', 'numeric', text)

    # Runs last so the punctuation the earlier patterns rely on is still there.
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)

    return text

In [88]:
# Attach the cleaned text as a new 'clean_token' column on both datasets.
for frame in (msr_data, new_data):
    frame['clean_token'] = frame['token'].map(preprocess)

In [94]:
# Compare raw and cleaned text side by side for rows at positions 1500-1699.
msr_data.loc[:, ['token', 'clean_token']].iloc[1500:1700]

Unnamed: 0,token,clean_token
1500,'set get conf value1 foo conf as list ...,set get conf value1 foo conf as list reconfig...
1501,'set classes to be excluded add record ...,set classes to be excluded add record to bund...
1502,'random output dir: period duration in m...,random output dir period duration in millis r...
1503,'read value get server address ninja tes...,read value get server address ninja test brow...
1504,'to char array assert false length is e...,to char array assert false length is empty cl...
...,...,...
1695,'rows_ one expect all keys in all but ...,rows one expect all keys in all but two rows ...
1696,"'avro test util {\""type\"":\""string\"" \""java...",avro test util type string java class org apa...
1697,'cluster locs racks conf wait for replic...,cluster locs racks conf wait for replication ...
1698,'ninja cache parse duration cache 10s ti...,ninja cache parse duration cache 10s time uti...


In [90]:
# Target labels of each dataset.
y_msr, y_new = msr_data['class'], new_data['class']

# 80% of the MSR rows go to training; the rest are held out for testing.
# The seed is fixed so the shuffle gives the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    msr_data['clean_token'],
    y_msr,
    train_size=0.8,
    shuffle=True,
    random_state=33,
)

In [91]:
# Count-based n-gram features (n-gram lengths 1 through 10) feeding a
# multinomial naive Bayes classifier.
pipeline_steps = [
    ('vectorizer', CountVectorizer(ngram_range=(1, 10))),
    ('model', MultinomialNB()),
]
text_clf2 = Pipeline(steps=pipeline_steps)

# Train on the MSR training split, then score the held-out MSR split.
text_clf2.fit(X_train, y_train)
preds = text_clf2.predict(X_test)

msr_report = classification_report(y_test, preds)
print(msr_report)

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       283
           1       0.94      0.98      0.96       278

    accuracy                           0.96       561
   macro avg       0.96      0.96      0.96       561
weighted avg       0.96      0.96      0.96       561



In [92]:
# Apply the classifier fitted on the MSR data to the separate new dataset.
new_preds = text_clf2.predict(new_data['clean_token'])

new_report = classification_report(y_new, new_preds)
print(new_report)

new_f1 = f1_score(y_new, new_preds)
print('f1', new_f1)

              precision    recall  f1-score   support

           0       0.65      0.44      0.53       724
           1       0.58      0.76      0.66       737

    accuracy                           0.60      1461
   macro avg       0.61      0.60      0.59      1461
weighted avg       0.61      0.60      0.59      1461

f1 0.6600117439812095
