In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [3]:
# Load the MSR results table and split it into labels (`class`) and the remaining columns.
# The file was originally read with encoding='ANSI', which Python only recognises on
# Windows (as an alias of the locale-dependent 'mbcs' codec) and which raises LookupError
# elsewhere. 'cp1252' is named explicitly so the cell runs on any platform.
# NOTE(review): assumes the CSV was saved under the Western-European Windows code page — confirm.
msr_data = pd.read_csv('../msr_results/tokenslockeywordskeywordcount.csv', encoding='cp1252')
y_msr = msr_data['class']
msr = msr_data.drop(columns=['class'])

In [4]:
# Read the second dataset, then pull the `class` column out as the target
# and keep every other column as the feature frame.
new_data = pd.read_csv('../new_data/dataset.csv')
y_new = new_data['class']
new = new_data.drop('class', axis=1)

### Merging datasets

In [5]:
# Stack the two datasets row-wise, keeping only the columns used below.
# A single concat with ignore_index=True gives the merged frame a fresh 0..n-1 index.
# Building it column by column from separate concats left duplicate index labels
# (each source frame's own labels repeated) and relied on every concat producing
# an identical index for the assignments to align.
merge_columns = ['token', 'loc', 'class']
merged = pd.concat([msr_data[merge_columns], new_data[merge_columns]],
                   axis=0, ignore_index=True)

# Separate the target from the feature columns.
merged_dropped = merged.drop(columns='class')
y_merge = merged['class']

# 80/20 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(merged_dropped, y_merge, train_size=0.8,
                                                    random_state=33, shuffle=True)

# Bag-of-words counts over the `token` column; the vocabulary (capped at 1000 terms)
# is learned from the training rows only and then applied to the test rows.
vectorizer = CountVectorizer(max_features=1000)
X_train_c = vectorizer.fit_transform(X_train['token'])
X_test_c = vectorizer.transform(X_test['token'])

# Seed the forest as well as the split: without random_state the report below and
# the feature importances change on every run.
merged_model = RandomForestClassifier(n_estimators=100, random_state=33)
merged_model.fit(X_train_c, y_train)
preds = merged_model.predict(X_test_c)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       443
           1       0.93      0.89      0.91       410

    accuracy                           0.92       853
   macro avg       0.92      0.92      0.92       853
weighted avg       0.92      0.92      0.92       853



In [6]:
# Rank the vectorizer's vocabulary terms by the importance the fitted forest assigns them.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

importances = pd.DataFrame(merged_model.feature_importances_, index=feature_names,
                           columns=['importance']).sort_values('importance', ascending=False)

# Show the ten highest-ranked terms.
importances.head(10)

Unnamed: 0,importance
table,0.023458
job,0.022046
test,0.021412
action,0.017885
xml,0.0138
services,0.013083
assert,0.012921
mtfs,0.012631
cluster,0.012088
getfilesystem,0.011129
