In [22]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer

In [20]:
# Load the two labelled corpora, keeping only the comment text column.
exem = pd.read_csv('exemplary_comments.csv')[['comment_body']].copy()
aver = pd.read_csv('average_comments.csv')[['comment_body']].copy()
exem['target'] = 1  # label for rows read from exemplary_comments.csv
aver['target'] = 0  # label for rows read from average_comments.csv

# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because later cells read `all['comment_body']` and `all['target']`.
# - ignore_index=True gives the combined frame a unique 0..n-1 index instead of
#   repeating each file's own row labels.
# - Rows without comment text are dropped because downstream code calls
#   `.strip()` on every comment, which fails on a missing (NaN) value.
all = (
    pd.concat([exem, aver], ignore_index=True)
    .dropna(subset=['comment_body'])
    .reset_index(drop=True)
)

In [32]:
# Split the combined frame into the raw comment strings (model input)
# and the 0/1 labels (model target).
text, y = all['comment_body'], all['target']

In [30]:
# Number of leading token ids kept per comment as model features.
MAX_TOKENS = 50

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap every comment in the [CLS] ... [SEP] markers before tokenising.
tagged = ['[CLS] ' + item.strip() + ' [SEP]' for item in text]

# Word-piece tokenise each comment and map the tokens to vocabulary ids.
token_ids = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(item))
    for item in tagged
]

# Length of the longest tokenised comment (0 when there are no comments).
max_len = max((len(ids) for ids in token_ids), default=0)

# Pad (with 0) or truncate every row directly to the final feature width.
# The width is min(max_len, MAX_TOKENS), which yields exactly the same frame as
# padding every row to max_len and then slicing the first MAX_TOKENS columns,
# without materialising the full-width padded rows first.
# NOTE(review): 0 is presumably the [PAD] id of this vocabulary - confirm.
width = min(max_len, MAX_TOKENS)
X = pd.DataFrame(
    [ids[:width] + [0] * (width - len(ids)) for ids in token_ids]
)

In [34]:
### Grid Search ###
def gridSearch(model, param, *, features=None, labels=None,
               test_size=0.2, random_state=42, scoring='f1'):
    """Grid-search `model` over `param` and print how the best model performs.

    The data is split into train and test parts, `GridSearchCV` is fitted on
    the training part, and the confusion matrix of its predictions on the
    held-out test part is printed together with the best parameter
    combination found.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param : dict
        Parameter grid passed to `GridSearchCV`.
    features, labels : optional
        Feature matrix and target vector. When omitted, the notebook-level
        `X` and `y` are used, which is what existing two-argument calls get.
    test_size : float, default 0.2
        Fraction of the data held out for the confusion matrix.
    random_state : int, default 42
        Seed for the train/test split.
    scoring : str, default 'f1'
        Metric `GridSearchCV` uses to rank parameter combinations.

    Returns
    -------
    None
        Results are reported via `print` only.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if features is None:
        features = X
    if labels is None:
        labels = y

    # Hold out a test set that the search itself never sees.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # Cross-validated search over the grid, using all available cores.
    gscv = GridSearchCV(model, param, scoring=scoring, n_jobs=-1)
    gscv.fit(X_train, y_train)

    # predict() on the fitted search delegates to the best estimator found.
    y_predict = gscv.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    print('Confusion Matrix:\n', cm, '\n', 'best parameters:', gscv.best_params_)

In [38]:
# Random forest to tune. Seeded so that the parameter combination selected by
# the search is reproducible when the notebook is re-run.
rf = RandomForestClassifier(random_state=42)

# Grid explored by the search.
rf_param = {
    'min_samples_split': range(2, 7, 2),    # 2, 4, 6
    'n_estimators': range(50, 151, 20),     # 50, 70, 90, 110, 130, 150
}

gridSearch(rf, rf_param)

Confusion Matrix:
 [[186  62]
 [ 44 252]] 
 best parameters: {'min_samples_split': 6, 'n_estimators': 70}


In [37]:
# Gradient boosting model to tune. Seeded so that the parameter combination
# selected by the search is reproducible when the notebook is re-run.
gb = GradientBoostingClassifier(random_state=42)

# Grid explored by the search.
gb_param = {
    'min_samples_split': range(2, 7, 2),    # 2, 4, 6
    'n_estimators': range(50, 151, 20),     # 50, 70, 90, 110, 130, 150
}

gridSearch(gb, gb_param)

Confusion Matrix:
 [[189  59]
 [ 47 249]] 
 best parameters: {'min_samples_split': 2, 'n_estimators': 70}


In [39]:
### Use best parameters found on the grid search to build models ###
# The hyper-parameters below are copied by hand from the "best parameters"
# lines printed by the grid-search cells; update them if the searches are
# re-run and report different values.
# rf and gb are seeded so the ensemble's result is reproducible across runs.
svc = SVC(gamma='auto')
rf = RandomForestClassifier(min_samples_split=6, n_estimators=70, random_state=42)
gb = GradientBoostingClassifier(min_samples_split=2, n_estimators=70, random_state=42)

# Hard voting: the ensemble predicts the class chosen by most base models.
voting = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('svc', svc)],
    voting='hard',
)

In [41]:
# Hold out 20% of the data for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Majority-vote ensemble over the three base classifiers.
voting = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('svc', svc)],
    voting='hard',
)

# fit() returns the fitted ensemble, so prediction can be chained onto it.
y_pred = voting.fit(X_train, y_train).predict(X_test)

# Report the held-out confusion matrix of the ensemble.
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)

Confusion Matrix:
 [[182  66]
 [ 41 255]]
