In [None]:
import sys, os, re, gzip, json, pickle, shutil, random, joblib

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Directory holding the pickled feature files, relative to the notebook.
data_path = '../data'

# Feature pickles for normal users and for troll accounts.
user_feature_path = "/".join((data_path, "user_features.pkl"))
troll_feature_path = "/".join((data_path, "troll_features.pkl"))

In [None]:
def getDataFromPickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    Args:
        path: Filesystem path of the pickle file to read.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle
            (``pickle.load`` may raise other exceptions as well).
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the handle is closed, even on error.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [None]:
# Read both pre-computed feature collections from disk:
# normal-user records first, then troll-account records.
user_feature_list = getDataFromPickle(path=user_feature_path)
troll_feature_list = getDataFromPickle(path=troll_feature_path)

In [None]:
# Build a class-balanced corpus: at most N_PER_CLASS records from each source,
# i.e. up to 1,000,000 records in total, which is what the "1000000_*"
# artifact file names used later in this notebook refer to.
# (The troll slice previously used a bound of 5000000, ten times the user
# bound, which would unbalance the classes whenever more than 500,000 troll
# records are available.)
N_PER_CLASS = 500000
combine = user_feature_list[:N_PER_CLASS] + troll_feature_list[:N_PER_CLASS]

In [None]:
# Shuffle in place with a fixed seed so that the tail-based train/test split
# made later in the notebook is reproducible after a kernel restart.
# A private Random instance is used so the global `random` state is untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(combine)

# Binary labels aligned with the shuffled order:
# 0 when the record's "type" is 'NormalUser', 1 for anything else.
y = [0 if f["type"] == 'NormalUser' else 1 for f in combine]

In [None]:
# Persist the shuffled records and their labels as gzip-compressed pickles.
# NOTE(review): the label file is written through gzip as well, although its
# name has no ".gz" suffix — readers must open it with gzip.open.
for target_path, payload in (
    ('../data/1000000_features_x.pkl.gz', combine),
    ('../data/1000000_features_y.pkl', y),
):
    with gzip.open(target_path, 'wb') as sink:
        pickle.dump(payload, sink)

In [None]:
# Extract the two text views of every record: the "tokens" field and the
# "lemmas" field. Each is vectorised separately further down.
texts = [record["tokens"] for record in combine]
lemmas = [record["lemmas"] for record in combine]

In [None]:
# Release the references to the large raw record lists; the names stay bound
# (to None) so later cells that mention them do not raise NameError.
troll_feature_list = None
user_feature_list = None
combine = None

In [None]:
# TF-IDF representation of the token texts: vocabulary capped at 50,000
# features, with scikit-learn's built-in English stop-word list applied.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
tfidf = tfidf_vectorizer.fit_transform(texts)

# Cache the resulting matrix next to the notebook for reuse.
joblib.dump(value=tfidf, filename='1000000_tfidf_texts.joblib')

In [None]:
# Hold out the final 10,000 (already shuffled) rows as the test set;
# every row before them is used for training.
x_tfidf_train, x_tfidf_test = tfidf[:-10000, :], tfidf[-10000:, :]
y_tfidf_train, y_tfidf_test = np.array(y[:-10000]), np.array(y[-10000:])

In [None]:
# Linear SVM on the token TF-IDF features, solved in the primal form
# (dual=False); fit() returns the estimator itself, so it can be chained.
svm = LinearSVC(dual=False).fit(x_tfidf_train, y_tfidf_train)

# Mean accuracy on the held-out rows.
print(svm.score(x_tfidf_test, y_tfidf_test))

In [None]:
# Predict the held-out rows with the SVM that was fitted on the training split.
# The previous cross_val_predict(svm, x_tfidf_test, y_tfidf_test, cv=2) call
# cloned the estimator and re-trained it on halves of the *test* set, so the
# classification report and confusion matrix built from `predicted` did not
# describe the trained model whose accuracy is printed above.
predicted = svm.predict(x_tfidf_test)

In [None]:
# Per-class precision / recall / F1 for the token-feature predictions.
print(
    classification_report(
        y_true=y_tfidf_test,
        y_pred=predicted,
        target_names=['NormalUser', 'TrollUser'],
    )
)

In [None]:
# Confusion matrix of the token-feature predictions, rendered as a plot.
cm = confusion_matrix(y_true=y_tfidf_test, y_pred=predicted)
cmd = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['NormalUser', 'TrollUser'],
)
cmd.plot()

In [None]:
# Pair every vocabulary term with its LinearSVC weight and tabulate them in
# ascending coefficient order (most negative weights first).
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
coefs_with_fns = sorted(zip(svm.coef_[0], feature_names))
df = pd.DataFrame(coefs_with_fns, columns=['coefficient', 'word'])
# Last expression of the cell: displays the table (df itself is already sorted).
df.sort_values(by='coefficient')

In [None]:
# First 20 rows of the coefficient table, i.e. the lowest coefficients.
print(df.iloc[:20])

In [None]:
# Last 20 rows of the coefficient table, i.e. the highest coefficients.
print(df.iloc[-20:])

In [None]:
# Logistic-regression baseline on the same token TF-IDF split, allowing up to
# 2000 solver iterations; fit() returns the estimator, so it can be chained.
lr = LogisticRegression(max_iter=2000).fit(x_tfidf_train, y_tfidf_train)

# Mean accuracy on the held-out rows.
print(lr.score(x_tfidf_test, y_tfidf_test))

In [None]:
# Pair every vocabulary term with its logistic-regression weight and tabulate
# them in ascending coefficient order. Note that this rebinds `df`.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
coefs_with_fns = sorted(zip(lr.coef_[0], feature_names))
df = pd.DataFrame(coefs_with_fns, columns=['coefficient', 'word'])
# Last expression of the cell: displays the table (df itself is already sorted).
df.sort_values(by='coefficient')

In [None]:
# TF-IDF representation of the lemma texts: vocabulary capped at 50,000
# features, with scikit-learn's built-in English stop-word list applied.
tfidf_vectorizer_lemma = TfidfVectorizer(max_features=50000, stop_words="english")

tfidf_lemma = tfidf_vectorizer_lemma.fit_transform(lemmas)

# Cache the *lemma* matrix. The previous code dumped `tfidf` (the token matrix)
# here, so the lemma cache file silently contained the wrong features.
joblib.dump(tfidf_lemma, '1000000_tfidf_lemmas.joblib')

In [None]:
# Same split as for the token features: the final 10,000 rows are the test
# set, every row before them is used for training.
x_tfidf_train_lemma, x_tfidf_test_lemma = tfidf_lemma[:-10000, :], tfidf_lemma[-10000:, :]
y_tfidf_train_lemma, y_tfidf_test_lemma = np.array(y[:-10000]), np.array(y[-10000:])

In [None]:
# Linear SVM on the lemma TF-IDF features, solved in the primal form
# (dual=False); fit() returns the estimator itself, so it can be chained.
svm_lemma = LinearSVC(dual=False).fit(x_tfidf_train_lemma, y_tfidf_train_lemma)

# Mean accuracy on the held-out rows.
print(svm_lemma.score(x_tfidf_test_lemma, y_tfidf_test_lemma))

In [None]:
# Predict the held-out rows with the lemma SVM that was fitted on the training
# split. The previous cross_val_predict(..., cv=2) call cloned the estimator
# and re-trained it on halves of the *test* set, so the report and confusion
# matrix built from `predicted_lemma` did not describe the trained model.
predicted_lemma = svm_lemma.predict(x_tfidf_test_lemma)

In [None]:
# Per-class precision / recall / F1 for the lemma-feature predictions.
print(
    classification_report(
        y_true=y_tfidf_test_lemma,
        y_pred=predicted_lemma,
        target_names=['NormalUser', 'TrollUser'],
    )
)

In [None]:
# Confusion matrix of the lemma-feature predictions, rendered as a plot.
cm_lemma = confusion_matrix(y_true=y_tfidf_test_lemma, y_pred=predicted_lemma)
cmd_lemma = ConfusionMatrixDisplay(
    confusion_matrix=cm_lemma,
    display_labels=['NormalUser', 'TrollUser'],
)
cmd_lemma.plot()

In [None]:
# Pair every lemma-vocabulary term with its LinearSVC weight and tabulate them
# in ascending coefficient order (most negative weights first).
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf_vectorizer_lemma, "get_feature_names_out"):
    feature_names_lemma = tfidf_vectorizer_lemma.get_feature_names_out()
else:
    feature_names_lemma = tfidf_vectorizer_lemma.get_feature_names()
coefs_with_fns_lemma = sorted(zip(svm_lemma.coef_[0], feature_names_lemma))
df_lemma = pd.DataFrame(coefs_with_fns_lemma, columns=['coefficient', 'word'])
# Last expression of the cell: displays the table (df_lemma is already sorted).
df_lemma.sort_values(by='coefficient')

In [None]:
# First 20 rows of the lemma coefficient table, i.e. the lowest coefficients.
print(df_lemma.iloc[:20])

In [None]:
# Last 20 rows of the lemma coefficient table, i.e. the highest coefficients.
print(df_lemma.iloc[-20:])

In [None]:
# Logistic-regression baseline on the lemma TF-IDF split, allowing up to 2000
# solver iterations.
lr_lemma = LogisticRegression(max_iter=2000)
lr_lemma.fit(x_tfidf_train_lemma, y_tfidf_train_lemma)
# Score the lemma model itself. The previous code called lr.score(...), which
# evaluated the *token* model on lemma features and so reported a meaningless
# accuracy for this experiment.
print(lr_lemma.score(x_tfidf_test_lemma, y_tfidf_test_lemma))

In [None]:
# Pair every lemma-vocabulary term with its logistic-regression weight and
# tabulate them in ascending coefficient order. Note that this rebinds
# `df_lemma`.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf_vectorizer_lemma, "get_feature_names_out"):
    feature_names_lemma = tfidf_vectorizer_lemma.get_feature_names_out()
else:
    feature_names_lemma = tfidf_vectorizer_lemma.get_feature_names()
coefs_with_fns_lemma = sorted(zip(lr_lemma.coef_[0], feature_names_lemma))
# Build the table from the lemma pairs. The previous code passed
# `coefs_with_fns` (the token-model pairs), so this table showed the wrong
# model's coefficients.
df_lemma = pd.DataFrame(coefs_with_fns_lemma, columns=['coefficient', 'word'])
# Last expression of the cell: displays the table (df_lemma is already sorted).
df_lemma.sort_values(by='coefficient')