In [30]:
import pandas as pd
import numpy as np
import re
from time import time
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

from sentence_transformers import SentenceTransformer

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ashish/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
df = pd.read_csv('tweets_f234_users_vader_bert_url_len_lang.csv')

In [5]:
df = df[df['lang'] == 'en']

In [64]:
df

Unnamed: 0,userid,account_type,clean_tweet,bert_clear_expression_conf,lang,vader_sentiment,vader_label,url_flag,len
0,787405734442958848,bot,"We can put the ""River"" in ""Screwdrivers"" https...",90,en,0.0000,neutral,True,64
1,787405734442958848,bot,"You can't extract the ""Pong"" from ""Sponged"" ht...",98,en,0.0000,neutral,True,67
2,787405734442958848,bot,"Put the ""Nope"" back in ""Inoperative"" https://t...",97,en,0.0000,neutral,True,60
3,787405734442958848,bot,"We can't spell ""Artworks"" without ""Two"" https:...",85,en,0.0000,neutral,True,63
4,787405734442958848,bot,"You can be the ""She"" in ""Astonished"" https://t...",51,en,0.0000,neutral,True,60
...,...,...,...,...,...,...,...,...,...
488632,3077047801,bot,A Tribute – Joan Rivers on Fitness http://t.co...,69,en,0.2732,positive,True,74
488633,3077047801,bot,Live in Orlando area? Want a personal trainer?...,87,en,0.1680,positive,False,143
488634,3077047801,bot,If you're trying to get fit keep working. You...,83,en,0.8608,positive,True,138
488635,3077047801,bot,Ready for summer? Cheap cool sunglasses - loo...,96,en,0.7430,positive,True,133


In [8]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [10]:
%%time
nparray = sbert_model.encode(df['clean_tweet'].values)

# CPU times: user 18h 28s, sys: 9min 51s, total: 18h 10min 19s
# Wall time: 9h 7min 48s

CPU times: user 18h 28s, sys: 9min 51s, total: 18h 10min 19s
Wall time: 9h 7min 48s


In [11]:
clf = RandomForestClassifier(random_state=0)

In [13]:
df_embeddings = pd.DataFrame(nparray)

In [20]:
df_embeddings.to_csv('tweet_embeddings_using_bert_f234_' + str(round(time())) + '.csv', index = False)

In [None]:
%%time
temp_df = pd.read_csv('tweet_embeddings_using_bert_f234_1664047300.csv')

In [21]:
print(nparray.shape)
print(df_embeddings.shape)

(334679, 768)
(334679, 768)


In [73]:
X_original_features = df[['vader_sentiment', 'url_flag', 'len', 'bert_clear_expression_conf']]
y = df['account_type']

In [74]:
%%time

# CPU times: user 28 s, sys: 6.76 s, total: 34.8 s
# Wall time: 14.1 s

pca = PCA(n_components=8) 

# Data with n_components=64 takes infinite time (> 3 mins) to train a RandomForestClassifier.
# Data with n_components=32 takes infinite time (> 3 mins) to train a RandomForestClassifier.
# Data with n_components=16 takes infinite time (> 3 mins) to train a RandomForestClassifier.

df_embeddings_pca = pca.fit_transform(df_embeddings)

CPU times: user 29.7 s, sys: 4.53 s, total: 34.2 s
Wall time: 12 s


In [78]:
print(type(df_embeddings_pca))
df_embeddings_pca = pd.DataFrame(df_embeddings_pca)
print(df_embeddings_pca.shape)
print(X_original_features.shape)

<class 'pandas.core.frame.DataFrame'>
(334679, 8)
(334679, 4)


In [81]:
for i in X_original_features.columns:
    df_embeddings_pca[i] = X_original_features[i].values

In [83]:
print(df_embeddings_pca.shape)

(334679, 12)


In [86]:
X_train, X_test, y_train, y_test = train_test_split(df_embeddings_pca, y, test_size=0.33, random_state=42)

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.33, random_state=42)

In [88]:
%%time
clf = clf.fit(X_train, y_train)

CPU times: user 1min 56s, sys: 81.1 ms, total: 1min 56s
Wall time: 1min 56s


In [89]:
pred = clf.predict(X_test)

In [90]:
labels = ['bot', 'human']

In [91]:
print(classification_report(y_test, y_pred = pred, labels = labels))

              precision    recall  f1-score   support

         bot       0.85      0.48      0.62     23603
       human       0.87      0.98      0.92     86842

    accuracy                           0.87    110445
   macro avg       0.86      0.73      0.77    110445
weighted avg       0.87      0.87      0.86    110445

