In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

import regex as re

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score

In [2]:
df = pd.read_csv('/content/drive/MyDrive/technocolab final project/troll_data.csv',index_col='Unnamed: 0')

In [4]:
df.columns

Index(['content', 'troll'], dtype='object')

In [5]:
df.head()

Unnamed: 0,content,troll
3670,💩,1
15401,why would hillary clinton—who holds public and...,0
8048,manni syd this is my late son killed by illega...,0
6923,awesome,1
21263,crazy a yr old child who can t pay a small fin...,0


In [7]:
# Pre-Processing
df['content'] = df['content'].apply(lambda x: x.lower())
# removing special chars
df['content'] = df['content'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
df['content'] = df['content'].str.replace('rt','')
df.head()

Unnamed: 0,content,troll
3670,,1
15401,why would hillary clintonwho holds public and ...,0
8048,manni syd this is my late son killed by illega...,0
6923,awesome,1
21263,crazy a yr old child who can t pay a small fin...,0


In [11]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_df=0.75, min_df=5, max_features=10000)
tfidf = tfidf_vectorizer.fit_transform(df['content'] )
X=tfidf
y = df['troll'].astype(int)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [14]:
clf = RandomForestClassifier()
clf.fit(X_train_tfidf,y_train)
y_preds = clf.predict(X_test_tfidf)
report = classification_report( y_test, y_preds )
print(report)
acc=accuracy_score(y_test,y_preds)
print("Random Classifier, Accuracy Score:" , acc)

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      5094
           1       0.84      0.87      0.85      1419

    accuracy                           0.93      6513
   macro avg       0.90      0.91      0.91      6513
weighted avg       0.94      0.93      0.94      6513

Random Classifier, Accuracy Score: 0.9348994319054199


In [15]:
clf = SVC(verbose=True)
clf.fit(X_train_tfidf,y_train)
y_preds = clf.predict(X_test_tfidf)
report = classification_report( y_test, y_preds )
print(report)
acc=accuracy_score(y_test,y_preds)
print("Support Vector Machine, Accuracy Score:" , acc)

[LibSVM]              precision    recall  f1-score   support

           0       0.95      0.97      0.96      5094
           1       0.88      0.80      0.84      1419

    accuracy                           0.93      6513
   macro avg       0.92      0.89      0.90      6513
weighted avg       0.93      0.93      0.93      6513

Support Vector Machine, Accuracy Score: 0.9339781974512513


## LSTM model

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

In [17]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['content'].values)
X = tokenizer.texts_to_sequences(df['content'].values)

In [18]:
# Padding
X = pad_sequences(X)

Y = pd.get_dummies(df['troll']).values

In [19]:
# Train/Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(26051, 48) (26051, 2)
(6513, 48) (6513, 2)


In [20]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 48, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 48, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
batch_size = 512
NoOfEpochs = 5
model.fit(X_train, Y_train, epochs = NoOfEpochs, batch_size=batch_size, verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8da3e3b4e0>

In [29]:
Y_pred = model.predict_classes(X_test,batch_size = batch_size)
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print(classification_report(df_test.true, df_test.pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      5094
           1       0.82      0.84      0.83      1419

    accuracy                           0.93      6513
   macro avg       0.89      0.89      0.89      6513
weighted avg       0.93      0.93      0.93      6513

