# AskReddit Troll Question Detection Challenge

## Imports

In [1]:
import numpy as np 
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:

import nltk # for tokenizing the paragraphs in sentences and sentences in words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
train_df = pd.read_csv("/content/drive/MyDrive/AskReddit_Dataset/train.csv")

## Preprocessing

### Dropping the qid

In [5]:
train_df.drop(columns=["qid"],inplace=True)
sentences = train_df['question_text'].tolist()

In [6]:
N = 653061
sentences = sentences[0:N]

### Cleaning the data

- Like removing !?., etc.
- converting sentences to lower case

In [None]:
# i=0
# for sentence in sentences:
#     temp = re.sub('[^a-zA-Z0-9]', ' ', sentence)
#     temp = temp.lower()
#     new_sentence = temp.split()
#     new_sentence = ' '.join(new_sentence)
#     sentences[i] = new_sentence
#     # print(new_sentence)
#     i+=1

## Vectoring Words

#### Bag Of Words

In [7]:
# TODO max_features = 1500 may need to be altered
cv = CountVectorizer()
X1 = cv.fit_transform(sentences)

In [8]:
print(type(X1))
print(X1.dtype)
X1 = X1.astype(float)
print(X1.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
int64
float64


In [9]:
Y1 = train_df['target'].to_numpy().astype(np.float64)
Y1 = Y1[:N]

#### TF IDF

In [10]:
cv = TfidfVectorizer()
X2 = cv.fit_transform(sentences)
# print(X2)

In [11]:
print(type(X2))
X2 = X2.astype(float)
print(X2.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
float64


In [12]:
Y2 = Y1

#### Train test split data

In [13]:
from sklearn.model_selection import train_test_split 

train_X1, test_X1, train_y1, test_y1 = train_test_split(X1, Y1, train_size=0.6)

train_X2, test_X2, train_y2, test_y2 = train_test_split(X2, Y2, train_size=0.6)

## Model generation

In [14]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

### Logistic regression

#### For data genrated by "Bag of words" method  

In [15]:
lreg1 = LogisticRegression(solver='liblinear')
lreg1.fit(train_X1,train_y1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

#### For data generated by "TD IDF" method

In [16]:
lreg2 = LogisticRegression(solver='liblinear')
lreg2.fit(train_X2,train_y2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

#### Predict for X1, Y1

In [17]:
train_yhat1 = lreg1.predict_proba(train_X1)
test_yhat1 = lreg1.predict_proba(test_X1)

# print("training score = ", roc_auc_score(train_y1, train_yhat1[:,1]))
# print("test score = ", roc_auc_score(test_y1,test_yhat1[:,1]))

threshold = 0.225

train_output1 = (train_yhat1[:,1] > threshold).astype(int)
test_output1 = (test_yhat1[:,1] > threshold).astype(int)

In [18]:
print("training score = ", f1_score(train_y1.astype(int), train_output1))
print("testing score = ", f1_score(test_y1.astype(int), test_output1))

training score =  0.714859135023085
testing score =  0.6119907894362794


#### Predict for X2, Y2

In [19]:
train_yhat2 = lreg2.predict_proba(train_X2)
test_yhat2 = lreg2.predict_proba(test_X2)

# print("training score = ", roc_auc_score(train_y2, train_yhat2[:,1]))
# print("test score = ", roc_auc_score(test_y2,test_yhat2[:,1]))

threshold = 0.2

train_output2 = (train_yhat2[:,1] > threshold).astype(int)
test_output2 = (test_yhat2[:,1] > threshold).astype(int)

In [20]:
print("training score = ", f1_score(train_y2.astype(int), train_output2))
print("testing score = ", f1_score(test_y2.astype(int), test_output2))

training score =  0.6497909728619004
testing score =  0.602790094084568


### Bernoulli NB model generation

In [26]:
from sklearn.naive_bayes import BernoulliNB

#### Bag of words

In [27]:
clf1 = BernoulliNB()

# print(type(train_X1))
clf1.fit(train_X1,train_y1)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

#### TD IDF

In [28]:
clf2 = BernoulliNB()
clf2.fit(train_X2,train_y2)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

### BernoulliNB Prediction

#### Bag of words

In [29]:
train_yhat1 = clf1.predict_proba(train_X1)
test_yhat1 = clf1.predict_proba(test_X1)

# print("training score = ", roc_auc_score(train_y1, train_yhat1[:,1]))
# print("test score = ", roc_auc_score(test_y1,test_yhat1[:,1]))

threshold = 0.225

train_output1 = (train_yhat1[:,1] > threshold).astype(int)
test_output1 = (test_yhat1[:,1] > threshold).astype(int)

In [30]:
print("training score = ", f1_score(train_y1.astype(int), train_output1))
print("testing score = ", f1_score(test_y1.astype(int), test_output1))

training score =  0.5428327446337772
testing score =  0.5142079292155862


#### TD IDF

In [31]:
train_yhat2 = clf2.predict_proba(train_X2)
test_yhat2 = clf2.predict_proba(test_X2)

# print("training score = ", roc_auc_score(train_y2, train_yhat2[:,1]))
# print("test score = ", roc_auc_score(test_y2,test_yhat2[:,1]))

threshold = 0.2

train_output2 = (train_yhat2[:,1] > threshold).astype(int)
test_output2 = (test_yhat2[:,1] > threshold).astype(int)

In [32]:
print("training score = ", f1_score(train_y2.astype(int), train_output2))
print("testing score = ", f1_score(test_y2.astype(int), test_output2))

training score =  0.5469071678156577
testing score =  0.5090531398253937


### Perceptron model generation

In [33]:
from sklearn.linear_model import Perceptron

#### Bag of words

In [34]:
clf1 = Perceptron(tol=1e-3, random_state=0)

# print(type(train_X1))
clf1.fit(train_X1,train_y1)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

#### TD IDF

In [35]:
clf2 = Perceptron(tol=1e-3, random_state=0)
clf2.fit(train_X2,train_y2)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

### Perceptron Prediction

#### Bag of words

In [37]:
train_yhat1 = clf1.predict(train_X1)
test_yhat1 = clf1.predict(test_X1)

print(train_yhat1.shape)

# print("training score = ", roc_auc_score(train_y1, train_yhat1[:,1]))
# print("test score = ", roc_auc_score(test_y1,test_yhat1[:,1]))

# threshold = 0.225

# train_output1 = (train_yhat1[:,1] > threshold).astype(int)
# test_output1 = (test_yhat1[:,1] > threshold).astype(int)

(391836,)


In [38]:
print("training score = ", f1_score(train_y1.astype(int), train_yhat1))
print("testing score = ", f1_score(test_y1.astype(int), test_yhat1))

training score =  0.7080045989771241
testing score =  0.5026308181555362


#### TD IDF

In [39]:
train_yhat2 = clf2.predict(train_X2)
test_yhat2 = clf2.predict(test_X2)

# print("training score = ", roc_auc_score(train_y2, train_yhat2[:,1]))
# print("test score = ", roc_auc_score(test_y2,test_yhat2[:,1]))

# threshold = 0.2

# train_output2 = (train_yhat2[:,1] > threshold).astype(int)
# test_output2 = (test_yhat2[:,1] > threshold).astype(int)

In [40]:
print("training score = ", f1_score(train_y2.astype(int), train_yhat2))
print("testing score = ", f1_score(test_y2.astype(int), test_yhat2))

training score =  0.6661828167908984
testing score =  0.4793756287918052


In [None]:
# data = [["question_text","target"]]
# for i in range(N):
#   data.append([sentences[i],Y1[i]])

In [None]:
# import csv

# with open('processed_train_data.csv','w',newline='') as fp:
#   a = csv.writer(fp, delimiter=',')
#   a.writerows(data)

#### Saving Model

In [None]:
# import joblib

# joblib.dump(lreg1,'Using Split LReg1 Model')
# joblib.dump(lreg2,'Using Split LReg2 Model')