# AskReddit Troll Question Detection Challenge

## Imports

In [1]:
import numpy as np 
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:

import nltk # for tokenizing the paragraphs in sentences and sentences in words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [3]:
# Fetch the NLTK resources this notebook relies on, in the same order as
# before: the tokenizer models first, then the stop-word list.
for resource in ("punkt", "stopwords"):
    nltk.download(resource)
# WordNet data for the lemmatizer. Left as the cell's final bare expression
# so the notebook still displays the call's return value.
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/archit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/archit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/archit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the labelled training questions. "train.csv" is a relative path, so it
# is resolved against the notebook's current working directory.
train_df = pd.read_csv("train.csv")

# Size recorded from an earlier run of the (commented-out) inspection below:
# print(train_df.count)
# [653061 rows x 3 columns]

# Optional inspection helpers: preview the frame, or list the question text
# of every row in which any column equals 1.
# train_df.head()
# df = train_df[(train_df == 1).any(axis=1)]
# print(df['question_text'].tolist())

## Preprocessing

### Dropping the qid

In [5]:
# Remove the "qid" column from the frame in place; nothing later in the
# visible notebook reads it.
train_df.drop(labels="qid", axis="columns", inplace=True)

# Pull the raw question strings out into a plain Python list so they can be
# cleaned and tokenized one by one.
sentences = train_df["question_text"].tolist()

In [6]:
# Number of questions to process. Derived from the loaded data instead of
# being hard-coded (the original run had 653061 rows), so any code that slices
# with [:N] or indexes up to N stays in bounds for a dataset of any size.
# Lower N here to experiment on a prefix of the data.
N = len(sentences)
sentences = sentences[:N]

### Cleaning the data

- Removing punctuation and every other non-alphanumeric character (e.g. `!`, `?`, `.`, `,`)
- Converting sentences to lower case

In [7]:
# Normalise every question:
#   1. replace each character outside [a-zA-Z0-9] with a space (this also
#      drops punctuation and any non-ASCII letters),
#   2. lower-case the result,
#   3. collapse runs of whitespace into single spaces and trim the ends.
# The pattern is compiled once, outside the loop.
non_alnum = re.compile(r'[^a-zA-Z0-9]')

# Slice assignment updates the existing list object, exactly as the previous
# index-based loop did, without needing a manual counter.
sentences[:] = [
    ' '.join(non_alnum.sub(' ', sentence).lower().split())
    for sentence in sentences
]

### Lemmatization
- We need to reduce words to a common base form, via either stemming or lemmatization. Lemmatization is preferred for now because it converts words to meaningful dictionary forms.

In [8]:
lemmatizer = WordNetLemmatizer()

# Build the English stop-word set ONCE. The previous version rebuilt it inside
# the comprehension's filter, i.e. once per word of every sentence, which
# dominates the runtime on a corpus of this size.
stop_words = set(stopwords.words('english'))

tokenized_sentences = []
for sentence in sentences:
    words = nltk.word_tokenize(sentence)

    # Remove stop words with a list comprehension and lemmatize the remaining
    # tokens. lemmatize() is called without a POS tag, so the lemmatizer's
    # default part of speech applies.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Re-join the surviving tokens with single spaces.
    tokenized_sentences.append(' '.join(words))

### Data in sentences

In [10]:
# From here on `sentences` refers to the stop-word-filtered, lemmatized text.
# This is a plain rebinding: both names point at the same list object, no copy
# is made.
sentences = tokenized_sentences
# print(sentences)

### Trying Methods

#### Bag Of Words

In [11]:
# Bag-of-words features: per-document token counts, with the vocabulary
# capped at 1500 terms.
# TODO max_features = 1500 may need to be altered
cv = CountVectorizer(max_features = 1500)
# Learn the vocabulary and vectorise the whole corpus in one step.
# NOTE(review): the vocabulary is fitted on all sentences, before the
# train/test split further down, so it also sees rows that end up in the test
# split.
X1 = cv.fit_transform(sentences)

In [12]:
# Show the container type and element dtype of the count matrix, cast the
# counts to floating point, then show the dtype again to confirm the cast.
# (The recorded run printed a scipy CSR matrix going from int64 to float64.)
print(type(X1))
print(X1.dtype)
X1 = X1.astype(float)
print(X1.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
int64
float64


In [13]:
# Labels as a float64 NumPy array, limited to the first N rows so that they
# line up with the N questions kept in `sentences`.
Y1 = train_df['target'].to_numpy().astype(np.float64)[:N]

#### TF-IDF

In [14]:
# TF-IDF features built with the vectoriser's default settings (no
# max_features cap is passed here, unlike the bag-of-words vectoriser).
# NOTE(review): this rebinds `cv`, so the CountVectorizer fitted earlier is no
# longer reachable under that name after this cell runs.
cv = TfidfVectorizer()
# Learn the vocabulary/IDF weights and vectorise the whole corpus in one step.
X2 = cv.fit_transform(sentences)
# print(X2)

In [15]:
# Show the container type of the TF-IDF matrix, cast it to float, and print
# the resulting dtype. (The recorded run printed a scipy CSR matrix with dtype
# float64.)
print(type(X2))
X2 = X2.astype(float)
print(X2.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
float64


In [16]:
# Both feature sets describe the same questions, so they share one label
# array. Y2 is an alias of Y1 (same object), not a copy.
Y2 = Y1

#### Train test split data

In [17]:
from sklearn.model_selection import train_test_split

# Fixed seed for the shuffles. Without it every run produces a different
# split, so the scores reported below are not reproducible. Passing the SAME
# seed to both calls also makes both feature sets (which have the same number
# of rows) get split on the same rows, so the two models are evaluated on the
# same questions and their scores are directly comparable.
# Scores will differ slightly from those recorded with the unseeded split.
SPLIT_SEED = 42

# 60% of the rows go to training, the remaining 40% to testing.
train_X1, test_X1, train_y1, test_y1 = train_test_split(
    X1, Y1, train_size=0.6, random_state=SPLIT_SEED
)

train_X2, test_X2, train_y2, test_y2 = train_test_split(
    X2, Y2, train_size=0.6, random_state=SPLIT_SEED
)

#### Model generation

In [18]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

#### For data generated by "Bag of words" method  

In [19]:
# Logistic-regression classifier for the bag-of-words features. Only the
# solver is chosen explicitly; every other hyper-parameter keeps its
# scikit-learn default.
lreg1 = LogisticRegression(solver='liblinear')
# Fit on the bag-of-words training split. As the cell's last expression, the
# fitted estimator is also what the notebook displays.
lreg1.fit(train_X1,train_y1)

LogisticRegression(solver='liblinear')

#### For data generated by "TF-IDF" method

In [20]:
# Logistic-regression classifier for the TF-IDF features, configured
# identically to the bag-of-words model (explicit solver, defaults otherwise).
lreg2 = LogisticRegression(solver='liblinear')
# Fit on the TF-IDF training split. As the cell's last expression, the fitted
# estimator is also what the notebook displays.
lreg2.fit(train_X2,train_y2)

LogisticRegression(solver='liblinear')

#### Predict for X1, Y1

In [21]:
# Class-probability predictions of the bag-of-words model, for the training
# split first and the test split second.
train_yhat1, test_yhat1 = (
    lreg1.predict_proba(features) for features in (train_X1, test_X1)
)

# ROC AUC computed from the second probability column (index 1), which is
# presumably the probability of the positive class, target == 1.
train_auc1 = roc_auc_score(train_y1, train_yhat1[:, 1])
print("training score = ", train_auc1)
test_auc1 = roc_auc_score(test_y1, test_yhat1[:, 1])
print("test score = ", test_auc1)

training score =  0.9111882107128229
test score =  0.9044000150998281


#### Predict for X2, Y2

In [22]:
# Class-probability predictions of the TF-IDF model, for the training split
# first and the test split second.
train_yhat2, test_yhat2 = (
    lreg2.predict_proba(features) for features in (train_X2, test_X2)
)

# ROC AUC computed from the second probability column (index 1), which is
# presumably the probability of the positive class, target == 1.
train_auc2 = roc_auc_score(train_y2, train_yhat2[:, 1])
print("training score = ", train_auc2)
test_auc2 = roc_auc_score(test_y2, test_yhat2[:, 1])
print("test score = ", test_auc2)

training score =  0.9581803365475136
test score =  0.9398699410054537


In [26]:
# Rows for the processed-data CSV: a header row followed by one
# [cleaned question text, label] row per sample.
data = [["question_text", "target"]]
# Pair texts and labels positionally with zip instead of indexing both by a
# separate counter; this cannot run past the end of either sequence.
data.extend([text, label] for text, label in zip(sentences, Y1))

In [27]:
import csv

# Write the header and all processed rows to a comma-separated file in the
# current working directory. newline='' leaves line-ending handling to the
# csv module, and the context manager closes the file when the block exits.
with open('processed_train_data.csv', 'w', newline='') as out_file:
    csv.writer(out_file, delimiter=',').writerows(data)

#### Saving Model

In [30]:
import joblib

# Serialise both fitted classifiers into the current working directory. The
# strings are used verbatim as file names (spaces included, no extension).
# NOTE(review): only the classifiers are saved in this cell; the fitted
# vectorisers are not. Reusing a saved model on new text presumably requires
# the matching vectoriser to be persisted or rebuilt as well - confirm.
joblib.dump(lreg1,'Using Split LReg1 Model')
joblib.dump(lreg2,'Using Split LReg2 Model')

['Using Split LReg2 Model']