# AskReddit Troll Question Detection Challenge

## Imports

In [5]:
import numpy as np 
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [6]:

import nltk # for tokenizing the paragraphs in sentences and sentences in words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [7]:
# Fetch the NLTK resources needed by the preprocessing cells below:
# tokenizer models, the stop-word lists and the WordNet data used for
# lemmatization. Packages that are already present are left as they are.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/archit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/archit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/archit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Load the test questions (columns used below: 'qid' and 'question_text').
test_df = pd.read_csv("test.csv")

# print(test_df.count)
# [653061 rows x 2 columns]

# Keep the question ids aside: the column is dropped from test_df before
# preprocessing and is re-attached to the predictions when the submission
# files are written.
qid = test_df['qid']

# Quick sanity checks; in a notebook only the last expression is displayed.
test_df.head()
test_df.shape
# df = test_df[(test_df == 1).any(axis=1)]
# print(df['question_text'].tolist())
# print(type(qid))

(653061, 2)

## Preprocessing

### Dropping the qid

In [14]:
# The ids were saved in `qid` earlier, so only the question text remains.
# NOTE: the drop is in place, so re-running this cell without reloading
# test_df raises a KeyError because the column is already gone.
test_df.drop(columns=["qid"],inplace=True)
# test_df.head()

# Plain Python list of the raw question strings, in file order.
sentences = test_df['question_text'].tolist()

In [15]:
# Number of questions to process. It is derived from the loaded data rather
# than hard-coded, so the notebook keeps working when test.csv has a different
# number of rows. Set a smaller value to experiment on a prefix of the data.
N = len(sentences)
sentences = sentences[0:N]
# print(sentences)

### Cleaning the data

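- Replacing punctuation and every other non-alphanumeric character (e.g. `!`, `?`, `.`, `,`) with a space
- Converting the sentences to lower case and collapsing repeated whitespace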

In [16]:
# Normalise every question in place:
#   1. replace each character that is not an ASCII letter or digit with a space,
#   2. lower-case the text,
#   3. collapse runs of whitespace into single spaces (split + join).
# The pattern is compiled once and reused for every row.
non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

# Slice assignment keeps the same list object, so the update stays in place.
sentences[:] = [
    ' '.join(non_alphanumeric.sub(' ', sentence).lower().split())
    for sentence in sentences
]

### Lemmatization
- The sentences need to be reduced to base word forms, by either stemming or lemmatization. Lemmatization is preferred for now because it maps each word to a meaningful dictionary form.

In [17]:
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once, outside the loops: reading the corpus list and
# converting it to a set for every single token is needlessly expensive.
stop_words = set(stopwords.words('english'))

# For each cleaned sentence: tokenize it, drop English stop words, lemmatize
# the remaining tokens and join them back together with single spaces.
tokenized_sentences = []
for sentence in sentences:
    words = nltk.word_tokenize(sentence)

    # removing stop words and lemmatizing using a list comprehension
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # joining words using spaces
    tokenized_sentences.append(' '.join(words))

### Data in sentences

In [18]:
# From here on `sentences` refers to the cleaned, stop-word-free, lemmatized
# text produced by the previous cell.
sentences = tokenized_sentences
# print(sentences)

### Trying Methods

#### Bag Of Words

In [19]:
# Bag-of-words term counts, built with the default CountVectorizer settings.
# TODO: consider capping the vocabulary (e.g. max_features=1500); no cap is
# set at the moment.
# NOTE(review): the vocabulary is fitted on the test questions here. The
# prediction cell further down fails because X1 has 123404 features while the
# loaded model expects 124033, so presumably the vectorizer fitted during
# training has to be reused with .transform() instead - confirm.
cv = CountVectorizer()
X1 = cv.fit_transform(sentences)

In [20]:
# Inspect the container type and dtype, then convert the integer counts to
# floating point (the recorded output shows int64 before, float64 after).
print(type(X1))
print(X1.dtype)
X1 = X1.astype(float)
print(X1.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
int64
float64


#### TF IDF

In [21]:
# TF-IDF weighted features, built with the default TfidfVectorizer settings.
# `cv` is rebound here, so the CountVectorizer fitted above is no longer
# reachable through that name.
# NOTE(review): as with X1, the vocabulary is fitted on the test questions;
# presumably it must match the vectorizer used when lreg2 was trained - confirm.
cv = TfidfVectorizer()
X2 = cv.fit_transform(sentences)
# print(X2)

In [22]:
# Inspect the container type and make sure the matrix is floating point
# (the recorded output shows float64 after the cast).
print(type(X2))
X2 = X2.astype(float)
print(X2.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
float64


#### Importing Model and Prediction

In [23]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import joblib

#### For data generated by the "Bag of Words" method

In [24]:
# Load the previously saved model that is applied to the bag-of-words features.
# NOTE: joblib.load unpickles the file, so only load model files from a
# trusted source.
lreg1 = joblib.load('Using Split LReg5 Model')

#### For data generated by the "TF-IDF" method

In [25]:
# Load the previously saved model that is applied to the TF-IDF features.
# NOTE: joblib.load unpickles the file, so only load model files from a
# trusted source.
lreg2 = joblib.load('Using Split LReg6 Model')

#### Predict for X1 (Bag of Words features)

In [26]:
# Per-class probabilities for every test question, from the bag-of-words model.
# NOTE(review): the recorded run fails here with a ValueError because X1 has
# 123404 features while the model expects 124033.
test_yhat1 = lreg1.predict_proba(X1)

ValueError: X has 123404 features, but LogisticRegression is expecting 124033 features as input.

#### Predict for X2 (TF-IDF features)

In [None]:
# Per-class probabilities for every test question, from the TF-IDF model.
# NOTE(review): not executed in the recorded run; presumably it needs X2 to
# have the same number of features the model was trained with - confirm.
test_yhat2 = lreg2.predict_proba(X2)

#### Generating CSV Files

In [None]:
# Keep only the second predict_proba column as the submitted score
# (presumably the probability of the positive class - confirm via lreg1.classes_).
y_pred_df1 = pd.DataFrame({"target": test_yhat1[:, 1]})

# Put each qid next to its score; the inner join aligns both on the row index.
submission_df1 = pd.concat(
    [qid, y_pred_df1["target"]],
    axis=1,
    join="inner",
)
submission_df1.to_csv("submission1.csv", index=False)
print(submission_df1.shape)

In [None]:
# Keep only the second predict_proba column as the submitted score
# (presumably the probability of the positive class - confirm via lreg2.classes_).
y_pred_df2 = pd.DataFrame({"target": test_yhat2[:, 1]})

# Put each qid next to its score; the inner join aligns both on the row index.
submission_df2 = pd.concat(
    [qid, y_pred_df2["target"]],
    axis=1,
    join="inner",
)
submission_df2.to_csv("submission2.csv", index=False)
print(submission_df2.shape)

In [None]:
# Rows for an optional CSV export of the preprocessed questions: a header row
# followed by one [qid, cleaned question] pair per processed sentence.
# zip pairs the ids with the sentences positionally and stops at the shorter
# of the two, so no separate row counter is needed.
data = [["qid","question_text"]]
data.extend([question_id, sentence] for question_id, sentence in zip(qid, sentences))

In [None]:
# import csv

# with open('processed_train_data.csv','w',newline='') as fp:
#   a = csv.writer(fp, delimiter=',')
#   a.writerows(data)

#### Saving Model

In [None]:
# import joblib

# joblib.dump(lreg1,'Using Split LReg1 Model')
# joblib.dump(lreg2,'Using Split LReg2 Model')