# AskReddit Troll Question Detection Challenge

## Imports

In [1]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
# Read processed_train_data.csv (path is relative to the working directory) into a DataFrame.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV — confirm whether `index_col=0` is wanted.
train_df = pd.read_csv("processed_train_data.csv")

In [3]:
# Build the corpus as a plain Python list of `str`, one entry per DataFrame row.
# Missing questions (NaN) become the empty string, so they contribute no tokens,
# instead of being stringified to the literal word "nan" (which is what
# `.values.astype('U')` did and which then entered the vocabulary as a real term).
# Going through `Series.astype(str)` also avoids materialising a fixed-width NumPy
# unicode array, where every row is padded to the length of the longest question.
sentences = train_df['question_text'].fillna('').astype(str).tolist()
train_df.head()

Unnamed: 0,question_text,target
0,role lua civ4,0.0
1,important chapter kannada 10 icse 2018,0.0
2,musician get royalty youtube,0.0
3,difference scaling social enterprise social fr...,0.0
4,elevator go super slow right door open,0.0


In [4]:
# Keep only the first N documents; the same N is used further down to trim the labels.
# NOTE(review): the value is hard-coded — presumably the number of labelled rows; confirm.
N = 653_061
sentences = sentences[0:N]

### Trying Methods

#### Bag Of Words

In [5]:
# Bag-of-words features: learn the vocabulary from `sentences` and build the
# document-term count matrix in a single call. All vectorizer settings are defaults.
cv = CountVectorizer()
X1 = cv.fit_transform(raw_documents=sentences)

In [6]:
# Report the container type and element dtype as produced by the vectorizer,
# then cast the matrix to float64 and report the dtype again.
print(type(X1), X1.dtype, sep="\n")
X1 = X1.astype(np.float64)
print(X1.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
int64
float64


In [7]:
# Labels for the first N rows as a float64 array. Row order follows `train_df`,
# the same frame (and the same leading-N cut) that `sentences` was built from.
Y1 = train_df['target'].to_numpy().astype(np.float64)[:N]

#### TF IDF

In [8]:
# TF-IDF features over the same `sentences`, with default TfidfVectorizer settings.
# Note: this rebinds `cv`, so the fitted CountVectorizer from the bag-of-words
# step is no longer reachable through that name once this cell has run.
cv = TfidfVectorizer()
X2 = cv.fit_transform(raw_documents=sentences)

In [9]:
# Show the container type, cast the matrix to float64, then show the resulting dtype.
print(type(X2))
X2 = X2.astype(np.float64)
print(X2.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
float64


In [10]:
# The TF-IDF model is trained against the same labels as the bag-of-words model.
# This is an alias, not a copy: Y2 and Y1 refer to the same array object.
Y2 = Y1

#### Model generation

In [11]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

#### For data generated by "Bag of words" method

In [12]:
# Fit a logistic regression (liblinear solver, other settings default) on the
# bag-of-words matrix. The bare name on the last line keeps the fitted estimator
# as the cell's displayed value.
lreg1 = LogisticRegression(solver='liblinear').fit(X1, Y1)
lreg1

LogisticRegression(solver='liblinear')

#### For data generated by "TF-IDF" method

In [13]:
# Fit a second logistic regression (liblinear solver, other settings default) on
# the TF-IDF matrix. The bare name on the last line keeps the fitted estimator as
# the cell's displayed value.
lreg2 = LogisticRegression(solver='liblinear').fit(X2, Y2)
lreg2


LogisticRegression(solver='liblinear')

#### Predict for X1, Y1

In [14]:
# Disabled: ROC-AUC evaluation of the bag-of-words model (lreg1).
# NOTE(review): train_X1 / test_X1 / train_y1 / test_y1 are not defined by any cell
# visible in this notebook — presumably they came from a train/test split that is
# no longer here. Re-create that split before re-enabling the lines below.
# train_yhat1 = lreg1.predict_proba(train_X1)
# test_yhat1 = lreg1.predict_proba(test_X1)

# print("training score = ", roc_auc_score(train_y1, train_yhat1[:,1]))
# print("test score = ", roc_auc_score(test_y1,test_yhat1[:,1]))

#### Predict for X2, Y2

In [15]:
# Disabled: ROC-AUC evaluation of the TF-IDF model (lreg2).
# NOTE(review): train_X2 / test_X2 / train_y2 / test_y2 are not defined by any cell
# visible in this notebook — presumably they came from a train/test split that is
# no longer here. Re-create that split before re-enabling the lines below.
# train_yhat2 = lreg2.predict_proba(train_X2)
# test_yhat2 = lreg2.predict_proba(test_X2)

# print("training score = ", roc_auc_score(train_y2, train_yhat2[:,1]))
# print("test score = ", roc_auc_score(test_y2,test_yhat2[:,1]))

#### Saving Model

In [16]:
import joblib

# Persist both fitted classifiers to the working directory (bag-of-words model
# first, TF-IDF model second). Only the estimators are written; the vectorizers
# that produced X1 / X2 are not saved by this cell.
joblib.dump(value=lreg1, filename='Using Split LReg5 Model')
joblib.dump(value=lreg2, filename='Using Split LReg6 Model')

['Using Split LReg6 Model']