# AskReddit Troll Question Detection Challenge

## Imports

In [1]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
# Load the two CSV files used by the rest of the notebook. Later cells read
# the 'question_text' and 'target' columns of train_df and the
# 'question_text' and 'qid' columns of test_df.
train_df = pd.read_csv("processed_train_data.csv")
test_df = pd.read_csv("processed_test_data.csv")

In [3]:
# Collect the question text of both splits as plain Python lists of str.
# str() is applied per element so that non-string entries (e.g. a missing
# value) are stringified just as the earlier `.values.astype('U')` conversion
# did, but without first materialising a fixed-width NumPy unicode array in
# which every slot is as wide as the longest question.
sentences1 = [str(text) for text in train_df['question_text']]
sentences2 = [str(text) for text in test_df['question_text']]

# Train questions first, test questions second: the feature matrices built
# from this list are later split back by position at len(sentences1).
sentences = sentences1 + sentences2
# train_df.head()
# test_df.head()

In [4]:
# Report the number of questions in each split, then in the combined corpus.
for split_sentences in (sentences1, sentences2):
    print(len(split_sentences))

# N is the size of the full (train + test) corpus; the commented-out line
# below is a hook for truncating the corpus during quick experiments.
N = len(sentences)
# sentences = sentences[:N]
print(N)

1218731
653061
1871792


### Trying Methods

#### Bag Of Words

In [5]:
# Bag-of-words features: one row per question, one column per vocabulary term.
# The vectorizer is fit on the concatenated train + test questions, so both
# splits share a single column space.
# NOTE(review): fitting on `sentences` means test-set vocabulary is seen at
# fit time -- confirm this is acceptable for the challenge rules.
cv = CountVectorizer()
X1 = cv.fit_transform(sentences)

In [6]:
# Inspect the container type and element dtype of the bag-of-words matrix,
# then cast the counts to float (the recorded output shows int64 -> float64).
print(type(X1))
print(X1.dtype)
X1 = X1.astype(float)
print(X1.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
int64
float64


In [7]:
# Training labels as a float64 vector. The trailing [:N] mirrors the optional
# corpus truncation; with N = len(sentences) (train + test) it cannot shorten
# the label vector, which only has one entry per training row.
Y1 = train_df['target'].to_numpy().astype(np.float64)[:N]

#### TF IDF

In [8]:
# TF-IDF features over the same combined train + test corpus.
# Note that `cv` is rebound here: the CountVectorizer created earlier is no
# longer reachable through this name after this cell runs.
cv = TfidfVectorizer()
X2 = cv.fit_transform(sentences)
# print(X2)

In [9]:
# Same inspection as for the bag-of-words matrix: show the container type,
# cast to float, and print the resulting dtype (float64 in the recorded run).
print(type(X2))
X2 = X2.astype(float)
print(X2.dtype)

<class 'scipy.sparse.csr.csr_matrix'>
float64


In [10]:
# Both feature sets describe the same training rows, so they share one label
# vector. This is an alias, not a copy: Y2 and Y1 are the same array object.
Y2 = Y1

In [11]:
# The first len(sentences1) rows of each feature matrix are the training
# questions, because the corpus was built as sentences1 + sentences2.
n_train_rows = len(sentences1)
train_X1 = X1[:n_train_rows]
train_X2 = X2[:n_train_rows]

In [12]:
# Every row after the first len(sentences1) belongs to the test questions,
# because the corpus was built as sentences1 + sentences2.
n_train_rows = len(sentences1)
test_X1 = X1[n_train_rows:]
test_X2 = X2[n_train_rows:]

#### Train test split data

In [13]:
# from sklearn.model_selection import train_test_split 

# train_X1, test_X1, train_y1, test_y1 = train_test_split(train_X1, Y1, train_size=0.6)

# train_X2, test_X2, train_y2, test_y2 = train_test_split(train_X2, Y2, train_size=0.6)

#### Model generation

In [14]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

#### For data generated by "Bag of words" method

In [15]:
# Logistic regression (liblinear solver) trained on the bag-of-words features.
lreg1 = LogisticRegression(solver='liblinear')
# Sanity check before fitting: the feature matrix and the label vector must
# have the same number of rows.
print(train_X1.shape)
print(Y1.shape)
# Kept as the last expression so the notebook displays the fitted estimator.
lreg1.fit(train_X1,Y1)

(1218731, 179866)
(1218731,)


LogisticRegression(solver='liblinear')

#### For data generated by "TF IDF" method

In [16]:
# Second logistic regression with the same solver, trained on the TF-IDF
# features; Y2 is the same label vector used for the bag-of-words model.
lreg2 = LogisticRegression(solver='liblinear')
# Kept as the last expression so the notebook displays the fitted estimator.
lreg2.fit(train_X2,Y2)


LogisticRegression(solver='liblinear')

### Predict for X1, Y1

In [17]:
# Per-class probability estimates of the bag-of-words model for the test rows.
test_yhat1 = lreg1.predict_proba(test_X1)

# Turn the second probability column into hard 0/1 predictions: a row is
# flagged when its probability is strictly greater than the threshold.
# (Assumes column 1 corresponds to the positive class -- TODO confirm via
# lreg1.classes_.)
threshold = 0.5
test_output1 = (test_yhat1[:, 1] > threshold).astype(int)

# ROC-AUC evaluation is disabled: train_y1 / test_y1 / train_yhat1 are only
# available when a labelled hold-out split is created first.
# print("training score = ", roc_auc_score(train_y1, train_yhat1[:,1]))
# print("test score = ", roc_auc_score(test_y1,test_yhat1[:,1]))

### Predict for X2, Y2

In [18]:
# Per-class probability estimates of the TF-IDF model for the test rows.
test_yhat2 = lreg2.predict_proba(test_X2)

# Turn the second probability column into hard 0/1 predictions: a row is
# flagged when its probability is strictly greater than the threshold.
# (Assumes column 1 corresponds to the positive class -- TODO confirm via
# lreg2.classes_.)
threshold = 0.5
test_output2 = (test_yhat2[:, 1] > threshold).astype(int)

# ROC-AUC evaluation is disabled: train_y2 / test_y2 / train_yhat2 are only
# available when a labelled hold-out split is created first.
# print("training score = ", roc_auc_score(train_y2, train_yhat2[:,1]))
# print("test score = ", roc_auc_score(test_y2,test_yhat2[:,1]))

In [19]:
# Store the bag-of-words predictions as an integer column vector of shape
# (n_rows, 1); the submission cell reads column 0 of this array.
test_output1 = test_output1.reshape(-1, 1).astype(int)
print(test_output1.shape)

(653061, 1)


In [20]:
# Store the TF-IDF predictions as an integer column vector of shape
# (n_rows, 1); the submission cell reads column 0 of this array.
test_output2 = test_output2.reshape(-1, 1).astype(int)
print(test_output2.dtype)

int64


#### Making a submission file

In [21]:
# Submission file for the bag-of-words model: one row per test question with
# its qid and the predicted 0/1 target.
y_pred_df1 = pd.DataFrame(data=test_output1[:,0], columns = ["target"])

# Pair ids and predictions by position rather than with an index-aligned
# inner join: if the two indexes ever differed, the join would silently drop
# rows, whereas building the frame from equal-length arrays raises a
# ValueError on any length mismatch.
submission_df1 = pd.DataFrame({
    "qid": test_df["qid"].to_numpy(),
    "target": y_pred_df1["target"].to_numpy(),
})
submission_df1.to_csv("submission1.csv", index = False)
print(submission_df1.shape)

(653061, 2)


In [22]:
# Submission file for the TF-IDF model: one row per test question with its
# qid and the predicted 0/1 target.
y_pred_df2 = pd.DataFrame(data=test_output2[:,0], columns = ["target"])

# Pair ids and predictions by position rather than with an index-aligned
# inner join: if the two indexes ever differed, the join would silently drop
# rows, whereas building the frame from equal-length arrays raises a
# ValueError on any length mismatch.
submission_df2 = pd.DataFrame({
    "qid": test_df["qid"].to_numpy(),
    "target": y_pred_df2["target"].to_numpy(),
})
submission_df2.to_csv("submission2.csv", index = False)
print(submission_df2.shape)

(653061, 2)


In [23]:
# Count how many test questions the bag-of-words model assigned to each
# predicted class. (The earlier bare `type(test_output1)` expression was
# dropped: it was not the last statement of the cell, so it displayed nothing.)
unique, counts = np.unique(test_output1, return_counts=True)

# Print one row per class in the form [label, count].
print(np.asarray((unique, counts)).T)

[[     0 568112]
 [     1  84949]]
