In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from functools import reduce
import random
import re
from tqdm import tqdm_notebook as tqdm

In [2]:
# Input locations: both CSV paths are derived from a single base directory.
DATA_DIR = "../input"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV = f"{DATA_DIR}/test.csv"

# Read the two splits in order (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Quick sanity check: dimensions and column names of each split.
print(f"Train shape: {train_df.shape}; cols: {train_df.columns.tolist()}")
print(f"Test shape: {test_df.shape}; cols: {test_df.columns.tolist()}")

Train shape: (1306122, 3); cols: ['qid', 'question_text', 'target']
Test shape: (375806, 2); cols: ['qid', 'question_text']


In [3]:
# Show one randomly chosen training row as a plain list of its column values.
# random.randrange excludes its upper bound, so the drawn position is always a
# valid .iloc index; random.randint includes the bound and could return
# len(train_df), which raises IndexError.
list(train_df.iloc[random.randrange(len(train_df))])

['f2613e92c3832ca1dee7',
 'What is the language of Kerala, as "Malayalam" (Malai + Aalam) simply denotes the region?',
 0]

### Data analysis

In [4]:
# Split the training set by label: target 0 -> sincere, target 1 -> insincere.
sincere = train_df.loc[train_df['target'] == 0]
insincere = train_df.loc[train_df['target'] == 1]

# Print the text of one random insincere question.
# random.randrange excludes its upper bound, so the position is always valid
# for .iloc; random.randint(0, len(insincere)) could return len(insincere)
# and raise IndexError.
print(insincere.iloc[random.randrange(len(insincere))]['question_text'])

# Class balance: absolute counts and share of the full training set.
print(f"Sincere: {len(sincere)} ({round(100.0 * len(sincere)/len(train_df), 3)}%)")
print(f"Insincere: {len(insincere)} ({round(100.0 * len(insincere)/len(train_df), 3)}%)")

What are the best examples of "CM Modi trolling PM Modi"?
Sincere: 1225312 (93.813%)
Insincere: 80810 (6.187%)


### Naive Bayes


In [5]:
# Bag-of-words features: learn the vocabulary from the training questions and
# encode them as a document-term count matrix in the same call.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_df["question_text"])
print(train_counts.shape)

# Multinomial Naive Bayes with smoothing alpha=1, fitted on the raw counts.
# The fit call stays as the final expression so the notebook echoes the estimator.
model = MultinomialNB(alpha=1)
model.fit(X=train_counts, y=train_df["target"])

(1306122, 195000)


MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [44]:
# Smoke-test the fitted model on a single hand-written sentence.
test_counts = count_vect.transform(("this car is a nice car",))
pred = model.predict(test_counts)
# Index the prediction array directly; going through its raw `.data` buffer
# bypasses normal array indexing and depends on the array's memory layout.
print("Insincere" if pred[0] == 1 else "Sincere")

Sincere


In [45]:
# Encode the test questions with the already-fitted vectorizer (transform only,
# no refit), then predict a label for every row.
test_counts = count_vect.transform(test_df["question_text"])
raw_preds = model.predict(test_counts)

# Cast the predicted labels to int for the submission file.
preds = raw_preds.astype(int)

In [None]:
# Assemble the submission table: each test question id paired with its predicted label.
submission = pd.DataFrame({
    "qid": test_df["qid"],
    "prediction": preds,
})

# Write the CSV without the DataFrame index column.
submission.to_csv("submission.csv", index=False)