In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from functools import reduce
import random
import re
from tqdm import tqdm_notebook as tqdm

In [None]:
# Input locations, all resolved relative to DATA_DIR.
DATA_DIR = "../input"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV = f"{DATA_DIR}/test.csv"

# Read both CSV files into DataFrames in one pass.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Report the dimensions and column names of each frame.
print(f"Train shape: {train_df.shape}; cols: {list(train_df.columns)}")
print(f"Test shape: {test_df.shape}; cols: {list(test_df.columns)}")

In [None]:
# Show one randomly chosen training row as a plain list of its column values.
# random.randrange excludes the upper bound, so the position is always valid
# for .iloc; random.randint includes it and could index one past the last row.
list(train_df.iloc[random.randrange(len(train_df))])

### Data analysis

In [None]:
# Split the training rows by label: target == 0 goes to `sincere`,
# target == 1 goes to `insincere`.
sincere = train_df.loc[train_df['target'] == 0]
insincere = train_df.loc[train_df['target'] == 1]

# Print the text of one random insincere question.
# random.randrange excludes the upper bound, so the position is always a valid
# row for .iloc; random.randint includes it and could raise IndexError.
print(insincere.iloc[random.randrange(len(insincere))]['question_text'])

# Class balance: row count and share of the whole training set, in percent.
print(f"Sincere: {len(sincere)} ({round(100.0 * len(sincere)/len(train_df), 3)}%)")
print(f"Insincere: {len(insincere)} ({round(100.0 * len(insincere)/len(train_df), 3)}%)")

### Naive Bayes


In [None]:
# Learn the vocabulary from the training questions and turn each question
# into a vector of token counts.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_df["question_text"])
print(train_counts.shape)

# Fit a multinomial Naive Bayes classifier on the count features;
# alpha is the additive smoothing parameter.
model = MultinomialNB(alpha=1)
model.fit(X=train_counts, y=train_df["target"])

In [None]:
# Sanity-check the fitted model on a single hand-written sentence.
# transform() takes an iterable of documents, hence the one-element tuple.
test_counts = count_vect.transform(("this car is a nice car",))
pred = model.predict(test_counts)
# predict() returns one label per input document; index the array directly
# instead of going through its raw .data buffer.
print("Insincere" if pred[0] == 1 else "Sincere")

In [None]:
# Vectorize the test questions with the vocabulary already fitted on the
# training data, then predict a label for every row as plain integers.
test_counts = count_vect.transform(test_df["question_text"])
preds = (
    model
    .predict(test_counts)
    .astype(int)
)

In [None]:
# Pair every test question id with its predicted label.
submission = pd.DataFrame({'qid': test_df['qid'], 'prediction': preds})

# Write the result without the DataFrame index column.
submission.to_csv('submission.csv', index=False)