In [None]:
import numpy as np
import pandas as pd

from functools import reduce
from collections import Counter
import random
import re
from tqdm import tqdm_notebook as tqdm

In [None]:
# Input locations, relative to the notebook's working directory.
DATA_DIR = "../input"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV = f"{DATA_DIR}/test.csv"

# Load both splits up front so every later cell works from the same frames.
train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

# Quick sanity check of what was loaded: dimensions and column names.
print(f"Train shape: {train_df.shape}; cols: {list(train_df.columns)}")
print(f"Test shape: {test_df.shape}; cols: {list(test_df.columns)}")

In [None]:
# Randomly show a train example.
# random.randint includes its upper bound, so randint(0, len(train_df)) could
# ask iloc for a position one past the end; randrange excludes the bound.
list(train_df.iloc[random.randrange(len(train_df))])

In [None]:
# Split the training rows by label: target == 0 -> sincere, target == 1 -> insincere.
sincere = train_df.loc[train_df['target'] == 0]
insincere = train_df.loc[train_df['target'] == 1]

# Print one random insincere question. randrange excludes its upper bound, so
# the position is always valid for iloc (randint(0, len(...)) could overshoot by one).
print(insincere.iloc[random.randrange(len(insincere))]['question_text'])

# Class balance as counts and percentages of the whole training set.
print(f"Sincere: {len(sincere)} ({round(100.0 * len(sincere)/len(train_df), 3)}%)")
print(f"Insincere: {len(insincere)} ({round(100.0 * len(insincere)/len(train_df), 3)}%)")

In [None]:
def tokenize(s: str) -> set:
    """Return the set of distinct lower-cased tokens found in ``s``.

    A token is a maximal run of word characters (letters, digits,
    underscore) and apostrophes; every other character acts as a
    separator. Duplicates collapse, so each token appears at most once.

    Args:
        s: The text to tokenize.

    Returns:
        A set of lower-cased token strings (empty if ``s`` has no tokens).
    """
    # A regex match can never contain a space, so splitting each token on " "
    # and concatenating the pieces with reduce() was a no-op with quadratic
    # cost; a single pass over the matches yields the same set.
    return {w.lower() for w in re.findall(r"[\w']+", s)}

In [None]:
def train(df):
    """Count, per token, how many questions of each class contain it.

    Args:
        df: DataFrame with a 'question_text' column and a 'target' column
            (0 for sincere, 1 for insincere).

    Returns:
        A ``(p_sincere, p_insincere)`` pair of Counters mapping each token
        to the number of sincere / insincere questions it occurs in. These
        are raw counts, not probabilities. Because ``tokenize`` returns a
        set, a token is counted at most once per question.
    """
    p_sincere = Counter()
    p_insincere = Counter()

    # Iterate over the frame that was passed in; the previous version read the
    # global train_df and silently ignored its argument.
    rows = zip(df['question_text'], df['target'])
    for question, t in tqdm(rows, total=len(df)):
        for w in tokenize(question):
            # Deliberately touch BOTH counters for every token (adding 0 for
            # the other class) so both end up with the same key set; the
            # membership checks in predict() rely on that.
            p_sincere[w] += int(t == 0)
            p_insincere[w] += int(t == 1)

    return p_sincere, p_insincere

In [None]:
# Fit the per-class token counters on the full training frame.
p_sincere, p_insincere = train(train_df)

# Sum of all recorded counts per class; predict() uses these in its
# smoothing denominators.
num_sincere_words = sum(p_sincere.values())
num_insincere_words = sum(p_insincere.values())

# Vocabulary size: number of distinct tokens present in either counter.
voc_len = len(p_sincere.keys() | p_insincere.keys())

In [None]:
def predict(question, alpha=1):
    """Label a question by comparing additive-smoothed class log-scores.

    Reads the module-level ``p_sincere``, ``p_insincere``,
    ``num_sincere_words``, ``num_insincere_words`` and ``voc_len``.

    Args:
        question: Text to classify; it is split with ``tokenize``.
        alpha: Smoothing constant added to every token count.

    Returns:
        ``(1, insincere_log_score)`` when the insincere score is greater
        than or equal to the sincere one, otherwise
        ``(0, sincere_log_score)``.
    """
    # Both classes start from the same prior of 0.5.
    sincere_score = insincere_score = np.log(0.5)

    # Reciprocal of the smoothed denominator for each class.
    sincere_scale = 1.0 / (num_sincere_words + voc_len * alpha)
    insincere_scale = 1.0 / (num_insincere_words + voc_len * alpha)

    for token in tokenize(question):
        # Tokens missing from a counter contribute nothing to that class.
        sincere_hits = p_sincere.get(token)
        if sincere_hits is not None:
            sincere_score += np.log(sincere_scale * (sincere_hits + alpha))

        insincere_hits = p_insincere.get(token)
        if insincere_hits is not None:
            insincere_score += np.log(insincere_scale * (insincere_hits + alpha))

    # Ties go to the insincere label.
    if insincere_score >= sincere_score:
        return (1, insincere_score)
    return (0, sincere_score)

In [None]:
# Smoke-test the classifier on a hand-written question; the cell displays the
# (label, log-score) tuple returned by predict().
predict("Why do Americans have an average IQ of 78 wheres I have an IQ of 159?")

In [None]:
preds = []

# Select the text column by name rather than unpacking itertuples() rows
# positionally: that unpacking raised ValueError as soon as test.csv had any
# column besides (qid, text), and silently depended on column order.
# NOTE(review): assumes the test file names its text column 'question_text',
# like the training file - confirm against the printed column list.
for qtext in tqdm(test_df['question_text'], total=len(test_df)):
    p, _ = predict(qtext, alpha=1)  # keep the label, drop the log-score
    preds.append(p)

# One predicted label per test row, keyed by the question id.
submission = pd.DataFrame.from_dict({
    'qid': test_df['qid'],
    'prediction': preds
})
submission.to_csv('submission.csv', index=False)