In [None]:
import numpy as np
import pandas as pd

from functools import reduce
import random
import re
from tqdm import tqdm_notebook as tqdm

In [None]:
# Input locations; DATA_DIR is a relative path, so it is resolved against the
# notebook's working directory.
DATA_DIR = "../input"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV = f"{DATA_DIR}/test.csv"

# Load both CSV files fully into memory as DataFrames.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Sanity check: report the (rows, columns) shape and the column names of each frame.
print(f"Train shape: {train_df.shape}; cols: {list(train_df.columns)}")
print(f"Test shape: {test_df.shape}; cols: {list(test_df.columns)}")

In [None]:
# Show one randomly chosen training row as a plain list of its column values.
# randrange() excludes its upper bound; randint(0, len(train_df)) includes it
# and could therefore hand .iloc an out-of-range position (IndexError).
list(train_df.iloc[random.randrange(len(train_df))])

In [None]:
# Split the training rows by label: target == 0 is sincere, target == 1 is insincere.
sincere = train_df.loc[train_df['target'] == 0]
insincere = train_df.loc[train_df['target'] == 1]

# Print one random insincere question. randrange() excludes its upper bound;
# randint(0, len(insincere)) includes it and could make .iloc raise IndexError.
print(insincere.iloc[random.randrange(len(insincere))]['question_text'])

# Class balance: absolute row counts and their share of the whole training frame.
print(f"Sincere: {len(sincere)} ({round(100.0 * len(sincere)/len(train_df), 3)}%)")
print(f"Insincere: {len(insincere)} ({round(100.0 * len(insincere)/len(train_df), 3)}%)")

In [None]:
# Lookup table mapping an English contraction to its space-separated expansion.
#
# Every key MUST be lowercase: expand_contraction() lowercases its argument
# before the lookup, so a key such as "I'm" could never be matched. The
# first-person entries are therefore stored as "i'm", "i've", ... and their
# expansions are lowercase as well, so the resulting token is the same "i" that
# a standalone "I" is tokenized to.
#
# NOTE(review): "'d" is ambiguous (had / would); the table picks one expansion
# per entry and is not consistent across pronouns (e.g. "he'd" vs "it'd").
CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [None]:
def expand_contraction(w):
    """Return the expansion of a contraction, or the normalised word itself.

    The word is normalised first: surrounding whitespace is stripped, it is
    lowercased, and any space characters inside it are removed. The normalised
    form is looked up in ``CONTRACTIONS``; on a miss the normalised form (not
    the original argument) is returned.
    """
    normalised = "".join(w.strip().lower().split(" "))
    if normalised in CONTRACTIONS:
        return CONTRACTIONS[normalised]
    return normalised

def tokenize(s: str):
    """Split a text into a set of lowercase tokens with contractions expanded.

    Tokens are maximal runs of word characters, hyphens and apostrophes. Each
    one is lowercased and passed through ``expand_contraction``; a multi-word
    expansion contributes one token per word.

    Args:
        s: the text to tokenize.

    Returns:
        A ``set`` of tokens, so a word repeated in ``s`` appears only once.
        A non-string ``s`` (for example the float NaN pandas uses for a missing
        cell) yields an empty set instead of a ``TypeError`` from ``re``.
    """
    if not isinstance(s, str):
        return set()

    # A set comprehension flattens the per-word expansions directly; the
    # previous reduce(list + list) re-copied the accumulated list on every step.
    return {
        part
        for raw in re.findall(r"[-\w']+", s)
        for part in expand_contraction(raw.lower()).split(" ")
    }

# Smoke checks: expand_contraction lowercases its input before the lookup, and
# tokenize returns a set, so the repeated words in the sentence appear once.
print(expand_contraction("yOu\'Ve"))
print(tokenize("Thank you're the man, my man, thank you!"))

In [None]:
def train(df):
    """Collect per-class token counts from a labelled frame.

    Args:
        df: DataFrame with a ``question_text`` column and a ``target`` column
            (0 = sincere, 1 = insincere).

    Returns:
        ``(voc, sincere_words_no, insincere_words_no)`` where ``voc[word]`` is
        a ``(sincere_count, insincere_count)`` tuple and the two totals are the
        number of tokens seen in sincere / insincere questions. ``tokenize``
        returns a set, so a word is counted at most once per question.
    """
    voc = {}
    sincere_words_no = 0
    insincere_words_no = 0

    # Iterate the frame that was passed in (the global train_df was used here
    # before, which silently ignored the argument). Giving tqdm the total lets
    # it render a real progress bar instead of a bare counter.
    rows = zip(df['question_text'], df['target'])
    for q, t in tqdm(rows, total=len(df)):
        is_sincere = int(t == 0)
        is_insincere = int(t == 1)

        for w in tokenize(q):
            sincere_words_no += is_sincere
            insincere_words_no += is_insincere

            s, i = voc.get(w, (0, 0))
            voc[w] = (s + is_sincere, i + is_insincere)

    return voc, sincere_words_no, insincere_words_no

In [None]:
# Run training on the training frame, then report the vocabulary size and the
# two per-class token totals returned by train().
voc, sincere_words_no, insincere_words_no = train(train_df)
print(len(voc), sincere_words_no, insincere_words_no)

In [None]:
def predict(params, question, alpha=1):
    """Score a question against both classes and return the better one.

    Args:
        params: ``(voc, sincere_words_no, insincere_words_no)`` in that order.
        question: text to classify; it is split with ``tokenize``.
        alpha: additive smoothing constant applied to every word count.

    Returns:
        ``(1, insincere_log_score)`` when the insincere score is greater than
        or equal to the sincere one, otherwise ``(0, sincere_log_score)``.
    """
    vocabulary, sincere_total, insincere_total = params

    # Smoothed denominators: class token total plus alpha per vocabulary entry.
    sincere_denom = (sincere_total + len(vocabulary) * alpha)
    insincere_denom = (insincere_total + len(vocabulary) * alpha)

    # Both classes start from the same fixed prior of 0.5.
    sincere_score = np.log(0.5)
    insincere_score = np.log(0.5)

    for token in tokenize(question):
        # Tokens absent from the vocabulary contribute to neither score.
        if token not in vocabulary:
            continue
        sincere_count, insincere_count = vocabulary[token]
        sincere_score += np.log(1.0 * (sincere_count + alpha) / sincere_denom)
        insincere_score += np.log(1.0 * (insincere_count + alpha) / insincere_denom)

    # A tie is resolved in favour of the insincere label.
    if insincere_score >= sincere_score:
        return (1, insincere_score)
    return (0, sincere_score)
    
# Bundle the trained statistics in the order predict() unpacks them.
params = (voc, sincere_words_no, insincere_words_no)
# Spot-check the classifier on a single hand-written question.
predict(params, question="Why do Americans have an average IQ of 78 wheres I have an IQ of 159?")

In [None]:
# Predict a label for every test row, in the frame's row order.
preds = []

# itertuples() yields (index, qid, question text) here; only the text is used.
for _, qid, qtext in tqdm(test_df.itertuples()):
    label, _score = predict(params, qtext, alpha=1)
    preds.append(int(label))

# Pair each qid with its predicted label and write the result without the index.
submission_columns = {
    'qid': test_df['qid'],
    'prediction': preds
}
submission = pd.DataFrame.from_dict(submission_columns)
submission.to_csv('submission.csv', index=False)