Baseline for the [HAHA 2019 challenge](https://competitions.codalab.org/competitions/22194). After you register for the challenge, you will get access to the training data file, `haha_2019_train.csv`.

Based on the [NB-SVM baseline](https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline).


In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pdb import set_trace

### Reading the texts

In [3]:
# Absolute, machine-specific location of the challenge training CSV.
# Point this at your own copy of `haha_2019_train.csv` before running.
train_csv_path = '/home/farzin/rnn_python_code/tweet_es_finetune/haha_2019_train.csv'
all_texts_df = pd.read_csv(train_csv_path)

In [4]:
# Preview the first rows of the loaded frame to check the columns that were read.
all_texts_df.head()

Unnamed: 0,id,text,is_humor,votes_no,votes_1,votes_2,votes_3,votes_4,votes_5,funniness_average
0,705196579758583809,Niveles de retraso mental: \r\n\r\n— Bajo.\r\n...,1,1,2,2,0,0,0,1.5
1,678040651817213952,"—Vamos Luke desenfunda tu sable, demuestra tu ...",1,1,3,0,1,0,0,1.5
2,546750892213829633,"- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...",1,0,2,1,0,1,1,2.6
3,965807211292364801,No se porqué me hago la cabeza deooos,0,3,0,0,0,0,0,
4,638403841839484928,Quisiera saber que hago durante la siesta de l...,0,4,0,1,0,0,0,


In [5]:
# Fix the seed so the shuffle (and therefore the split) is reproducible.
rnd_seed = 20190313
np.random.seed(rnd_seed)

# Shuffle the row positions, then carve the split off the END of the permutation:
#   [ train ............ | validation | held-out test ]
idx = np.random.permutation(len(all_texts_df))
test_cut = int(0.15 * len(idx))
# NOTE(review): this used to read `len(idx-test_cut)`, which subtracts test_cut
# from every index VALUE and leaves the length untouched, so the validation
# slice has always been 15% of the full data set. It is written explicitly here
# so the split sizes stay the same; use `len(idx) - test_cut` instead if
# "15% of the remainder" was the intent.
valid_cut = int(0.15 * len(idx))

df_train = all_texts_df.iloc[idx[:-(valid_cut+test_cut)],:]
# Despite its name, `df_test` is the validation slice; the last `test_cut`
# shuffled rows are not used here and remain available as a final hold-out:
df_test  = all_texts_df.iloc[idx[-(valid_cut+test_cut):-test_cut],:]
# df_test      = all_texts_df.iloc[idx[-test_cut:],:]

In [6]:
# Sanity-check the (rows, columns) of the two splits.
df_train.shape, df_test.shape

((16800, 10), (3600, 10))

In [7]:
import re, string

# Every character in this class becomes a stand-alone token: ASCII punctuation
# plus the extra typographic / Spanish symbols listed after it.
# `re.escape` is required: interpolated raw, the `\]` inside string.punctuation
# is parsed as an escaped `]`, so the backslash itself was never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Split text into whitespace-separated tokens, isolating punctuation.

    Each punctuation character matched by `re_tok` is padded with spaces so it
    comes out as its own token; the padded string is then split on whitespace.

    Parameters
    ----------
    s : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [8]:
# Row count of the full data set (not referenced by the cells shown here).
n = all_texts_df.shape[0]
# TF-IDF over unigrams and bigrams produced by the custom `tokenize`.
# The three switches are passed as real booleans: they used to be the integer 1,
# which recent scikit-learn releases reject when validating boolean parameters.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)
# Learn vocabulary and IDF weights from the training split only, then reuse
# them to project the evaluation split into the same feature space.
trn_term_doc = vec.fit_transform(df_train['text'])
test_term_doc = vec.transform(df_test['text'])

In [9]:
# Show both document-term matrices to confirm they share the same number of columns.
trn_term_doc, test_term_doc

(<16800x22072 sparse matrix of type '<class 'numpy.float64'>'
 	with 388461 stored elements in Compressed Sparse Row format>,
 <3600x22072 sparse matrix of type '<class 'numpy.float64'>'
 	with 79895 stored elements in Compressed Sparse Row format>)

In [10]:
def pr(y_i, y, feats=None):
    """Smoothed per-feature sum for the rows whose label equals `y_i`.

    Selects the rows of the feature matrix where `y == y_i`, sums them
    column-wise, and applies add-one smoothing to both the sum and the
    row count.

    Parameters
    ----------
    y_i : scalar
        Label value whose rows are selected.
    y : array-like
        One label per row of the feature matrix; must support `y == y_i`
        as an element-wise comparison (e.g. a numpy array).
    feats : matrix-like, optional
        Feature matrix to read. When omitted, the notebook-level global `x`
        is used, which keeps existing two-argument calls working; that global
        must be defined before such a call.

    Returns
    -------
    Column-wise `(sum + 1) / (count + 1)` over the selected rows, in whatever
    array/matrix type `feats[...].sum(0)` yields.
    """
    if feats is None:
        # Backward-compatible fallback to the notebook global.
        feats = x
    p = feats[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [11]:
# Short aliases for the two document-term matrices. `x` is the global that
# pr() and get_mdl() read, so it must be bound before either is called.
x, test_x = trn_term_doc, test_term_doc

In [12]:
def get_mdl(y):
    """Fit an NB-weighted logistic regression on the global training matrix `x`.

    The log-count ratio `r = log(pr(1, y) / pr(0, y))` is computed per feature,
    every row of `x` is scaled by it, and a logistic regression is trained on
    the scaled features.

    Parameters
    ----------
    y : array-like
        Binary (0/1) labels aligned with the rows of `x`; a pandas Series or
        a numpy array both work.

    Returns
    -------
    (model, r)
        The fitted `LogisticRegression` and the log-count ratio. The same `r`
        must be applied to any matrix passed to the model at prediction time.
    """
    y = np.asarray(y)
    r = np.log(pr(1,y) / pr(0,y))
    # The dual formulation is only implemented by liblinear. Pin the solver so
    # this keeps working on scikit-learn versions whose default solver is lbfgs
    # (where `dual=True` raises ValueError).
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [13]:
# Target columns to model; one column of probabilities is produced per target.
label_cols = ['is_humor']
preds = np.zeros((len(df_test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(df_train[j])
    # Scale the evaluation features by the same ratio used during training,
    # then keep the probability of the positive class (column 1).
    test_x_nb = test_x.multiply(r)
    positive_proba = m.predict_proba(test_x_nb)[:, 1]
    preds[:, i] = positive_proba

fit is_humor




In [14]:
# Threshold the probabilities at 0.5 and compare with the labels.
# NOTE: despite its name, `TP` is a mask of ALL correct predictions
# (both classes), not only true positives.
hard_preds = (preds > 0.5).astype(int)
TP = (hard_preds == df_test[label_cols].values)

In [15]:
# Share of evaluation rows whose thresholded prediction matches the label.
accuracy = TP.sum() / len(df_test)
f'Accuracy: {accuracy}'

'Accuracy: 0.8391666666666666'