In [80]:
import re
import os

import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
from sklearn.metrics import f1_score

In [2]:
# Root data directory, given as a relative path; the raw CSVs are read from
# its 'raw' subfolder and the submission is written to its 'submission' subfolder.
data_path = '../data'

In [3]:
# Read the raw train and test CSVs (in that order) from <data_path>/raw.
d_train, d_test = (
    pd.read_csv(os.path.join(data_path, 'raw', csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

## cleansing

In [41]:
# Shuffle the training rows. The fixed seed makes the row order identical on
# every run, so scores further down do not drift between kernel restarts.
d_train = d_train.sample(frac=1, random_state=42)

lowercase the text

In [6]:
# Lowercase the training text in place of the original column.
d_train['text'] = d_train['text'].str.lower()

In [8]:
# Lowercase the test text the same way as the training text.
d_test['text'] = d_test['text'].str.lower()

tokenize

In [11]:
# Split each lowercased text into a list of tokens with NLTK's word_tokenize.
d_train['token'] = d_train['text'].apply(word_tokenize)
d_test['token'] = d_test['text'].apply(word_tokenize)

remove single-character tokens

In [16]:
# Keep only tokens longer than one character (training set).
d_train['token_cleansed'] = d_train['token'].apply(
    lambda tokens: [tok for tok in tokens if len(tok) > 1]
)

In [17]:
# Keep only tokens longer than one character (test set).
d_test['token_cleansed'] = d_test['token'].apply(
    lambda tokens: [tok for tok in tokens if len(tok) > 1]
)

In [21]:
# Re-assemble the surviving tokens into one space-separated string per row,
# which is the input format the vectorizer consumes.
d_train['text_cleansed'] = d_train['token_cleansed'].apply(" ".join)
d_test['text_cleansed'] = d_test['token_cleansed'].apply(" ".join)

## Feature Extraction

In [43]:
# TF-IDF vectorizer with all-default settings.
tfidf = TfidfVectorizer()

In [44]:
# Fit the vectorizer on the cleansed training text and build the training matrix.
# NOTE(review): the vocabulary and IDF weights are learned from every training
# row before the train/test split and the cross-validation runs below, so the
# held-out rows influence the features. Consider fitting inside a Pipeline;
# confirm whether this is intended.
X = tfidf.fit_transform(d_train.text_cleansed)

In [45]:
# Encode the test text with the vocabulary already fitted on the training text.
X_submission = tfidf.transform(d_test['text_cleansed'])

## Modeling

train test split

In [46]:
# Hold out 20% of the training rows for a quick check of the first model.
# random_state pins the split so the reported F1 can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, d_train.target, test_size=0.2, random_state=42
)

In [47]:
# Bernoulli naive Bayes classifier with default hyperparameters, used for the
# hold-out check below.
bnb = BernoulliNB()

In [48]:
# Train on the training portion of the split (the 20% hold-out is excluded).
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [49]:
# Predict labels for the held-out rows.
y_pred = bnb.predict(X_test)

In [50]:
# F1 of the hold-out predictions against the true hold-out labels.
f1_score(y_true=y_test, y_pred=y_pred)

0.7360655737704918

cross validation

In [51]:
# Fresh, unfitted BernoulliNB instance for the cross-validation run below.
bnb = BernoulliNB()

In [53]:
# Mean F1 of BernoulliNB over 10 cross-validation folds on the full training matrix.
cross_val_score(estimator=bnb, X=X, y=d_train['target'], scoring='f1', cv=10).mean()

0.7445924807584318

In [55]:
# Multinomial naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [57]:
# Mean F1 of MultinomialNB over 10 cross-validation folds.
cross_val_score(estimator=mnb, X=X, y=d_train['target'], scoring='f1', cv=10).mean()

0.7308067057039638

In [60]:
# ComplementNB with default settings, scored by mean F1 over 10 folds.
cnb = ComplementNB()

cross_val_score(estimator=cnb, X=X, y=d_train['target'], scoring='f1', cv=10).mean()

In [63]:
# Support vector classifier with a linear kernel; other settings are defaults.
svc = SVC(kernel='linear')

In [64]:
# Mean F1 of the linear SVC over 10 cross-validation folds.
cross_val_score(estimator=svc, X=X, y=d_train['target'], scoring='f1', cv=10).mean()

0.7552648578004214

In [83]:
# Random forest with 100 trees. The forest is built with randomness, so the
# seed is fixed to make its cross-validation score reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [84]:
# Mean F1 of the random forest over 10 cross-validation folds.
cross_val_score(estimator=rf, X=X, y=d_train['target'], scoring='f1', cv=10).mean()

0.697873822975375

## Tuning

In [78]:
# Sweep the ComplementNB smoothing parameter `alpha` with 5-fold CV and print
# the mean and standard deviation of the per-fold F1 for each value.
# The fold scores are stored in `scores` rather than `f1_score`, so the
# sklearn metric function imported as `f1_score` is not overwritten.
for alpha in [0.1, 0.3, 0.7, 1.0, 1.3, 1.5, 2.0]:
    cnb = ComplementNB(alpha=alpha)

    scores = cross_val_score(cnb, X, d_train.target, cv=5, scoring='f1')
    print("alpha", alpha, "mean", scores.mean(), "std", scores.std())

alpha 0.1 mean 0.746766333631032 std 0.01529396426665522
alpha 0.3 mean 0.7543237532604767 std 0.017037265406951226
alpha 0.7 mean 0.7516523891572081 std 0.017346939514143043
alpha 1.0 mean 0.7449874091936965 std 0.021028852674571007
alpha 1.3 mean 0.7404058723205704 std 0.019776189976999446
alpha 1.5 mean 0.7385599604731288 std 0.01965776236033056
alpha 2.0 mean 0.7357031932533911 std 0.019534015496908776


In [79]:
# Sweep the linear SVC regularisation parameter `C` with 5-fold CV and print
# the mean and standard deviation of the per-fold F1 for each value.
# The fold scores are stored in `scores` rather than `f1_score`, so the
# sklearn metric function imported as `f1_score` is not overwritten.
# The printed label is "C" because that is the parameter being varied.
for c_value in [0.1, 0.3, 0.7, 1.0, 1.3, 1.5, 2.0]:
    svc = SVC(kernel="linear", C=c_value)

    scores = cross_val_score(svc, X, d_train.target, cv=5, scoring='f1')
    print("C", c_value, "mean", scores.mean(), "std", scores.std())

alpha 0.1 mean 0.5940095433308376 std 0.01827678196795304
alpha 0.3 mean 0.7204393360248346 std 0.02259010807627357
alpha 0.7 mean 0.7448064840557631 std 0.01438021016421698
alpha 1.0 mean 0.7499821521853842 std 0.01530601649430986
alpha 1.3 mean 0.7491744008392811 std 0.012441658887757813
alpha 1.5 mean 0.748672808382809 std 0.012735083904404806
alpha 2.0 mean 0.7480481264455092 std 0.012966729797479879


## Submission

In [66]:
# Final model for the submission: ComplementNB with default hyperparameters.
# NOTE(review): the alpha sweep in the tuning section scored highest at
# alpha=0.3; confirm whether the default alpha is intended here.
cnb = ComplementNB()

In [67]:
# Train the submission model on the complete training matrix.
cnb.fit(X, y=d_train['target'])

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [68]:
# Predict labels for the vectorized test set.
y_submission = cnb.predict(X=X_submission)

In [69]:
# Attach the predicted labels to the test frame as the 'target' column.
d_test['target'] = y_submission

In [71]:
# Count how many test rows were assigned to each predicted class.
d_test['target'].value_counts()

0    2125
1    1138
Name: target, dtype: int64

In [72]:
# Write the id/target pairs to <data_path>/submission/submission.csv.
# The folder is created first because to_csv does not create missing directories.
submission_dir = os.path.join(data_path, 'submission')
os.makedirs(submission_dir, exist_ok=True)
d_test[["id", "target"]].to_csv(os.path.join(submission_dir, 'submission.csv'), index=False)