In [1]:
# Load (or reload) IPython's autoreload extension; mode 2 re-imports edited
# modules before each cell executes, so changes to local .py files are picked
# up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

# Reading Data

In [3]:
# Load the three competition workbooks (labelled training stories, unlabelled
# test stories, sample submission) from the working directory, in that order.
train, test, sub = (
    pd.read_excel(workbook)
    for workbook in ('Data_Train.xlsx', 'Data_Test.xlsx', 'Sample_submission.xlsx')
)

# Display (rows, columns) of each frame as a quick sanity check.
tuple(frame.shape for frame in (train, test, sub))

((7628, 2), (2748, 1), (2748, 1))

# Data PreProcessing

In [4]:
import re
import itertools
def remove_noise(text):
    """Lowercase every document in a Series and collapse its whitespace.

    Parameters
    ----------
    text : pandas.Series
        Raw documents. Cells do not have to be strings: missing values
        (None / NaN) become the empty string and any other value is passed
        through ``str`` before cleaning.

    Returns
    -------
    pandas.Series
        Series with the same index as ``text`` whose values are strings that
        are lowercased, stripped of leading/trailing whitespace, and have every
        internal run of whitespace replaced by a single space.
    """
    def _normalise(value):
        # Missing cells would otherwise turn into the literal token "nan".
        if pd.isna(value):
            return ""
        # Convert to str *before* calling string methods so numeric or other
        # non-string cells cannot raise AttributeError.
        # split() with no argument drops leading/trailing whitespace and
        # splits on any whitespace run, so re-joining with " " both strips
        # and collapses in a single pass.
        return " ".join(str(value).lower().split())

    return text.apply(_normalise).astype(str)

In [5]:
# Clean the STORY column of both frames with the same normalisation so the
# vectorizers see train and test text in an identical form.
train['STORY'], test['STORY'] = map(remove_noise, (train['STORY'], test['STORY']))

# Feature Engineering

In [6]:
from nltk.tokenize import WordPunctTokenizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Tokenizer instance; its `tokenize` method is handed to CountVectorizer as the
# custom tokenizer for the count-based features.
tt = WordPunctTokenizer()

In [9]:
# Bag-of-n-grams counts: tokens come from the WordPunctTokenizer instance,
# English stop words are removed, and word 1- to 3-grams are counted.
# The vocabulary is learned from the train and test stories together, so
# n-grams that occur only in the test set still get a column.
cv = CountVectorizer(
    tokenizer=tt.tokenize,
    stop_words='english',
    ngram_range=(1, 3),
).fit(train.STORY.tolist() + test.STORY.tolist())

# Sparse document-term count matrices for each split.
train_cv, test_cv = cv.transform(train.STORY), cv.transform(test.STORY)

In [10]:
# TF-IDF features over word 1- to 3-grams (tokens are runs of one or more
# word characters, accents stripped, English stop words removed), keeping only
# terms that appear in at least 3 documents.
# use_idf / smooth_idf / sublinear_tf are boolean switches in scikit-learn's
# API; pass real booleans rather than the float 1.0, which scikit-learn
# releases with parameter validation reject when fit() is called.
tf_idf = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words = 'english')

# Vocabulary and IDF weights are learned from the train and test stories
# together; each split is then projected into that shared feature space.
tf_idf.fit(train.STORY.tolist() + test.STORY.tolist())
train_tf = tf_idf.transform(train.STORY)
test_tf = tf_idf.transform(test.STORY)

# Modeling

In [11]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC

In [12]:
%%time
# Six classifiers are fitted on the SECTION labels: three on the raw n-gram
# counts (train_cv) and three on the TF-IDF matrix (train_tf). Their predicted
# probabilities are combined in the ensembling cell.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm it is still accepted by the installed version.

# --- Count features ---------------------------------------------------------
# One-vs-rest logistic regression with L2 penalty and balanced class weights.
best_params = {'C': 0.9, 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2'}
m = LogisticRegression(**best_params, class_weight='balanced')
m.fit(train_cv, train.SECTION)

# Multinomial naive Bayes on the n-gram counts.
best_params = {'alpha': 0.5}
nb = MultinomialNB(**best_params)
nb.fit(train_cv, train.SECTION)

# Bernoulli naive Bayes on the same matrix, with near-zero smoothing.
best_params = {'alpha': 5e-08}
nb2 = BernoulliNB(**best_params)
nb2.fit(train_cv, train.SECTION)

# --- TF-IDF features --------------------------------------------------------
# Same logistic-regression setup as above but with a much larger C
# (weaker regularisation).
best_params = {'C': 250, 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2'}
m_tf = LogisticRegression(**best_params, class_weight='balanced')
m_tf.fit(train_tf, train.SECTION)

# Multinomial naive Bayes on the TF-IDF weights.
best_params = {'alpha': 0.06}
nb_tf = MultinomialNB(**best_params)
nb_tf.fit(train_tf, train.SECTION)

# Linear-kernel SVM. probability=True makes scikit-learn calibrate
# probabilities with an internally shuffled cross-validation, so a fixed
# random_state is required for predict_proba (and therefore the ensemble)
# to be reproducible from run to run.
best_params = {'kernel': 'linear', 'C': 1}
svc_tf = SVC(**best_params, class_weight='balanced', probability=True, random_state=42)
svc_tf.fit(train_tf, train.SECTION)



CPU times: user 2min, sys: 6.17 s, total: 2min 6s
Wall time: 1min 29s


# Ensembling

In [13]:
from scipy.stats.mstats import mode

# Class-probability estimates for the test stories from each fitted model;
# every model is scored on the feature matrix it was trained on.
pred1 = m.predict_proba(test_cv)
pred2 = nb.predict_proba(test_cv)
pred3 = nb2.predict_proba(test_cv)
pred4 = m_tf.predict_proba(test_tf)
pred5 = nb_tf.predict_proba(test_tf)
pred6 = svc_tf.predict_proba(test_tf)

# predict_proba columns follow each estimator's classes_ order. Averaging the
# matrices is only meaningful if that order is the same for all six models.
assert all(
    np.array_equal(m.classes_, est.classes_)
    for est in (nb, nb2, m_tf, nb_tf, svc_tf)
), "models disagree on class order; probabilities cannot be averaged column-wise"

# Soft voting: unweighted mean of the six probability matrices.
test_pred_prob = np.mean([pred1, pred2, pred3, pred4, pred5, pred6], axis=0)

# argmax yields a column position, not a label. Translate it through classes_
# so the prediction is the actual SECTION value even when the labels are not
# exactly 0..k-1.
pred = m.classes_[np.argmax(test_pred_prob, axis=1)]
# from sklearn.metrics import accuracy_score
# accuracy_score(test.SECTION, pred)

In [14]:
# Build the submission frame with a single integer SECTION column holding the
# ensemble's predictions.
submission = pd.DataFrame({'SECTION': pred.astype(int)})

# Write it to an Excel workbook without the row index.
submission.to_excel("ml_prob.xlsx", index=None)

# Preview the first rows.
submission.head()

Unnamed: 0,SECTION
0,1
1,2
2,1
3,0
4,1
