# Data

In [7]:
!head -3 products_sentiment_train.tsv

2 . take around 10,000 640x480 pictures .	1
i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .	1
the wrt54g plus the hga7t is a perfect solution if you need wireless coverage in a wider area or for a hard-walled house as was my case .	1


In [8]:
!wc -l products_sentiment_train.tsv

2000 products_sentiment_train.tsv


In [4]:
!head -3 products_sentiment_test.tsv

Id	text
0	so , why the small digital elph , rather than one of the other cameras with better resolution or picture quality ? size [ + 2 ] # # because , unless it 's small , i won 't cary it around .
1	3/4 way through the first disk we played on it ( naturally on 31 days after purchase ) the dvd player froze . 


In [9]:
!wc -l products_sentiment_test.tsv

501 products_sentiment_test.tsv


In [11]:
!head -5 products_sentiment_sample_submission.csv

Id,y
0,0
1,1
2,0
3,1


In [1]:
import pandas as pd
import numpy as np

In [4]:
# The training file has no header row (its first line is already a review),
# so the two tab-separated columns are read unnamed here.
data = pd.read_csv("products_sentiment_train.tsv", sep="\t", header=None)

In [6]:
data.columns = ["text", "target"]

In [7]:
data.head(2)

Unnamed: 0,text,target
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1


In [8]:
data.shape

(2000, 2)

In [9]:
data.target.value_counts()

1    1274
0     726
Name: target, dtype: int64

# Baseline

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

In [13]:
# One 5-fold stratified, shuffled splitter with a fixed seed, passed to every
# cross_val_score call in this notebook.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [153]:
# Baseline: TF-IDF over character 3- to 5-grams ("char_wb" analyzer) feeding a
# logistic regression with balanced class weights.
p = Pipeline(steps=[
    ("txt", TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))),
    ("clf", LogisticRegression(class_weight="balanced")),
])

In [154]:
cross_val_score(p, data.text, data.target, cv=cv, n_jobs=-1).mean()

0.78950317814486337

# Regex

In [77]:
!pip install regex

Collecting regex
  Downloading regex-2017.09.23.tar.gz (607kB)
[K    100% |████████████████████████████████| 614kB 375kB/s ta 0:00:01
[?25hBuilding wheels for collected packages: regex
  Running setup.py bdist_wheel for regex ... [?25l- \ | / - \ | / done
[?25h  Stored in directory: /home/sergey/.cache/pip/wheels/85/91/3e/66b0bd43d75bdc4a26fbe816e523ffc815fa8990117bdfc3b3
Successfully built regex
Installing collected packages: regex
Successfully installed regex-2017.9.23


In [78]:
import regex as re

In [87]:
# Sanity check of the token pattern: runs of 2+ ASCII letters or 2+ punctuation
# characters are replaced, while a lone "." is left alone.
# The pattern is written as u"..." with an escaped backslash instead of ur"...":
# the `ur` prefix is a syntax error on Python 3, whereas this literal is the same
# string on both Python 2 and Python 3.
re.sub(u"([A-Za-z]{2,})|(\\p{P}{2,})", "dog", "cat . :)")

'dog . dog'

In [125]:
# Token pattern: a run of two or more ASCII letters, or a run of two or more
# punctuation characters (so emoticons such as ":)" and ":-(" survive as tokens).
# Written as u"..." with an escaped backslash rather than ur"...": the `ur`
# prefix does not exist on Python 3, and this literal is the identical string
# on Python 2 and Python 3.
pattern = u"[A-Za-z]{2,}|\\p{P}{2,}"

In [126]:
regex_pattern = re.compile(pattern)

In [128]:
regex_pattern.findall("cat . :) dog . . :-(")

['cat', ':)', 'dog', ':-(']

In [132]:
def regex_tokenizer(text):
    """Tokenize ``text`` by collecting every match of ``regex_pattern``.

    Used as the ``tokenizer`` callable of the vectorizers in this notebook.

    Returns whatever ``regex_pattern.findall(text)`` yields: a list of the
    matched substrings, in order of appearance.
    """
    tokens = regex_pattern.findall(text)
    return tokens

In [159]:
# Word-level variant: TF-IDF over 1- to 4-grams of the tokens produced by
# `regex_tokenizer`, followed by the same balanced logistic regression.
p = Pipeline(steps=[
    ("txt", TfidfVectorizer(ngram_range=(1, 4), tokenizer=regex_tokenizer)),
    ("clf", LogisticRegression(class_weight="balanced")),
])

In [160]:
cross_val_score(p, data.text, data.target, cv=cv, n_jobs=-1).mean()

0.78052696266851673

# Training best model

In [161]:
# Same configuration as the baseline (char_wb 3- to 5-gram TF-IDF + balanced
# logistic regression), rebuilt here so it can be fit on the whole training set.
p = Pipeline(steps=[
    ("txt", TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))),
    ("clf", LogisticRegression(class_weight="balanced")),
])

In [162]:
cross_val_score(p, data.text, data.target, cv=cv, n_jobs=-1).mean()

0.78950317814486337

In [163]:
p.fit(data.text, data.target)

Pipeline(memory=None,
     steps=[('txt', TfidfVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 5), norm=u'l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [168]:
# Unlike the training file, the test file starts with an `Id<TAB>text` header row;
# `Id` becomes the index so it travels with the predictions when they are saved.
test = pd.read_csv("products_sentiment_test.tsv", sep="\t", index_col="Id")

In [169]:
test.head(2)

Unnamed: 0_level_0,text
Id,Unnamed: 1_level_1
0,"so , why the small digital elph , rather than ..."
1,3/4 way through the first disk we played on it...


In [170]:
test["y"] = p.predict(test.text)

In [172]:
test.head(2)

Unnamed: 0_level_0,text,y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"so , why the small digital elph , rather than ...",1
1,3/4 way through the first disk we played on it...,0


In [173]:
# Save only the predicted labels; `Id` is the index, so the file gets the same
# `Id,y` header as the sample submission.
test[["y"]].to_csv("tf-idf-3-5-lr.csv")

In [175]:
!head -4 tf-idf-3-5-lr.csv

Id,y
0,1
1,0
2,1


# Regex + CountVectorizer

In [180]:
# Raw counts instead of TF-IDF: unigrams and bigrams of `regex_tokenizer`
# tokens, followed by the balanced logistic regression.
p = Pipeline(steps=[
    ("txt", CountVectorizer(ngram_range=(1, 2), tokenizer=regex_tokenizer)),
    ("clf", LogisticRegression(class_weight="balanced")),
])

In [181]:
cross_val_score(p, data.text, data.target, cv=cv, n_jobs=-1).mean()

0.7730006593791211

# Words + Chars

In [187]:
tf_idf_chars = TfidfVectorizer(ngram_range=(3, 5), analyzer="char_wb")

In [188]:
tf_idf_words = TfidfVectorizer(ngram_range=(1, 4))

In [189]:
chars = tf_idf_chars.fit_transform(data.text)

In [190]:
words = tf_idf_words.fit_transform(data.text)

In [191]:
import scipy

In [192]:
X = scipy.sparse.hstack([chars, words])

In [193]:
lr = LogisticRegression(class_weight="balanced")

In [194]:
# NOTE(review): `X` comes from vectorizers that were fit on all of `data.text`, so the
# TF-IDF vocabulary and IDF weights have already seen every validation fold. The
# Pipeline-based scores elsewhere in this notebook refit the vectorizer inside each
# fold, so this number is not measured under the same conditions — TODO confirm the
# gain by scoring a Pipeline/FeatureUnion on `data.text` instead.
cross_val_score(lr, X, data.target, cv=cv, n_jobs=-1).mean()

0.79500945005906287

In [199]:
lr.fit(X, data.target)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [195]:
# Vectorize the test texts with the already-fitted vectorizers, keeping the
# same column order as the training matrix (chars first, then words).
X_test = scipy.sparse.hstack([
    tf_idf_chars.transform(test.text),
    tf_idf_words.transform(test.text),
])

In [200]:
test["y"] = lr.predict(X_test)

In [201]:
test.head(2)

Unnamed: 0_level_0,text,y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"so , why the small digital elph , rather than ...",1
1,3/4 way through the first disk we played on it...,0


In [202]:
test[["y"]].to_csv("tf-idf-chars-words-lr.csv")

# Most important words

In [203]:
# L1-penalised model, fit only to find which word n-grams receive a non-zero weight.
# The solver is spelled out because the L1 penalty depends on it: liblinear is the
# solver shown in the fitted model's repr, while newer scikit-learn releases default
# to a solver that rejects penalty="l1".
p = Pipeline([
    ("txt", tf_idf_words),
    ("clf", LogisticRegression(class_weight="balanced", penalty="l1", solver="liblinear"))
])

In [204]:
p.fit(data.text, data.target)

Pipeline(memory=None,
     steps=[('txt', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm=u'l2', preprocessor=None, smooth_idf=True,...ty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [212]:
# Boolean mask over the word vocabulary: True where the L1 model kept a non-zero weight.
# The classifier is looked up through the pipeline's public `named_steps` mapping
# rather than the private `_final_estimator` attribute.
important_words = p.named_steps["clf"].coef_[0] != 0

In [213]:
# Number of selected word n-grams. Counted by NumPy instead of the Python builtin
# `sum`, which walks the whole vocabulary-sized mask one element at a time.
np.count_nonzero(important_words)

34

In [215]:
np.array(tf_idf_words.get_feature_names())[important_words]

array([u'after', u'all', u'and', u'annoying', u'best', u'but', u'buttons',
       u'camera', u'easy', u'excellent', u'features', u'get', u'good',
       u'great', u'love', u'norton', u'not', u'only', u'phone', u'price',
       u'problem', u'quality', u'small', u'they', u'this', u'to', u'use',
       u'very', u'was', u'when', u'with', u'works', u'would', u'your'], 
      dtype='<U42')

In [224]:
stop_words = np.array(tf_idf_words.get_feature_names())[~important_words]

In [225]:
stop_words[:5]

array([u'00', u'00 and', u'00 and they', u'00 and they work', u'00 for'], 
      dtype='<U42')

# Most important chars

In [217]:
# L1-penalised model, fit only to find which character n-grams receive a non-zero weight.
# The solver is spelled out because the L1 penalty depends on it: liblinear is the
# solver shown in the fitted model's repr, while newer scikit-learn releases default
# to a solver that rejects penalty="l1".
p = Pipeline([
    ("txt", tf_idf_chars),
    ("clf", LogisticRegression(class_weight="balanced", penalty="l1", solver="liblinear"))
])

In [218]:
p.fit(data.text, data.target)

Pipeline(memory=None,
     steps=[('txt', TfidfVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 5), norm=u'l2', preprocessor=None, smooth_idf=Tru...ty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [219]:
# Boolean mask over the character n-gram vocabulary: True where the L1 model kept a
# non-zero weight. The classifier is looked up through the pipeline's public
# `named_steps` mapping rather than the private `_final_estimator` attribute.
important_chars = p.named_steps["clf"].coef_[0] != 0

In [221]:
# Number of selected character n-grams. Counted by NumPy instead of the Python builtin
# `sum`, which walks the whole vocabulary-sized mask one element at a time.
np.count_nonzero(important_chars)

86

In [222]:
np.array(tf_idf_chars.get_feature_names())[important_chars]

array([u' ! ', u' " ', u' aft', u' afte', u' am', u' an', u' but', u' di',
       u' dis', u' do', u' easy', u' ex', u' fa', u' get', u' ha', u' i ',
       u' lov', u' love', u' no', u' ph', u' pro', u' sc', u' st', u' to',
       u' un', u' use ', u' wou', u' woul', u"'t ", u'ack ', u'all ',
       u'and', u'ard', u'ay ', u'cel', u'cra', u'eas', u'eat', u'eat ',
       u'ell', u'em ', u'erf', u'ery', u'exc', u'has', u'hen', u'hey',
       u'hey ', u'igh', u'ime', u'is ', u'lent', u'lov', u'love', u'mes',
       u'ng ', u'nly ', u'not ', u'nte', u'ome', u'onl', u'ood', u'ort',
       u'ot ', u'ould ', u'ove', u'per', u'pho', u'pri', u'ra ', u'sel',
       u'sta', u'th ', u'the', u'they', u'they ', u'time', u'tiv', u'ton',
       u'ture', u'uld ', u'ust', u'ver', u'wou', u'woul', u'would'], 
      dtype='<U5')

In [226]:
stop_chars = np.array(tf_idf_chars.get_feature_names())[~important_chars]

In [227]:
stop_chars[:5]

array([u' # ', u' $ ', u' % ', u' & ', u' &#'], 
      dtype='<U5')

# Leave only most important words and chars

In [232]:
lr = LogisticRegression(class_weight="balanced")

In [235]:
X_important = scipy.sparse.hstack([chars[:, important_chars], words[:, important_words]])

In [236]:
# NOTE(review): the `important_chars` / `important_words` masks were derived from L1
# models fit on the full labelled training set, so the labels of every validation fold
# already influenced which columns are in `X_important`. This score (and any other CV
# score computed on `X_important`) is likely optimistic — TODO redo the feature
# selection inside each fold before comparing models on it.
cross_val_score(lr, X_important, data.target, cv=cv, n_jobs=-1).mean()

0.7600056531603322

In [237]:
lr.fit(X_important, data.target)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [238]:
# Test matrix restricted to the selected columns, stacked in the same order as
# `X_important`: selected character n-grams first, then selected word n-grams.
test_chars_selected = tf_idf_chars.transform(test.text)[:, important_chars]
test_words_selected = tf_idf_words.transform(test.text)[:, important_words]
X_test_important = scipy.sparse.hstack([test_chars_selected, test_words_selected])

In [239]:
test["y"] = lr.predict(X_test_important)

In [240]:
test.head(2)

Unnamed: 0_level_0,text,y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"so , why the small digital elph , rather than ...",1
1,3/4 way through the first disk we played on it...,0


In [241]:
test[["y"]].to_csv("important-chars-words-lr.csv")


# Leave chars and only most important words

In [232]:
lr = LogisticRegression(class_weight="balanced")

In [242]:
X_important_w = scipy.sparse.hstack([chars, words[:, important_words]])

In [243]:
cross_val_score(lr, X_important_w, data.target, cv=cv, n_jobs=-1).mean()

0.78750067187919925

# Leave words and only most important chars

In [244]:
lr = LogisticRegression(class_weight="balanced")

In [245]:
X_important_ch = scipy.sparse.hstack([chars[:, important_chars], words])

In [246]:
cross_val_score(lr, X_important_ch, data.target, cv=cv, n_jobs=-1).mean()

0.79401070944193408

# Random Forest

In [247]:
from sklearn.ensemble import RandomForestClassifier

In [262]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_samples_leaf=1, random_state=0)

In [263]:
%%time
cross_val_score(rf, X_important, data.target, cv=cv, n_jobs=-1).mean()

CPU times: user 117 ms, sys: 30.3 ms, total: 147 ms
Wall time: 2.32 s


0.75150438127738295

# Neural networks

In [264]:
from sklearn.neural_network import MLPClassifier

In [292]:
mlp = MLPClassifier(random_state=0, hidden_layer_sizes=(10,), max_iter=1000, activation="tanh")

In [293]:
%%time
cross_val_score(mlp, X_important, data.target, cv=cv, n_jobs=-1).mean()

CPU times: user 96.8 ms, sys: 47.7 ms, total: 145 ms
Wall time: 6.21 s


0.7895082031762698

# Summary

The following approaches were tried:

* CountVectorizer
* TfidfVectorizer
* ngrams
* tokenizer yielding both words and punctuation sequences like ":)", ":(", etc.
* combining TfidfVectorizer for words and chars
* filtering out unimportant words and chars using LogisticRegression with l1 penalty
* logistic regression on most important words and chars
* random forest on most important words and chars
* simple neural network on most important words and chars

**Best model**: [Words + Chars](#Words-+-Chars)