# Import Packages

In [9]:
import pandas as pd
import numpy as np
import sentencepiece as spm
from sudachipy import tokenizer
from sudachipy import dictionary
import MeCab
import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Rakuten Binary Classification

Train sample: 3.400.000  
Test sample: 400.000  
Tokenizer: SentencePiece 32k model, trained on random 340.000 samples from the Training set  
Vectorizer: TFIDF  
Classifier: Logistics Regression, C=10, solver=lbfgs, penalty=l2

In [2]:
df_train = pd.read_csv('../data/rakuten-sentiment-dataset/binary/binary_train.csv', header=None)
df_test = pd.read_csv('../data/rakuten-sentiment-dataset/binary/binary_test.csv', header=None)

In [3]:
df_train.head()

Unnamed: 0,0,1,2
0,1,^^,いいです＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾
1,1,×,××××××××××××××××××××××××××××××××××××××××××××××...
2,1,,ｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｋ
3,2,-,いいいいいいいいいいいいいいいいいいいいいいいいいいいいいいぞ
4,1,!!!まいった！,そうりょうむりょうじゃないーーーーーー！やられた！


In [4]:
print("Total training samples: ", len(df_train))
print("Total testing samples: ", len(df_test))

Total training samples:  3400000
Total testing samples:  400000


In [5]:
sp = spm.SentencePieceProcessor(model_file='./sp-model-32000-340k.model')

def tokenize(text):
    tokenized = sp.encode(text, out_type=str)
    return tokenized

In [6]:
X_train, y_train = df_train[2], df_train[0]
print("Total train samples: ", len(X_train))

Total train samples:  3400000


## TFIDF Vectorizer

In [8]:
tfidfVect = TfidfVectorizer(tokenizer=tokenize)

In [10]:
start = time.time()
X_train_tfidf = tfidfVect.fit_transform(X_train)
end = time.time()
print("TFIDF Vect time (SentencePiece): ", end-start)

TFIDF Vect time (SentencePiece):  223.09809064865112


In [None]:
pickle.dump(X_train_tfidf, open("tfidf_features.pickle", "wb"))
pickle.dump(tfidfVect, open("tfidf_vectorizer.pickle", "wb"))

## Load Vectorizer from Pickle

In [7]:
tfidfVect = pickle.load(open("tfidf_vectorizer.pickle", "rb"))
X_train_tfidf = pickle.load(open("tfidf_features.pickle", "rb"))

## Train and Evaluate LR Model

In [8]:
start = time.time()
clf = LogisticRegression(random_state=0, C=10, penalty='l2', solver='lbfgs').fit(X_train_tfidf, y_train)
end = time.time()
print("Training time (LR-SP): ", end-start)

Training time (LR-SP):  80.27098107337952


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Some tips to try (out of many) that might help the algorithm to converge are:
- Increase the Number of Iterations: 
- Try Different Optimizer https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-defintions/52388406#52388406
- Scale Your Data https://scikit-learn.org/stable/modules/preprocessing.html
- Add Engineered Features https://machinelearningmastery.com/discover-feature-engineering-how-to-engineer-features-and-how-to-get-good-at-it/
- Data Pre-processing 
    - https://datascience.stackexchange.com/questions/80421/very-low-cross-val-score-for-regression-with-big-corr-between-feature-and-res/80422#80422
    - https://towardsdatascience.com/feature-engineering-for-machine-learning-3a5e293a5114
- Add More Data https://www.quora.com/How-do-you-determine-sample-size-for-machine-learning-classification/answer/Yahya-Almardeny

In [17]:
print("Total train samples: ", len(X_train))

predicted = clf.predict(X_train_tfidf)
print("Train Error rate (LR-SP)", (1-np.mean(predicted == y_train))*100)
print(classification_report(predicted, y_train, labels=[1,2]))

Total train samples:  3400000
Train Error rate (LR-SP) 6.7895882352941195
              precision    recall  f1-score   support

           1       0.93      0.93      0.93   1696788
           2       0.93      0.93      0.93   1703212

    accuracy                           0.93   3400000
   macro avg       0.93      0.93      0.93   3400000
weighted avg       0.93      0.93      0.93   3400000



In [14]:
X_test, y_test = df_test[2], df_test[0]
print("Total test samples: ", len(X_test))

X_test_tfidf = tfidfVect.transform(X_test)
predicted = clf.predichttp://34.66.19.11:5000/notebooks/research/rakuten-binary-classification/rakuten-japanese-binary-classification.ipynb#t(X_test_tfidf)
print("Error rate (LR-SP)", (1-np.mean(predicted == y_test))*100)
print(classification_report(predicted, y_test, labels=[1,2]))

Total test samples:  400000
Error rate (LR-SP) 7.065750000000004
              precision    recall  f1-score   support

           1       0.93      0.93      0.93    199665
           2       0.93      0.93      0.93    200335

    accuracy                           0.93    400000
   macro avg       0.93      0.93      0.93    400000
weighted avg       0.93      0.93      0.93    400000

