In [14]:
import pandas as pd
import numpy as np
import sentencepiece as spm
from sudachipy import tokenizer
from sudachipy import dictionary
import MeCab
import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error

In [2]:
df_train = pd.read_csv('../data/rakuten-sentiment-dataset/full/full_train.csv', header=None)
df_test = pd.read_csv('../data/rakuten-sentiment-dataset/full/full_test.csv', header=None)

In [3]:
df_train.head()

Unnamed: 0,0,1,2
0,2,^^,いいです＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾＾
1,1,,ｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｂｋ
2,1,!!!まいった！,そうりょうむりょうじゃないーーーーーー！やられた！
3,3,////////////////////////////////////////,ｐｐｐ・・・・・・・・・・・・・/////////////////////////////
4,1,...こういうのって、こんなもの？,つぼみがついてましたが、ちいさくてくさってました。もうかいません。


In [4]:
print("Total training samples: ", len(df_train))
print("Total testing samples: ", len(df_test))

Total training samples:  4000000
Total testing samples:  500000


In [7]:
sp = spm.SentencePieceProcessor(model_file='./sp-model-32000-340k.model')

def tokenize(text):
    tokenized = sp.encode(text, out_type=str)
    return tokenized

In [8]:
X_train, y_train = df_train[2], df_train[0]
print("Total train samples: ", len(X_train))

Total train samples:  4000000


In [9]:
tfidfVect = TfidfVectorizer(tokenizer=tokenize)
start = time.time()
X_train_tfidf = tfidfVect.fit_transform(X_train)
end = time.time()
print("TFIDF Vect time (SentencePiece): ", end-start)
pickle.dump(X_train_tfidf, open("tfidf_features.pickle", "wb"))
pickle.dump(tfidfVect, open("tfidf_vectorizer.pickle", "wb"))

TFIDF Vect time (SentencePiece):  301.7839548587799


In [None]:
# tfidfVect = pickle.load(open("tfidf_vectorizer.pickle", "rb"))
# X_train_tfidf = pickle.load(open("tfidf_features.pickle", "rb"))

In [10]:
start = time.time()
clf = LogisticRegression(random_state=0, C=10, penalty='l2', solver='lbfgs').fit(X_train_tfidf, y_train)
end = time.time()
print("Training time (LR-SP): ", end-start)

Training time (LR-SP):  332.0775535106659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
print("Total train samples: ", len(X_train))

predicted = clf.predict(X_train_tfidf)
print("Train Error rate (LR-SP)", (1-np.mean(predicted == y_train))*100)
print(classification_report(predicted, y_train, labels=[1,2,3,4,5]))
print(mean_absolute_error(y_train, predicted))

Total train samples:  4000000
Train Error rate (LR-SP) 46.863049999999994
              precision    recall  f1-score   support

           1       0.69      0.65      0.67    850117
           2       0.48      0.48      0.48    803633
           3       0.41      0.43      0.42    770969
           4       0.37      0.49      0.42    602204
           5       0.71      0.58      0.64    973077

    accuracy                           0.53   4000000
   macro avg       0.53      0.52      0.53   4000000
weighted avg       0.55      0.53      0.54   4000000

0.61495075


In [16]:
X_test, y_test = df_test[2], df_test[0]
print("Total test samples: ", len(X_test))

X_test_tfidf = tfidfVect.transform(X_test)
predicted = clf.predict(X_test_tfidf)
print("Error rate (LR-SP)", (1-np.mean(predicted == y_test))*100)
print(classification_report(predicted, y_test, labels=[1,2,3,4,5]))
print(mean_absolute_error(y_test, predicted))

Total test samples:  500000
Error rate (LR-SP) 47.608
              precision    recall  f1-score   support

           1       0.69      0.65      0.67    106413
           2       0.47      0.47      0.47    100344
           3       0.40      0.42      0.41     96532
           4       0.36      0.48      0.41     74888
           5       0.70      0.58      0.63    121823

    accuracy                           0.52    500000
   macro avg       0.52      0.52      0.52    500000
weighted avg       0.54      0.52      0.53    500000

0.622832
