In [1]:
# Library
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt

import seaborn as sns
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import chi2, f_classif

from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read files
train = pd.read_csv("../input/shopee-sentiment-analysis/train.csv")
test_df = pd.read_csv("../input/shopee-sentiment-analysis/test.csv")
add_train = pd.read_csv("../input/chineseenglishfasttext/790393_1357444_compressed_test_labelled.csv/test_labelled.csv")

# concat train add_train
train = pd.concat([train, add_train], axis=0)

# shuffle rows
train_df = train.sample(frac=1.).reset_index(drop=True)

train_df.drop('review_id', axis=1, inplace=True)

In [3]:
import emoji  # https://pypi.org/project/emoji/

have_emoji_train_idx = []
have_emoji_test_idx = []

for idx, review in enumerate(train_df['review']):
    if any(char in emoji.UNICODE_EMOJI for char in review):
        have_emoji_train_idx.append(idx)
        
for idx, review in enumerate(test_df['review']):
    if any(char in emoji.UNICODE_EMOJI for char in review):
        have_emoji_test_idx.append(idx)

In [4]:
train_emoji_percentage = round(len(have_emoji_train_idx) / train_df.shape[0] * 100, 2)
print(f'Train data has {len(have_emoji_train_idx)} rows that used emoji, that means {train_emoji_percentage} percent of the total')

test_emoji_percentage = round(len(have_emoji_test_idx) / test_df.shape[0] * 100, 2)
print(f'Test data has {len(have_emoji_test_idx)} rows that used emoji, that means {test_emoji_percentage} percent of the total')

Train data has 28566 rows that used emoji, that means 13.62 percent of the total
Test data has 7582 rows that used emoji, that means 12.55 percent of the total


In [5]:
def emoji_cleaning(text):
    
    # Change emoji to text
    text = emoji.demojize(text).replace(":", " ")
    
    # Delete repeated emoji
    tokenizer = text.split()
    repeated_list = []
    
    for word in tokenizer:
        if word not in repeated_list:
            repeated_list.append(word)
    
    text = ' '.join(text for text in repeated_list)
    text = text.replace("_", " ").replace("-", " ")
    return text

train_df_original = train_df.copy()
test_df_original = test_df.copy()

# emoji_cleaning
train_df.loc[have_emoji_train_idx, 'review'] = train_df.loc[have_emoji_train_idx, 'review'].apply(emoji_cleaning)
test_df.loc[have_emoji_test_idx, 'review'] = test_df.loc[have_emoji_test_idx, 'review'].apply(emoji_cleaning)

In [6]:
def review_cleaning(text):
    
    # lowercase and delete newline
    text = text.lower()
    text = re.sub(r'\n', '', text)
    
    # change emoticon to text
    text = re.sub(r':\(', 'dislike', text)
    text = re.sub(r': \(\(', 'dislike', text)
    text = re.sub(r':, \(', 'dislike', text)
    text = re.sub(r':\)', 'smile', text)
    text = re.sub(r';\)', 'smile', text)
    text = re.sub(r':\)\)\)', 'smile', text)
    text = re.sub(r':\)\)\)\)\)\)', 'smile', text)
    text = re.sub(r'=\)\)\)\)', 'smile', text)
    
    # delete punctuation
    text = re.sub('[^a-z0-9 ]', ' ', text)
    
    tokenizer = text.split()
    
    return ' '.join([text for text in tokenizer])

train_df['review'] = train_df['review'].apply(review_cleaning)
test_df['review'] = test_df['review'].apply(review_cleaning)

In [7]:
repeated_rows_train = []
repeated_rows_test = []

for idx, review in enumerate(train_df['review']):
    if re.match(r'\w*(\w)\1+', review):
        repeated_rows_train.append(idx)
        
for idx, review in enumerate(test_df['review']):
    if re.match(r'\w*(\w)\1+', review):
        repeated_rows_test.append(idx)

In [8]:
def delete_repeated_char(text):
    text = re.sub(r'(\w)\1{2,}', r'\1', text)    
    return text

train_df.loc[repeated_rows_train, 'review'] = train_df.loc[repeated_rows_train, 'review'].apply(delete_repeated_char)
test_df.loc[repeated_rows_test, 'review'] = test_df.loc[repeated_rows_test, 'review'].apply(delete_repeated_char)

In [9]:
def recover_shortened_words(text):
    
    # put \b (boundary) for avoid the characters in the word to be replaced
    # I only make a few examples here, you can add if you're interested :)
    
    text = re.sub(r'\bapaa\b', 'apa', text)
    
    text = re.sub(r'\bbsk\b', 'besok', text)
    text = re.sub(r'\bbrngnya\b', 'barangnya', text)
    text = re.sub(r'\bbrp\b', 'berapa', text)
    text = re.sub(r'\bbgt\b', 'banget', text)
    text = re.sub(r'\bbngt\b', 'banget', text)
    text = re.sub(r'\bgini\b', 'begini', text)
    text = re.sub(r'\bbrg\b', 'barang', text)
    
    text = re.sub(r'\bdtg\b', 'datang', text)
    text = re.sub(r'\bd\b', 'di', text)
    text = re.sub(r'\bsdh\b', 'sudah', text)
    text = re.sub(r'\bdri\b', 'dari', text)
    text = re.sub(r'\bdsni\b', 'disini', text)
    
    text = re.sub(r'\bgk\b', 'gak', text)
    
    text = re.sub(r'\bhrs\b', 'harus', text)
    
    text = re.sub(r'\bjd\b', 'jadi', text)
    text = re.sub(r'\bjg\b', 'juga', text)
    text = re.sub(r'\bjgn\b', 'jangan', text)
    
    text = re.sub(r'\blg\b', 'lagi', text)
    text = re.sub(r'\blgi\b', 'lagi', text)
    text = re.sub(r'\blbh\b', 'lebih', text)
    text = re.sub(r'\blbih\b', 'lebih', text)
    
    text = re.sub(r'\bmksh\b', 'makasih', text)
    text = re.sub(r'\bmna\b', 'mana', text)
    
    text = re.sub(r'\borg\b', 'orang', text)
    
    text = re.sub(r'\bpjg\b', 'panjang', text)
    
    text = re.sub(r'\bka\b', 'kakak', text)
    text = re.sub(r'\bkk\b', 'kakak', text)
    text = re.sub(r'\bklo\b', 'kalau', text)
    text = re.sub(r'\bkmrn\b', 'kemarin', text)
    text = re.sub(r'\bkmrin\b', 'kemarin', text)
    text = re.sub(r'\bknp\b', 'kenapa', text)
    text = re.sub(r'\bkcil\b', 'kecil', text)
    
    text = re.sub(r'\bgmn\b', 'gimana', text)
    text = re.sub(r'\bgmna\b', 'gimana', text)
    
    text = re.sub(r'\btp\b', 'tapi', text)
    text = re.sub(r'\btq\b', 'thanks', text)
    text = re.sub(r'\btks\b', 'thanks', text)
    text = re.sub(r'\btlg\b', 'tolong', text)
    text = re.sub(r'\bgk\b', 'tidak', text)
    text = re.sub(r'\bgak\b', 'tidak', text)
    text = re.sub(r'\bgpp\b', 'tidak apa apa', text)
    text = re.sub(r'\bgapapa\b', 'tidak apa apa', text)
    text = re.sub(r'\bga\b', 'tidak', text)
    text = re.sub(r'\btgl\b', 'tanggal', text)
    text = re.sub(r'\btggl\b', 'tanggal', text)
    text = re.sub(r'\bgamau\b', 'tidak mau', text)
    
    text = re.sub(r'\bsy\b', 'saya', text)
    text = re.sub(r'\bsis\b', 'sister', text)
    text = re.sub(r'\bsdgkan\b', 'sedangkan', text)
    text = re.sub(r'\bmdh2n\b', 'semoga', text)
    text = re.sub(r'\bsmoga\b', 'semoga', text)
    text = re.sub(r'\bsmpai\b', 'sampai', text)
    text = re.sub(r'\bnympe\b', 'sampai', text)
    text = re.sub(r'\bdah\b', 'sudah', text)
    
    text = re.sub(r'\bberkali2\b', 'repeated', text)
    
    text = re.sub(r'\byg\b', 'yang', text)
    
    return text

In [10]:
%%time
train_df['review'] = train_df['review'].apply(recover_shortened_words)

CPU times: user 37.8 s, sys: 13.4 ms, total: 37.9 s
Wall time: 37.9 s


In [11]:
X_train, X_val, y_train, y_val = train_test_split(train['review'], train['rating'], test_size=0.15, stratify=train['rating'], random_state=101 )

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((178269,), (31460,), (178269,), (31460,))

In [17]:
# Evaluation function
def evaluate_metrics(y_val, y_pred):
    acc = accuracy_score(y_val, y_pred)
    f1_macro = f1_score(y_val, y_pred, average='macro')
    f1_micro= f1_score(y_val, y_pred, average='micro')
    
    print("Validation Score")
    print("Accuracy :", acc)
    print("F1 Macro :", f1_macro)
    print("F1 Micro :", f1_micro)
    
def evaluate_proba_metrics(y_val, y_pred):
    auc = roc_auc_score(y_val, y_pred)
    
    print("Validation Score")
    print("ROC AUC :", auc)
    
def training_evaluate(model, X_train, X_val, y_train, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    evaluate_metrics(y_val, y_pred)
    return model, y_pred

# the threshold for prediction is 0.31

In [21]:
1 / train_df.rating.value_counts(normalize=True)
# wgts = {1: 0.100706,
#         2: 0.086540,
#         3: 0.244811,
#         4: 0.285163,
#         5: 0.282779}
# wgts

4     5.260145
5     5.304492
3     6.127172
1    14.894820
2    17.332975
Name: rating, dtype: float64

In [25]:
wgts = {1: 9.929880,
        2: 11.555317,
        3: 4.084781,
        4: 3.506763,
        5: 3.536328}
wgts

{1: 9.92988, 2: 11.555317, 3: 4.084781, 4: 3.506763, 5: 3.536328}

In [29]:
# Baseline model + preprocessed + cleaning
# plain TfIdfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,5))
vectorizer.fit(X_train)

train_vct = vectorizer.transform(X_train)
val_vct = vectorizer.transform(X_val)

# Model
model = LogisticRegression(C=0.9, class_weight=wgts)

# Train and Evaluate
lr, y_pred = training_evaluate(model, train_vct, val_vct, y_train, y_val)
# ngram 1,2 0.538270820089002
# ngram 1,5 0.562555626191989 norm l2
# ngram 1,5 selectkbest 5000 chi2 0.483566433
# ngram 1,5 selectkbest 10000 chi2 0.483566433 -> not works selectkbest
# regularization strength 0.4 0.55
# regularization strength 0.1 not work
# add weight class 1 0.56303
# add weight class 1.5 0.558963
# add weight class 1 C 0.9 0.5685632
# add weight class 1 C 0.95 0.5652574
# add weight class 1 C 0.85 0.564589

Validation Score
Accuracy : 0.5685632549268913
F1 Macro : 0.5800024764978338
F1 Micro : 0.5685632549268913


In [30]:
test_vct = vectorizer.transform(test_df['review'])
y_pred = lr.predict(test_vct)

In [None]:
df_error = pd.concat([y_val, pd.Series(y_pred, index=y_val.index, name='prediction')], axis=1)
pd.pivot_table(df_error, index='prediction', columns='rating', values='rating', aggfunc=len) 
# df_error

In [None]:
pd.pivot_table(df_error, index='rating', columns='prediction', values='rating', aggfunc=len) / df_error.prediction.count()

In [None]:
pd.pivot_table(df_error, index='rating', columns='prediction', values='rating', aggfunc=len) / df_error.prediction.count()

In [None]:
df_y_test.rating.value_counts(normalize=True)

# vanila tfidf & logreg in subset training 
# 5    0.346832
# 4    0.337713
# 3    0.198074
# 1    0.083721
# 2    0.033660

# two steps prediction
# 5    0.350506
# 4    0.328032
# 3    0.204975
# 1    0.081884
# 2    0.034604

# fasttext tuned
# 4    0.419696
# 3    0.263690
# 5    0.185166
# 1    0.079501
# 2    0.051947

# additional train; tfidf ngram1,5; plain logreg
# 5    0.340676
# 4    0.319278
# 3    0.199017
# 1    0.117034
# 2    0.023996

### Prepare Submission

In [31]:
submission = pd.read_csv("../input/shopee-sentiment-analysis/test.csv")
submission.drop('review', axis=1, inplace=True)
submission['rating'] = y_pred
submission.to_csv("submission.csv", index=False)

In [32]:
submission.rating.value_counts(normalize=True)

5    0.328347
4    0.289275
3    0.184222
1    0.143131
2    0.055025
Name: rating, dtype: float64

In [None]:
submission.head()

In [None]:
# vanila tfidf & logreg in subset training -> 0.41
# 5    0.346832
# 4    0.337713
# 3    0.198074
# 1    0.083721
# 2    0.033660

# two steps prediction -> 0.40
# 5    0.350506
# 4    0.328032
# 3    0.204975
# 1    0.081884
# 2    0.034604

# fasttext tuned
# 4    0.419696
# 3    0.263690
# 5    0.185166
# 1    0.079501
# 2    0.051947