In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
pip install emot

In [None]:
import emoji
from emot.emo_unicode import EMOTICONS

import re

from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns

# Explore Data

In [None]:
# Load the train and test CSV files of the Shopee sentiment dataset into DataFrames.
df_train = pd.read_csv('/kaggle/input/shopee-sentiment-analysis/train.csv')
df_test = pd.read_csv('/kaggle/input/shopee-sentiment-analysis/test.csv')

In [None]:
# Display the training DataFrame as the cell output.
df_train

In [None]:
# Display the test DataFrame as the cell output.
df_test

# Convert Emoji and Emoticon to Word

> Convert Emoji and Emoticon

In [None]:
def _emoji_table():
    """Return the emoji lookup table of whichever `emoji` release is installed.

    The package changed its public table across releases:
    `EMOJI_DATA` (newer releases), `UNICODE_EMOJI` nested per language
    (`{'en': {...}, ...}`), or a flat `UNICODE_EMOJI` keyed by emoji.
    All three are keyed by the emoji string itself.
    """
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is not None:
        return table
    table = emoji.UNICODE_EMOJI
    # A per-language layout has an 'en' entry; a flat table is keyed by emoji only.
    return table['en'] if 'en' in table else table


def count_text_have_emoji(text_list):
    """Count how many texts contain at least one emoji character.

    Args:
        text_list: iterable of strings (e.g. a pandas Series of reviews).

    Returns:
        int: number of texts in which at least one single character is a key
        of the emoji table. Texts are inspected one character at a time, so
        only emoji that are a single code point are recognised.
    """
    table = _emoji_table()
    return sum(1 for review in text_list
               if any(char in table for char in review))

def clean_repeated_word(text):
    """Remove every repeated word from `text`, keeping first occurrences.

    The text is split on whitespace; a word is dropped if it already appeared
    anywhere earlier in the text (not only when it is adjacent). The surviving
    words keep their original order and are joined with single spaces.

    Args:
        text: input string.

    Returns:
        str: the de-duplicated text ('' for empty or whitespace-only input).
    """
    # dict.fromkeys keeps insertion order and de-duplicates in linear time,
    # replacing the quadratic "word not in list" scan.
    return ' '.join(dict.fromkeys(text.split()))

def convert_emoji(text):
    """Rewrite emoji in `text` as plain words.

    Steps: demojize the text, turn every ':' into a space, drop repeated
    words with `clean_repeated_word`, then turn '_' and '-' into spaces.

    Args:
        text: input string.

    Returns:
        str: the converted text.
    """
    demojized = emoji.demojize(text)
    without_colons = demojized.replace(':', ' ')
    # De-duplication happens while names are still joined by '_'/'-',
    # i.e. a whole emoji name counts as one word.
    deduplicated = clean_repeated_word(without_colons)
    return deduplicated.replace('_', ' ').replace('-', ' ')

def convert_emoticon(text):
    """Replace emoticons in `text` with their textual description.

    Every key of `EMOTICONS` is used as a regular-expression pattern (it is
    inserted into the pattern unescaped); each match is replaced by the
    corresponding description with commas removed and whitespace normalised.
    Afterwards repeated words are dropped and '_' / '-' become spaces.

    Args:
        text: input string.

    Returns:
        str: the converted text.
    """
    for pattern, meaning in EMOTICONS.items():
        description = " ".join(meaning.replace(",", "").split())
        text = re.sub(u'(' + pattern + ')', description, text)

    deduplicated = clean_repeated_word(text)
    return deduplicated.replace('_', ' ').replace('-', ' ')

In [None]:
# Report how many reviews contain emoji, convert the emoji to words in both
# DataFrames, then report the counts again.
print('--Before cleaning--')
train_emoji_before = count_text_have_emoji(df_train['review'])
print('Count df_train with emoji = {}'.format(train_emoji_before))
test_emoji_before = count_text_have_emoji(df_test['review'])
print('Count df_test with emoji = {}'.format(test_emoji_before))

df_train['review'] = df_train['review'].map(convert_emoji)
df_test['review'] = df_test['review'].map(convert_emoji)

print('\n--After cleaning--')
train_emoji_after = count_text_have_emoji(df_train['review'])
print('Count df_train with emoji = {}'.format(train_emoji_after))
test_emoji_after = count_text_have_emoji(df_test['review'])
print('Count df_test with emoji = {}'.format(test_emoji_after))

In [None]:
# NOTE: both convert_emoticon calls below are commented out, so this cell only
# prints the two status messages and leaves df_train / df_test unchanged.
print('Cleaning emoticon in train and test...')

#df_train['review'] = [convert_emoticon(text) for text in df_train['review']]
#df_test['review'] = [convert_emoticon(text) for text in df_test['review']]

print('Finished...')

# Clean Text

In [None]:
def clean_text(text):
    """Normalise `text` to lowercase ASCII letters and digits separated by single spaces.

    Args:
        text: input string.

    Returns:
        str: lowercased text in which non-ASCII characters are removed, every
        remaining character outside [a-z0-9] acts as a separator, and
        whitespace is collapsed and trimmed.
    """
    text = text.lower()
    # Non-ASCII characters are dropped outright (no space is left behind).
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Anything that is not a lowercase letter or a digit becomes a space.
    text = re.sub('[^a-z0-9]', ' ', text)
    # split() + join collapses whitespace runs and strips both ends.
    return ' '.join(text.split())

def clean_repeated_char(text):
    """Collapse elongated letters: a run of three or more identical letters becomes one.

    Only letters are collapsed. Digits and underscores are left alone so that
    numbers such as "10000" are not rewritten to "10".

    Args:
        text: input string.

    Returns:
        str: text with each run of >= 3 identical letters reduced to a single
        letter (runs of two are kept as they are).
    """
    # [^\W\d_] is "a word character that is neither a digit nor '_'", i.e. a letter.
    return re.sub(r'([^\W\d_])\1{2,}', r'\1', text)

# Abbreviation / slang word -> expansion. Only whole words are replaced.
# 'gk' maps straight to 'tidak': the previous rule chain rewrote it to 'gak'
# and then 'gak' to 'tidak', so the end result is the same.
_SHORTENED_WORDS = {
    'apaa': 'apa',
    'bsk': 'besok',
    'brngnya': 'barangnya',
    'brp': 'berapa',
    'bgt': 'banget',
    'bngt': 'banget',
    'gini': 'begini',
    'brg': 'barang',
    'dtg': 'datang',
    'd': 'di',
    'sdh': 'sudah',
    'dri': 'dari',
    'dsni': 'disini',
    'gk': 'tidak',
    'hrs': 'harus',
    'jd': 'jadi',
    'jg': 'juga',
    'jgn': 'jangan',
    'lg': 'lagi',
    'lgi': 'lagi',
    'lbh': 'lebih',
    'lbih': 'lebih',
    'mksh': 'makasih',
    'mna': 'mana',
    'org': 'orang',
    'pjg': 'panjang',
    'ka': 'kakak',
    'kk': 'kakak',
    'klo': 'kalau',
    'kmrn': 'kemarin',
    'kmrin': 'kemarin',
    'knp': 'kenapa',
    'kcil': 'kecil',
    'gmn': 'gimana',
    'gmna': 'gimana',
    'tp': 'tapi',
    'tq': 'thanks',
    'tks': 'thanks',
    'tlg': 'tolong',
    'gak': 'tidak',
    'gpp': 'tidak apa apa',
    'gapapa': 'tidak apa apa',
    'ga': 'tidak',
    'tgl': 'tanggal',
    'tggl': 'tanggal',
    'gamau': 'tidak mau',
    'sy': 'saya',
    'sis': 'sister',
    'sdgkan': 'sedangkan',
    'mdh2n': 'semoga',
    'smoga': 'semoga',
    'smpai': 'sampai',
    'nympe': 'sampai',
    'dah': 'sudah',
    'berkali2': 'repeated',
    'yg': 'yang',
}

# One alternation bounded by \b on both sides: since every key consists only of
# word characters, a match is always a complete word.
_SHORTENED_WORD_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in _SHORTENED_WORDS) + r')\b'
)


def clean_shortened_word(text):
    """Expand known abbreviations and slang words in `text`.

    Each whole word listed in `_SHORTENED_WORDS` is replaced by its expansion
    in a single pass over the text; all other text is left untouched.
    Matching is case-sensitive (the table holds lowercase keys).

    Args:
        text: input string.

    Returns:
        str: text with the known short forms expanded.
    """
    return _SHORTENED_WORD_RE.sub(
        lambda match: _SHORTENED_WORDS[match.group(0)], text)

In [None]:
print('Cleaning text, repeated char and shortened words...')

# Apply the three cleaning passes in order, each one to both DataFrames
# before the next pass starts.
for cleaning_step in (clean_text, clean_repeated_char, clean_shortened_word):
    df_train['review'] = df_train['review'].map(cleaning_step)
    df_test['review'] = df_test['review'].map(cleaning_step)

print('Finished...')

# Analyse and Visualize Data

In [None]:
# Select the training reviews rated 1, 3 and 5 ...
rating_1 = df_train.loc[df_train['rating'] == 1, 'review']
rating_3 = df_train.loc[df_train['rating'] == 3, 'review']
rating_5 = df_train.loc[df_train['rating'] == 5, 'review']

# ... and concatenate each group into one space-separated string.
rating_1_text = ' '.join(rating_1)
rating_3_text = ' '.join(rating_3)
rating_5_text = ' '.join(rating_5)

In [None]:
# Word cloud (at most 80 words) built from rating_1_text.
rating_1_wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='magma',
    max_words=80,
).generate(rating_1_text)

plt.imshow(rating_1_wordcloud, interpolation='bilinear')
plt.axis('off')  # the image needs no axes
plt.show()

In [None]:
# Word cloud (at most 80 words) built from rating_3_text.
rating_3_wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='magma',
    max_words=80,
).generate(rating_3_text)

plt.imshow(rating_3_wordcloud, interpolation='bilinear')
plt.axis('off')  # the image needs no axes
plt.show()

In [None]:
# Word cloud (at most 80 words) built from rating_5_text.
rating_5_wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='magma',
    max_words=80,
).generate(rating_5_text)

plt.imshow(rating_5_wordcloud, interpolation='bilinear')
plt.axis('off')  # the image needs no axes
plt.show()

In [None]:
# Bar chart of the number of training reviews per rating, with each bar
# annotated by its share of all training rows.
# The column is passed as `x=`: newer seaborn releases interpret the first
# positional argument as `data`, so a positional Series no longer selects
# the variable to count.
ax = sns.countplot(x=df_train['rating'])

total_reviews = df_train.shape[0]
for val in ax.patches:
    pct = '{:.2f}%'.format(100 * val.get_height() / total_reviews)
    # Anchor the label at the top centre of the bar, then shift it 12 points up.
    xpos = val.get_x() + val.get_width() / 2.
    ypos = val.get_height()
    ax.annotate(pct, (xpos, ypos), ha='center', va='center', fontsize=14, xytext=(0, 12), textcoords='offset points')

plt.title('Rating comparison', fontsize=24, pad=15)
plt.xlabel('rating', labelpad=18)
plt.tight_layout()
plt.show()

# Train Model and Predict

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

# Record the TensorFlow version in the notebook output.
print('Using Tensorflow version:', tf.__version__)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize `texts` into a fixed-width array of input ids.

    Uses the padding/truncation keyword arguments of transformers >= 3.0
    (`padding`, `truncation`, `return_attention_mask`) instead of the
    deprecated `pad_to_max_length` / `return_attention_masks` spelling.

    Args:
        texts: iterable of strings (list, tuple, numpy array, pandas Series).
        tokenizer: object exposing `batch_encode_plus`, e.g. a Hugging Face
            tokenizer.
        maxlen: every sequence is padded or truncated to exactly this length.

    Returns:
        np.ndarray: the `input_ids` returned by the tokenizer, one row per text.
    """
    enc_di = tokenizer.batch_encode_plus(
        # The tokenizer expects a list/tuple, not a numpy array.
        list(texts),
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',
        # Explicit truncation keeps every row at `maxlen` so the result is rectangular.
        truncation=True,
        max_length=maxlen)

    return np.array(enc_di['input_ids'])

def build_model(transformer, max_len=512):
    """Build and compile a 5-class classifier on top of `transformer`.

    Args:
        transformer: callable model applied to the token-id input; the first
            element of its output is indexed as (batch, position, features).
        max_len: length of the token-id sequences fed to the model.

    Returns:
        A compiled Keras `Model` mapping `input_word_ids` to a 5-way softmax,
        trained with categorical cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Only the vector at the first sequence position feeds the classifier head.
    cls_token = sequence_output[:, 0, :]
    out = Dense(5, activation='softmax')(cls_token) # 5 ratings to predict

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU can be resolved,
# otherwise TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Training configuration.
AUTO = tf.data.experimental.AUTOTUNE  # passed to Dataset.prefetch below

EPOCHS = 4
# 16 examples per replica, scaled by the number of replicas in the strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Pretrained checkpoint name used for both the tokenizer and the model.
MODEL = 'jplu/tf-xlm-roberta-large'

In [None]:
# Ratings 1..5 <-> zero-based class indices 0..4.
rating_mapper_encode = {rating: rating - 1 for rating in range(1, 6)}

# Inverse mapping, used to turn predicted class indices back into ratings.
rating_mapper_decode = {
    class_index: rating for rating, class_index in rating_mapper_encode.items()
}

# NOTE: this overwrites the column in place, so running the cell a second time
# maps the already-encoded values again (0 has no entry and becomes NaN).
df_train['rating'] = df_train['rating'].map(rating_mapper_encode)

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the (already 0..4 encoded) ratings into 5 columns.
train_labels = to_categorical(df_train['rating'], num_classes=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training reviews for validation, stratified on the
# labels, with a fixed random_state so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(df_train['review'],
                                                  train_labels,
                                                  stratify=train_labels,
                                                  test_size=0.1,
                                                  random_state=2020)

# Show the shapes of the four splits as the cell output.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Sequence length used for tokenization and for the model input layer.
MAX_LEN = 192

# Replace the raw review texts by arrays of token ids (X_train / X_val are
# overwritten; the test reviews are encoded into X_test).
X_train = regular_encode(X_train.values, tokenizer, maxlen=MAX_LEN)
X_val = regular_encode(X_val.values, tokenizer, maxlen=MAX_LEN)
X_test = regular_encode(df_test['review'].values, tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: repeats indefinitely (model.fit therefore needs
# steps_per_epoch), shuffles with a 1024-element buffer, batches and prefetches.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, batched, cached after batching, prefetched.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_val, y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: inputs only (no labels), batched in the original row order.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test)
    .batch(BATCH_SIZE)
)

In [None]:
%%time

# Load the pretrained transformer and build the classifier inside the
# strategy scope, then print the layer summary.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# Number of full batches in the training data; used as the epoch length
# because train_dataset repeats forever.
n_steps = X_train.shape[0] // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:
# Plot the training and validation loss per epoch.
plt.style.use('fivethirtyeight')

training_loss = train_history.history['loss']
# 'val_loss' is computed on the validation data passed to model.fit,
# not on the test set, so it is named and labelled accordingly.
validation_loss = train_history.history['val_loss']

epoch_count = range(1, len(training_loss) + 1)

plt.plot(epoch_count, training_loss, 'r--')
plt.plot(epoch_count, validation_loss, 'b-')
plt.legend(['Training Loss', 'Validation Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# Predict on the test batches; verbose=1 shows a progress bar.
pred = model.predict(test_dataset, verbose=1)

In [None]:
# Persist the raw prediction array (np.save appends the .npy extension).
np.save('xlm-roberta', pred)

In [None]:
# Index of the highest-scoring class per row (0..4, still encoded).
pred_sentiment = np.argmax(pred, axis=1)

print(pred_sentiment)

# Submission

In [None]:
# Pair each test review_id with its predicted (encoded) class index.
submission = pd.DataFrame({'review_id': df_test['review_id'],
                           'rating': pred_sentiment})

In [None]:
# Convert class indices 0..4 back to ratings 1..5.
submission['rating'] = submission['rating'].map(rating_mapper_decode)

# Write the submission file without the DataFrame index column.
submission.to_csv('submission_last.csv', index=False)