# Sentiment Analysis for Instagram Comment on @telkomuniversity IG post using RNN

## Import Dependencies

In [141]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')
import re
from re import sub
import multiprocessing

import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

from time import time 
from collections import defaultdict
import emoji
from unidecode import unidecode
from translate import Translator

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.layers import Dropout
from keras.models import model_from_json
from keras.models import load_model

## Read Data

In [65]:
scraped_data = pd.DataFrame()
path = 'Scraper'
BASE_DIR = "D:\\kuliah\\tugas\\EDM\\TEXT MINING"
directory = os.path.join(BASE_DIR,path)
os.chdir(directory)
for root,dirs,files in os.walk(directory):
    for file in files:
        if file.endswith(".csv"):
            f = pd.read_csv(file)
            scraped_data = scraped_data.append(f,ignore_index=True)
display(scraped_data)
os.chdir(BASE_DIR)

Unnamed: 0,id,text,created_at,did_report_as_spam,owner_id,owner_username,owner_is_verified,viewer_has_liked,likes
0,17879474824580432,https://bit.ly/RegistWorkshopTeknisUjianDaring...,1586664958,False,1485219132,fanji_farman,False,False,0
1,17990924800291559,kuota mana kuota,1586666340,False,3607466523,hasfiamr,False,False,0
2,17857930756845161,Subsidi kuota mana,1586669461,False,2707251224,rxvntx,False,False,0
3,17880801838577206,Kuota mana kuota min @telkomuniversity,1586669607,False,1724669055,dhiyulhaqq,False,False,0
4,17860736335828000,Kuota min😂,1586677079,False,1367829296,andi.joo,False,False,0
...,...,...,...,...,...,...,...,...,...
1712,17877210811562582,🔥🔥🔥,1584336186,False,180186848,nafisaagnia,False,False,0
1713,17857687792804193,backsoudnya lo-fi gini,1584336201,False,1162027684,rahmeeen,False,False,4
1714,17854177237939849,min saya mau tny sesuatu udh dm ig mention twi...,1586960336,False,52018555,syafiramhrn,False,False,0
1715,17903878132450528,Ayo ikut hel. We are an international students...,1586988920,False,2118962725,distywisdayani,False,False,0


## Exploratory Data Analysis

In [22]:
scraped_data.sample(15)

Unnamed: 0,id,text,created_at,did_report_as_spam,owner_id,owner_username,owner_is_verified,viewer_has_liked,likes
1501,17862573265728807,"Ciee yg matiin kolom komenannya, 👏",1584053649,False,7459857862,baymaxnduttt,False,False,0
493,17842702643093782,@m_sabrimas hore hore,1585830640,False,2055783526,fthrhman,False,False,0
1621,17882133808523119,Cikoneng kak butuh disemprot jugaa,1584389834,False,1696111188,javier_aditama,False,False,0
288,17886245134526165,❤,1585880184,False,3683040759,fid.ah_,False,False,0
449,17870512807658486,ASEKKKK,1585832100,False,1011984009,raraamrian,False,False,0
558,18136438714028424,"kuliah cuman 2 bulan ga full satu semester, me...",1585829382,False,224319327,evasnaa,False,False,0
336,17861146981799822,@upiofficial,1585843122,False,548823434,dp.shrt_,False,False,0
1537,17857036513832492,Di zoom ngeblur,1584284487,False,1315063640,andrianaji_,False,False,0
119,17867843527676012,🥺🥺🥺🥺,1584993023,False,1498831648,dwiidaayuu,False,False,0
1458,18125470330068323,Yaa kirain lulusan telkom,1584536208,False,4829020597,_echomusic_,False,False,1


In [17]:
scraped_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1717 entries, 0 to 1716
Data columns (total 9 columns):
id                    1717 non-null object
text                  1717 non-null object
created_at            1717 non-null object
did_report_as_spam    1717 non-null object
owner_id              1717 non-null object
owner_username        1717 non-null object
owner_is_verified     1717 non-null object
viewer_has_liked      1717 non-null object
likes                 1717 non-null object
dtypes: object(9)
memory usage: 120.9+ KB


In [19]:
scraped_data.describe(include=['O'])

Unnamed: 0,id,text,created_at,did_report_as_spam,owner_id,owner_username,owner_is_verified,viewer_has_liked,likes
count,1717,1717,1717,1717,1717,1717,1717,1717,1717
unique,1672,1585,1651,1,1386,1386,1,1,15
top,18058922635204086,Alhamdulillah,1585827863,False,9125683,adesatrio,False,False,0
freq,2,13,3,1717,11,11,1717,1717,1294


In [21]:
scraped_data.isna().sum()

id                    0
text                  0
created_at            0
did_report_as_spam    0
owner_id              0
owner_username        0
owner_is_verified     0
viewer_has_liked      0
likes                 0
dtype: int64

Yang dapat disimpulkan :
- Terdapat emoji pada column text (challenge dalam penentuan label)
- User yang sering comment pada post Telkomuniversity adalah adesatrio dengan frekuensi 11 kali
- Ada komentar yang sama dan dipost berulang oleh akun yang berbeda
- Tidak ada missing values, tapi cukup banyak noise pada kolom text

## Cleansing and Slicing

In [94]:
input_data = scraped_data.dropna().reset_index(drop=True).loc[:,'text']
input_data

0       https://bit.ly/RegistWorkshopTeknisUjianDaring...
1                                        kuota mana kuota
2                                      Subsidi kuota mana
3                  Kuota mana kuota min @telkomuniversity
4                                              Kuota min😂
                              ...                        
1712                                                  🔥🔥🔥
1713                               backsoudnya lo-fi gini
1714    min saya mau tny sesuatu udh dm ig mention twi...
1715    Ayo ikut hel. We are an international students...
1716                                 Kuota gemana min? :(
Name: text, Length: 1717, dtype: object

In [86]:
emoji_enc = {}
for idx, row in pd.read_csv('emojis.csv').iterrows():
    emoji_enc[row.emoji] = row.meaning
def replace_emoji(text, emoji_enc):
    SMILEY = {
        ":(" : "sedih",
        ":)" : "bahagia",
        ":/" : "skeptis",
        ":p" : "ejek",
        ":P" : "ejek",
        ":-(" : "sedih",
        ":-)" : "bahagia",
        ":-/" : "skeptis",
        ":-p" : "ejek",
        ":-P" : "ejek",
    }
    SMILEY = {**SMILEY, **emoji_enc}
    words = text.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    hasil = " ".join(reformed)
    
    hasil = emoji.demojize(hasil)
    hasil = hasil.replace(":"," ")
    hasil = " ".join(hasil.split())
    
    return hasil
def remove_punctuations(text):
    return " ".join(re.sub("[\.\,\!\?\:\;\-\=]", " ",text).split()).replace('_', ' ')
def remove_hashtags(text):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", text).split())
def remove_urls(text):
    return ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())
def get_lower(text):
    return text.lower()


In [76]:
replace_emoji("ðŸ˜Ž")

'A_sunglasses'

In [88]:
many_text = input_data.to_list()
many_text = [text.lower() for text in many_text]
many_text = [remove_urls(text) for text in many_text]
many_text = [replace_emoji(text, emoji_enc=emoji_enc) for text in many_text]
many_text = [remove_punctuations(text) for text in many_text]
many_text = [remove_hashtags(text) for text in many_text]

In [158]:
input_data = pd.Series((v for v in many_text))
display(input_data)

0                                     g bsa d buka ya min
1                                        kuota mana kuota
2                                      subsidi kuota mana
3                                    kuota mana kuota min
4                        kuota min face with tears of joy
                              ...                        
1712                                       fire fire fire
1713                               backsoudnya lo fi gini
1714    min saya mau tny sesuatu udh dm ig mention twi...
1715    ayo ikut hel we are an international students ...
1716                               kuota gemana min sedih
Length: 1717, dtype: object

## Preprocessing

### Cluster-based Labeling

In [99]:
def text_to_word_list(text, remove_polish_letters):
    text = remove_polish_letters(text)
    text = str(text)
    text = text.split()
    return text

In [111]:
input_data = pd.Series((v for v in many_text))
input_data = input_data.apply(lambda x: text_to_word_list(x, unidecode))
print(type(input_data))

<class 'pandas.core.series.Series'>


In [115]:
sent = [row for row in input_data]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[509]

['terimakasih', 'bapakk', 'fire', 'red_heart']

In [120]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)
start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.0 mins


In [121]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

Time to train the model: 0.02 mins


In [122]:
w2v_model.save("word2vec.model")

#### K-Means Clustering

In [125]:
word_vectors = Word2Vec.load("word2vec.model").wv

In [126]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)

In [127]:
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('dan', 0.9215435981750488),
 ('untuk', 0.9080561995506287),
 ('tugas', 0.8848137259483337),
 ('banyaknya', 0.8431203365325928),
 ('loudly_crying', 0.8261879682540894),
 ('karena', 0.8209102153778076),
 ('pulang', 0.8135249614715576),
 ('ini', 0.8003915548324585),
 ('bisa', 0.7983108758926392),
 ('saja', 0.7980477213859558)]

In [128]:
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [129]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors.wv[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [130]:
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [131]:
words.head(10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,g,"[-0.06031097, -0.03675199, -0.018923845, -0.02...",0,1,1.035019,1.035019
1,bsa,"[0.027507596, -0.014379414, -0.12620877, 0.071...",1,-1,1.135381,-1.135381
2,d,"[-0.073949814, 0.009203013, 0.0805453, -0.0780...",0,1,1.016141,1.016141
3,buka,"[-0.05532703, -0.02957031, -0.06876149, -0.013...",0,1,1.039991,1.039991
4,ya_min,"[-0.028316436, -0.09971798, 0.055358842, -0.03...",1,-1,1.112345,-1.112345
5,kuota_mana,"[0.0589151, -0.112560496, -0.0071403156, -0.00...",0,1,1.035034,1.035034
6,kuota,"[-0.010729889, -0.034996342, -0.07089649, -0.0...",1,-1,1.572891,-1.572891
7,subsidi_kuota,"[0.02866541, 0.05730002, 0.048568062, -0.04245...",0,1,1.023571,1.023571
8,mana,"[0.019140642, -0.100943975, -0.13423821, -0.02...",1,-1,1.346571,-1.346571
9,min,"[0.061102968, 0.0067842053, -0.013135753, 0.04...",1,-1,1.319433,-1.319433


In [132]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [184]:
final_file = input_data.copy().to_frame(name='text')
final_weighting = final_file.copy()
display(final_file.sample(15))
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

Unnamed: 0,text
1608,telyu
159,punten coba biaya bpp nya dialokasikan buat ku...
1411,langsung ciptakan dlu lgi pintunkemana saja bg...
79,subsidi tak ceritanya
197,yg ta gmn ya klo online saya rasa sulit susah ...
13,kuota nya kapan ya face with rolling eyes
34,
1660,kursi nya juga dong
1079,keren banget kampus gua
838,dipakai belajar ya bukan buat pubg yikyok sama...


In [185]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(final_file.text)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(final_file.text)

In [186]:
transformed

<1717x3131 sparse matrix of type '<class 'numpy.float64'>'
	with 11371 stored elements in Compressed Sparse Row format>

### Tokenization

### Stemming

### Filtering

## Feature Extraction

## Modeling

In [134]:
def create_model_rnn(weight_matrix, max_words, EMBEDDING_DIM):
    # create the model
    model = Sequential()
    model.add(Embedding(len(weight_matrix), EMBEDDING_DIM, weights=[weight_matrix], input_length=max_words, trainable=False))
    model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.50))
    model.add(Dense(10, activation='softmax'))
    # Adam Optimiser
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
    return model

In [135]:
def train_model(model,train_x, train_y, test_x, test_y, val_x, val_y, batch_size):
    # save the best model and early stopping
    saveBestModel = keras.callbacks.ModelCheckpoint('../best_weight_glove_bi_100d.hdf5', monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
    # Fit the model
    model.fit(train_x, train_y, batch_size=batch_size, epochs=25,validation_data=(val_x, val_y), callbacks=[saveBestModel, earlyStopping])
    # Final evaluation of the model
    score, acc = model.evaluate(test_x, test_y, batch_size=batch_size)
    return model

## Prediction