# Sentiment Analysis for Instagram Comment on @telkomuniversity IG post using RNN

## Import Dependencies

In [1]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')
import re
from re import sub
import multiprocessing

import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

from time import time 
from collections import defaultdict
import emoji
from unidecode import unidecode
from string import punctuation

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, SpectralClustering


## Read Data

In [2]:
scraped_data = pd.DataFrame()
path = 'Scraper'
BASE_DIR = "D:\\kuliah\\tugas\\EDM\\TEXT MINING"
directory = os.path.join(BASE_DIR,path)
os.chdir(directory)
for root,dirs,files in os.walk(directory):
    for file in files:
        if file.endswith(".csv"):
            f = pd.read_csv(file)
            scraped_data = scraped_data.append(f,ignore_index=True)
display(scraped_data)
os.chdir(BASE_DIR)

Unnamed: 0,id,text,created_at,did_report_as_spam,owner_id,owner_username,owner_is_verified,viewer_has_liked,likes
0,17879474824580432,https://bit.ly/RegistWorkshopTeknisUjianDaring...,1586664958,False,1485219132,fanji_farman,False,False,0
1,17990924800291559,kuota mana kuota,1586666340,False,3607466523,hasfiamr,False,False,0
2,17857930756845161,Subsidi kuota mana,1586669461,False,2707251224,rxvntx,False,False,0
3,17880801838577206,Kuota mana kuota min @telkomuniversity,1586669607,False,1724669055,dhiyulhaqq,False,False,0
4,17860736335828000,Kuota min😂,1586677079,False,1367829296,andi.joo,False,False,0
...,...,...,...,...,...,...,...,...,...
1712,17877210811562582,🔥🔥🔥,1584336186,False,180186848,nafisaagnia,False,False,0
1713,17857687792804193,backsoudnya lo-fi gini,1584336201,False,1162027684,rahmeeen,False,False,4
1714,17854177237939849,min saya mau tny sesuatu udh dm ig mention twi...,1586960336,False,52018555,syafiramhrn,False,False,0
1715,17903878132450528,Ayo ikut hel. We are an international students...,1586988920,False,2118962725,distywisdayani,False,False,0


## Exploratory Data Analysis

In [3]:
scraped_data = scraped_data.replace('NaN', np.nan)

In [4]:
scraped_data.sample(15)

Unnamed: 0,id,text,created_at,did_report_as_spam,owner_id,owner_username,owner_is_verified,viewer_has_liked,likes
645,18113745121102904,Makin sayang sama bapa ❤️❤️❤️❤️ @kangady,1585828351,False,1556989819,mahesarr,False,False,0
933,18004575091284999,"No nya udah ga aktif, tf aja. saya DM no rek s...",1585925236,False,7915826969,mfikrirmdhan,False,False,0
1630,17869444855635301,Akhirnya selama kurang lebih 3 taun berada di ...,1584489180,False,1529041114,imsoskinnyyyyyyyyyyyyyyyyyyyyy,False,False,0
1014,18113878633101084,Kalau mahasiswa yg masih di bandung kebagian j...,1586178826,False,6117438095,febismee,False,False,0
150,17849157523952205,@haikalaf,1584976445,False,2152997542,monaufal26,False,False,1
1015,17852044048952856,"Min, mahasiswa beserta jajarannya(keluarga) me...",1586187131,False,2075832073,jarhid_,False,False,0
389,18014324485275975,Semoga bisa TA dengan lancar guys @indahnfa_ @...,1585834378,False,884072715,avisagavrilla,False,False,2
965,17890534138484068,Makin lama makin keren kampus ini🔥🔥,1585988992,False,2229289653,shamariass,False,False,0
627,17843758793073798,Perbulan apa gimana nih pak :),1585828615,False,583993275,wwa1810,False,False,0
55,18130588510065992,@nadilandrry sisteuuurrr!!!!,1586925982,False,1539490457,firdha_tami,False,False,0


In [5]:
scraped_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1717 entries, 0 to 1716
Data columns (total 9 columns):
id                    1717 non-null int64
text                  1717 non-null object
created_at            1717 non-null int64
did_report_as_spam    1717 non-null bool
owner_id              1717 non-null int64
owner_username        1717 non-null object
owner_is_verified     1717 non-null bool
viewer_has_liked      1717 non-null bool
likes                 1717 non-null int64
dtypes: bool(3), int64(4), object(2)
memory usage: 85.6+ KB


In [6]:
scraped_data.describe(include=['O'])

Unnamed: 0,text,owner_username
count,1717,1717
unique,1585,1386
top,Alhamdulillah,adesatrio
freq,13,11


In [7]:
scraped_data.isna().sum()

id                    0
text                  0
created_at            0
did_report_as_spam    0
owner_id              0
owner_username        0
owner_is_verified     0
viewer_has_liked      0
likes                 0
dtype: int64

Yang dapat disimpulkan :
- Terdapat emoji pada column text (challenge dalam penentuan label)
- User yang sering comment pada post Telkomuniversity adalah adesatrio dengan frekuensi 11 kali
- Ada komentar yang sama dan dipost berulang oleh akun yang berbeda
- Tidak ada missing values, tapi cukup banyak noise pada kolom text

## Cleansing and Slicing

In [8]:
input_data = scraped_data.dropna().reset_index(drop=True).loc[:,'text']
input_data.to_csv('scraped_all_data.csv')

In [9]:
emoji_enc = {}
for idx, row in pd.read_csv('emojis.csv').iterrows():
    emoji_enc[row.emoji] = row.meaning
def replace_emoji(text, emoji_enc):
    SMILEY = {
        ":(" : "sedih",
        ":)" : "bahagia",
        ":/" : "skeptis",
        ":p" : "ejek",
        ":P" : "ejek",
        ":-(" : "sedih",
        ":-)" : "bahagia",
        ":-/" : "skeptis",
        ":-p" : "ejek",
        ":-P" : "ejek",
    }
    SMILEY = {**SMILEY, **emoji_enc}
    words = text.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    hasil = " ".join(reformed)
    
    hasil = emoji.demojize(hasil)
    hasil = hasil.replace(":"," ")
    hasil = " ".join(hasil.split())
    
    return hasil
def remove_punctuations(text):
    return " ".join(re.sub("[\.\,\!\?\:\;\-\=]", " ",text).split())
def remove_hashtags(text):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", text).split())
def remove_urls(text):
    return ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())
def get_lower(text):
    return text.lower()


In [10]:
many_text = input_data.to_list()
many_text = [remove_urls(text) for text in many_text]
many_text = [text.lower() for text in many_text]
many_text = [replace_emoji(text, emoji_enc=emoji_enc) for text in many_text]
many_text = [remove_punctuations(text) for text in many_text]
many_text = [remove_hashtags(text) for text in many_text]
many_text = [text.replace('_', " ") for text in many_text]

In [11]:
input_data = pd.Series((v for v in many_text)).to_frame(name='text')
input_data = input_data.dropna()
input_data

Unnamed: 0,text
0,g bsa d buka ya min
1,kuota mana kuota
2,subsidi kuota mana
3,kuota mana kuota min
4,kuota min face with tears of joy
...,...
1712,fire fire fire
1713,backsoudnya lo fi gini
1714,min saya mau tny sesuatu udh dm ig mention twi...
1715,ayo ikut hel we are an international students ...


## Preprocessing

### Cluster-based Labeling

In [12]:
def text_to_word_list(text):
    text = str(text)
    text = text.split()
    return text

In [13]:
input_data.text = input_data.text.apply(lambda x: text_to_word_list(x))
word_list = input_data.copy()

In [14]:
sent = [row for row in word_list.text]
sentences = []
for row in sent:
    sentence = []
    for word in row:
        sentence.append(word)
    sentences.append(sentence)
sentences[0]

['g', 'bsa', 'd', 'buka', 'ya', 'min']

In [15]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)
start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.0 mins


In [16]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

Time to train the model: 0.01 mins


In [17]:
w2v_model.save("word2vec.model")

In [18]:
file_export = word_list.copy()
file_export['old_comment'] = file_export.text
file_export.old_comment = file_export.old_comment.str.join(' ')
file_export.text = file_export.text.str.join(' ')
file_export['rate'] = np.zeros(len(file_export.text))

In [19]:
file_export[['text', 'rate']].to_csv('cleaned_dataset.csv', index=False)

####  Clustering

In [20]:
word_vectors = Word2Vec.load("word2vec.model").wv

In [21]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)

In [22]:
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=20, restrict_vocab=None)

[('dan', 0.9997967481613159),
 ('yg', 0.9997634887695312),
 ('yang', 0.9997410178184509),
 ('saya', 0.9997267723083496),
 ('untuk', 0.9997119903564453),
 ('ke', 0.999695897102356),
 ('baik', 0.9996850490570068),
 ('kami', 0.9996784925460815),
 ('ini', 0.9996772408485413),
 ('kita', 0.9996473789215088),
 ('tugas', 0.9996228218078613),
 ('di', 0.9996005296707153),
 ('temen', 0.9995882511138916),
 ('mau', 0.9995766878128052),
 ('luar', 0.9995729923248291),
 ('lebih', 0.9995472431182861),
 ('pak', 0.999531626701355),
 ('mereka', 0.9994966983795166),
 ('face', 0.9994845390319824),
 ('kondisi', 0.9994844198226929)]

In [23]:
negative_cluster_center = model.cluster_centers_[0]
positive_cluster_center = model.cluster_centers_[1]

In [24]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors.wv[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [25]:
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [26]:
words.loc[words['cluster']==1]

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
482,skripsi,"[0.040842466, -0.08048584, 0.08156676, 0.07119...",1,-1,1.046402,-1.046402
522,nihh,"[-0.038431678, -0.07205526, 0.063154854, -0.07...",1,-1,1.047172,-1.047172
569,mabar,"[-0.051131383, -0.04649969, -0.04437185, 0.107...",1,-1,1.047215,-1.047215
580,gw,"[0.028070616, 0.046312366, 0.06287358, 0.01202...",1,-1,1.129936,-1.129936
584,ah,"[-0.058870878, 0.004142443, -0.017719058, 0.06...",1,-1,1.001101,-1.001101
611,beating,"[0.063533224, 0.0043178145, 0.08682576, 0.0215...",1,-1,1.117033,-1.117033
612,linkaja,"[-0.0775758, 0.024287991, -0.014779454, 0.0290...",1,-1,1.100714,-1.100714
628,kerenn,"[-0.077212155, -0.08251313, -0.046586927, 0.00...",1,-1,1.004586,-1.004586
644,alhamdulillahhh,"[-0.04113323, 0.0138619635, 0.061763275, -0.00...",1,-1,1.109582,-1.109582
645,bang,"[0.021993967, 0.09041176, -0.04497371, 0.04844...",1,-1,1.022358,-1.022358


In [27]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [28]:
final_file = pd.read_csv('cleaned_dataset.csv')
final_file = final_file.dropna(subset=['text'])
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

file_weighting = final_file.copy()
display(file_weighting.text.iloc[2])

'subsidi kuota mana'

In [29]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [30]:
replaced_closeness_scores = file_weighting.text.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))
display(replaced_closeness_scores)

0       [2.436371903676394, 3.999863239625364, 4.61563...
1       [20.400002087964094, 7.75550262385769, 20.4000...
2       [16.424541165590494, 20.400002087964094, 7.755...
3       [20.400002087964094, 7.75550262385769, 20.4000...
4       [20.400002087964094, 16.077804316996474, 23.94...
                              ...                        
1712    [7.496338617675973, 7.496338617675973, 7.49633...
1713          [0, 2.59987800119642, 0, 7.179746413221072]
1714    [16.077804316996474, 28.038398044701818, 25.28...
1715    [1.8794406205241505, 10.751109273674242, 0, 6....
1716    [20.400002087964094, 0, 16.077804316996474, 10...
Name: text, Length: 1636, dtype: object

In [31]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, file_weighting.text, file_weighting.rate]).T
replacement_df.columns = ['sentiment_coeff', 'sentence', 'sentiment']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.average(np.array(x.loc['sentiment_coeff'])), axis=1)
display(replacement_df.head(5))

Unnamed: 0,sentiment_coeff,sentence,sentiment,sentiment_rate
0,"[2.436371903676394, 3.999863239625364, 4.61563...",g bsa d buka ya min,0,8.9473
1,"[20.400002087964094, 7.75550262385769, 20.4000...",kuota mana kuota,0,16.185169
2,"[16.424541165590494, 20.400002087964094, 7.755...",subsidi kuota mana,0,14.860015
3,"[20.400002087964094, 7.75550262385769, 20.4000...",kuota mana kuota min,0,16.158328
4,"[20.400002087964094, 16.077804316996474, 23.94...",kuota min face with tears of joy,0,17.930971


In [32]:
replacement_df['sentiment'] = replacement_df.apply(lambda x: 1 if x.loc['sentiment_rate']>0 else 0, axis=1)

replacement_df[['sentence', 'sentiment']]

Unnamed: 0,sentence,sentiment
0,g bsa d buka ya min,1
1,kuota mana kuota,1
2,subsidi kuota mana,1
3,kuota mana kuota min,1
4,kuota min face with tears of joy,1
...,...,...
1712,fire fire fire,1
1713,backsoudnya lo fi gini,1
1714,min saya mau tny sesuatu udh dm ig mention twi...,1
1715,ayo ikut hel we are an international students ...,1


In [33]:
replacement_df.to_csv('labeled_data.csv', index=False)

### Modeling

In [34]:
cleaned_data = pd.read_csv('labeled_data.csv')
comments = cleaned_data['sentence']
comments = comments.tolist()
all_word = []
for comment in comments:
    for word in comment.split():
        all_word.append(word)

In [35]:
from collections import Counter
counts = Counter(all_word)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab,1)}
print(vocab_to_int)

{'face': 1, 'heart': 2, 'pak': 3, 'with': 4, 'hands': 5, 'kuota': 6, 'di': 7, 'fire': 8, 'red': 9, 'ya': 10, 'smiling': 11, 'min': 12, 'eyes': 13, 'folded': 14, 'ga': 15, 'yg': 16, 'ini': 17, 'ada': 18, 'crying': 19, 'bisa': 20, 'up': 21, 'nya': 22, 'loudly': 23, 'thumbs': 24, 'saya': 25, 'alhamdulillah': 26, 'dan': 27, 'aja': 28, 'juga': 29, 'yang': 30, 'dong': 31, 'telkom': 32, 'buat': 33, 'nih': 34, 'mau': 35, 'untuk': 36, 'subsidi': 37, 'gimana': 38, 'mana': 39, 'kalo': 40, 'semoga': 41, 'bpp': 42, 'mantap': 43, 'of': 44, 'ke': 45, 'online': 46, 'tears': 47, 'joy': 48, 'skin': 49, 'tone': 50, 'u': 51, 'light': 52, 'kita': 53, 'jadi': 54, 'gak': 55, 'kampus': 56, 'kuliah': 57, 'mahasiswa': 58, 'pake': 59, 'keren': 60, 'tugas': 61, 'kasih': 62, 'atau': 63, 'punten': 64, 'kan': 65, 'sedih': 66, 'apa': 67, 'itu': 68, 'udah': 69, 'terima': 70, 'dari': 71, 'the': 72, 'semua': 73, 'rektor': 74, 'kapan': 75, 'jangan': 76, 'clapping': 77, 'wifi': 78, 'sama': 79, 'lagi': 80, 'kak': 81, 'teri

In [36]:
comments_ints =[]
for comment in comments:
    comments_ints.append([vocab_to_int[word] for word in comment.split()])
print(comments_ints)

[[445, 446, 447, 180, 10, 12], [6, 39, 6], [37, 6, 39], [6, 39, 6, 12], [6, 12, 1, 4, 47, 44, 48], [39, 6, 12, 1263, 1264, 1265, 127], [64, 6], [64, 6], [6, 39, 6, 17, 194, 308, 365, 119, 309, 89, 97, 72, 98, 99], [310, 180, 1266, 152, 574, 69, 366, 448, 311, 260], [64, 1267], [37, 6], [6, 748], [6, 22, 75, 10, 1, 4, 89, 13], [64, 6], [6, 39, 10, 6, 109], [17, 110, 6, 15, 66], [6, 39, 308, 365, 119], [6, 39, 1268, 120, 749, 750, 158, 23, 19, 1], [6, 205, 205, 205, 205, 205, 205, 205, 205], [25, 751, 749, 6, 1269, 206, 752, 207, 17, 261], [64, 6], [312, 575, 753, 6, 31], [6], [6, 22, 576, 22, 38, 12], [64, 6], [3, 144, 69, 90, 39], [754, 159, 78, 367, 1270, 449, 145], [39, 6, 39, 577, 577, 76, 90, 17, 1271, 153, 195, 368, 1, 4, 114, 313, 208, 89, 97, 72, 98, 99, 209, 755, 79, 450, 136, 451, 1272, 314, 120, 234, 452, 27, 314, 756, 453, 32, 209, 32, 22, 757, 315, 65, 316, 1273, 368, 89, 97, 72, 98, 99], [64, 6], [64, 6], [1274, 6, 39, 6, 369, 6, 311, 317, 210, 1275, 758, 1276], [759, 3, 6

In [37]:
# stats about vocabulary
print('Unique words: ', len((vocab_to_int)))
print()

# print tokens in first review
print('Tokenized comment: \n', comments_ints[:1])

Unique words:  3132

Tokenized comment: 
 [[445, 446, 447, 180, 10, 12]]


In [38]:
labels_split = cleaned_data['sentiment'].tolist()
labels_split = np.array(labels_split)

In [39]:
# outlier review stats
comment_lens = Counter([len(x) for x in comments_ints])
print("Zero-length reviews: {}".format(comments_ints[0]))
print("Maximum review length: {}".format(max(comment_lens)))

Zero-length reviews: [445, 446, 447, 180, 10, 12]
Maximum review length: 138


In [40]:
print('Number of reviews before removing outliers: ', len(comments_ints))

## remove any reviews/labels with zero length from the comments_ints list.

# get indices of any reviews with length 0
non_zero_idx = [ii for ii, comment in enumerate(comments_ints) if len(comment) != 0]

# remove 0-length comment and their labels
comments_ints = [comments_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([labels_split[ii] for ii in non_zero_idx])

print('Number of reviews after removing outliers: ', len(comments_ints))

Number of reviews before removing outliers:  1636
Number of reviews after removing outliers:  1636


In [41]:
def pad_features(comments_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(comments_ints), seq_length), dtype=int)

    # for each review, I grab that comment and 
    for i, row in enumerate(comments_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [42]:
# Test your implementation!

seq_length = 200

features = pad_features(comments_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(comments_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

print(features[:,:])

[[   0    0    0 ...  180   10   12]
 [   0    0    0 ...    6   39    6]
 [   0    0    0 ...   37    6   39]
 ...
 [   0    0    0 ...  190   39   10]
 [   0    0    0 ... 3130 3131 1254]
 [   0    0    0 ... 3132   12   66]]


In [43]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(1308, 200) 
Validation set: 	(164, 200) 
Test set: 		(164, 200)


In [44]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True,batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)

In [45]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   0,    0,    0,  ...,  484,   46,   12],
        [   0,    0,    0,  ...,   62,  107,   74],
        [   0,    0,    0,  ...,    0,    0, 2533],
        ...,
        [   0,    0,    0,  ...,   52,   49,   50],
        [   0,    0,    0,  ...,   57,  240,  285],
        [   0,    0,    0,  ...,    0,  551,  552]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1], dtype=torch.int32)


In [46]:
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [47]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [48]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(3133, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [49]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [50]:
# training params

epochs = 50 

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history

        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 4/50... Step: 100... Loss: 0.004061... Val Loss: 0.168573
Epoch: 8/50... Step: 200... Loss: 0.000886... Val Loss: 0.265066
Epoch: 12/50... Step: 300... Loss: 0.000328... Val Loss: 0.243918
Epoch: 16/50... Step: 400... Loss: 0.000164... Val Loss: 0.301165
Epoch: 20/50... Step: 500... Loss: 0.000062... Val Loss: 0.361260
Epoch: 24/50... Step: 600... Loss: 0.000067... Val Loss: 0.295799
Epoch: 27/50... Step: 700... Loss: 0.000049... Val Loss: 0.333004
Epoch: 31/50... Step: 800... Loss: 0.000031... Val Loss: 0.336037
Epoch: 35/50... Step: 900... Loss: 0.000025... Val Loss: 0.400979
Epoch: 39/50... Step: 1000... Loss: 0.000023... Val Loss: 0.401865
Epoch: 43/50... Step: 1100... Loss: 0.000024... Val Loss: 0.315961
Epoch: 47/50... Step: 1200... Loss: 0.000021... Val Loss: 0.333191
Epoch: 50/50... Step: 1300... Loss: 0.000021... Val Loss: 0.335061


In [51]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.361
Test accuracy: 0.872


In [52]:
test_comment_pos = 'Bagus pak subsidi nya'

In [53]:
from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])

    return test_ints

# test code and generate tokenized review
test_ints = tokenize_review(test_comment_pos)
print(test_ints)

[[533, 3, 37, 22]]


In [54]:
# test sequence padding
seq_length=200
features = pad_features(test_ints, seq_length)

print(features)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 533   3
   37  22]]


In [55]:
# test conversion to tensor and pass into your model
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

torch.Size([1, 200])


In [56]:
def predict(net, test_review, sequence_length=200):
    
    net.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    # get the output from the model
    output, h = net(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response
    if(pred.item()==1):
        print("Positive comment detected!")
    else:
        print("Negative comment detected.")

In [57]:
# call function
seq_length=200 # good to use the length that was trained on

predict(net, test_comment_pos, seq_length)

Prediction value, pre-rounding: 0.999998
Positive comment detected!
