# SENTIMENT ANALYSIS
Tugas Azhar Rafiq

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import LabelEncoder
from keras.optimizers import RMSprop

from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.layers import Flatten, GlobalMaxPooling1D

## READING DATASET

In [6]:
# Display configuration: show up to 120 characters per cell and never
# truncate the number of printed rows.
# The full option key is used on purpose: abbreviated keys such as
# 'display.max_row' only work through pandas' pattern matching on option names.
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.max_rows', None)

# Only the label column and the text column are read from the CSV.
cols_to_load = ['sentimen', 'tweet']
texttoclean = pd.read_csv('./Dataset/tweet.csv', usecols=cols_to_load)

# Explicit [] selection avoids any clash between column names and
# DataFrame attributes.
sentiments = texttoclean['sentimen']
tweets = texttoclean['tweet']

## PREPROCESSING

In [7]:
def clean_tag(isiteks):
    """Strip the '@' marker from every text in a Series.

    Only the '@' character itself is deleted; the user name that follows
    it stays in the text.

    Parameters
    ----------
    isiteks : pandas.Series
        Series of tweet texts.

    Returns
    -------
    pandas.Series
        The same texts with every '@' removed.
    """
    at_sign = '@'
    # '@' is matched literally; it has no special meaning as a pattern.
    return isiteks.str.replace(at_sign, '', regex=False)

def remove_anything(isivar):
    """Remove URLs, picture links, hashtags and punctuation, then lowercase.

    The removals run in a fixed order, because the final step deletes the
    '/', '.', ':' and '#' characters that the earlier patterns rely on.

    Parameters
    ----------
    isivar : pandas.Series
        Series of tweet texts.

    Returns
    -------
    pandas.Series
        Cleaned, lower-cased texts on the same index. Removed fragments are
        replaced by the empty string, so runs of spaces may remain. Missing
        values are passed through unchanged instead of raising.
    """
    patterns = (
        # URLs. The dot after "www" is escaped so that ordinary words that
        # merely start with "www" are not deleted.
        r'http\S+|www\.\S+',
        # Twitter picture links (dots escaped to match literally).
        r'pic\.twitter\.com/\S+',
        # Hashtags, including the tag text itself.
        r'#\S+',
        # Everything that is NOT an ASCII letter, a digit or a space.
        r'[^a-zA-Z0-9 ]',
    )

    cleaned = isivar
    for pattern in patterns:
        cleaned = cleaned.str.replace(pattern, '', regex=True)

    # Return the texts in lowercase.
    return cleaned.str.lower()

# Two-stage cleaning: strip the '@' markers first, then everything else.
tweetsproc1 = clean_tag(tweets)
tweetsproc2 = remove_anything(tweetsproc1)

# Sanity check: summary statistics followed by the first cleaned tweets.
print(tweetsproc2.describe(), tweetsproc2.head(), sep='\n')

count                                                                                                                        1815
unique                                                                                                                       1814
top       the great closing statement from the next indonesian leaders prabowo  sandiuno  kami berkomitmen berdua untuk tidak ...
freq                                                                                                                            2
Name: tweet, dtype: object
0            kata prabowo indonesia tidak dihargai bangsa asing   berita ini  pasti hoax buatan penguasa ya kan rockygerung 
1                                                       batuan langka tasbih jokowi hadiah dari habib luthfi seharga mercy  
2                                                                           di era jokowi ekonomi indonesia semakin baik    
3    bagi sumatera selatan asian games berdampak pd ekonomi langsung diprediks

## Stopwords

In [8]:
def filter_short_words(isivar):
    """Keep only the short, purely alphabetic words of each text.

    A word is kept when it has at most 3 characters and consists of
    letters only; every other word is dropped. The result is useful for
    inspecting short words in the corpus.

    Parameters
    ----------
    isivar : pandas.Series
        Series of texts; each text is split on whitespace.

    Returns
    -------
    pandas.Series
        For each text, the kept words joined by single spaces (an empty
        string when no word qualifies).
    """
    def _short_words_only(text):
        kept = []
        for token in text.split():
            if token.isalpha() and len(token) <= 3:
                kept.append(token)
        return ' '.join(kept)

    return isivar.apply(_short_words_only)

# Collect the short words of every cleaned tweet for inspection.
kata_pendek = filter_short_words(tweetsproc2)

print(kata_pendek)
# info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
kata_pendek.info()

0                                                                                   ini ya kan
1                                                                                             
2                                                                                       di era
3                                                                                           pd
4                                                                                   itu itu ya
5                                                         yg pak cm di dr hal yg dgn bhs yg tp
6                                                                                             
7                                                                                       di tps
8                                                  iya aa kan ke yg bs yg sm mas tmn yg plg yg
9                                                                                          tak
10                                                

## MERGE SENTIMEN AND TWEET COLUMNS

In [9]:
# Put the labels next to the cleaned texts (aligned on the shared index),
# then keep only the first occurrence of each distinct cleaned tweet.
tweets_merged = pd.concat([sentiments, tweetsproc2], axis=1)
tweets_clean = tweets_merged.drop_duplicates(subset='tweet')
print(tweets_clean.head())
# info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
tweets_clean.info()

  sentimen  \
0  negatif   
1   netral   
2   netral   
3  positif   
4  negatif   

                                                                                                                     tweet  
0          kata prabowo indonesia tidak dihargai bangsa asing   berita ini  pasti hoax buatan penguasa ya kan rockygerung   
1                                                     batuan langka tasbih jokowi hadiah dari habib luthfi seharga mercy    
2                                                                         di era jokowi ekonomi indonesia semakin baik      
3  bagi sumatera selatan asian games berdampak pd ekonomi langsung diprediksi mencapai 185 triliun indonesia maju jokow...  
4  negara kita ngutang buat bngun infrastruktur yang udah dipake masyarakat terus masyarakatnya ngeluh karena negara ng...  
<class 'pandas.core.frame.DataFrame'>
Index: 1814 entries, 0 to 1814
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    -----------

In [10]:
# Fixed seed so the train/test split is identical on every run.
SEED = 42

df = tweets_clean
x = df.tweet
y = df.sentimen

# Map the sentiment strings to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Column-vector form of the labels, shape (n_samples, 1).
y = y_encoded.reshape(-1,1)

# 70/30 split. Stratifying on the 1-D encoded labels keeps the class
# proportions the same in the train and test parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.3, random_state=SEED, stratify=y_encoded
)



In [11]:
# Convert the training texts to padded integer sequences.
# max_words is passed to the Tokenizer as num_words and is also the input
# dimension of the Embedding layer in RNN(); max_len is the maxlen handed to
# pad_sequences and the input length of the model.
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
# The tokenizer is fitted on the training texts only; X_test is not used here.
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = pad_sequences(sequences,maxlen=max_len)

In [12]:
# define RNN
def RNN():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 50, input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [13]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding (Embedding)       (None, 150, 50)           50000     
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation (Activation)     (None, 256)               0         
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257   