In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTS

In [98]:
import re
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 200)
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.layers import *
from keras.callbacks import *

# LOADING FILE

In [3]:
df = pd.read_csv('drive/MyDrive/Sentiment-Detection/train.csv',encoding='latin-1')

In [4]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# TEXT CLEANING

In [62]:
def cleaner(text):
  text = re.sub("https\S+ ", " ", text)
  text = BeautifulSoup(text).get_text()
  text = re.sub("[^a-zA-Z]", " ", text)
  text = text.lower()
  tokens = text.split()
  return " ".join(tokens)

In [63]:
df['cleaned_text'] = df['tweet'].apply(cleaner)

In [64]:
df.head()

Unnamed: 0,id,label,tweet,cleaned_text
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beauti...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks to m...
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememories un...
3,4,0,I'm wired I know I'm George I was made that wa...,i m wired i know i m george i was made that wa...
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple won t even talk to ...


# TRAIN TEST SPLIT

In [68]:
x_tr,x_val,y_tr,y_val=train_test_split(df["cleaned_text"], df["label"], test_size=0.2, random_state=0,shuffle=True)

# TEXT REPRESTATION

In [69]:
x_tokenizer = Tokenizer()

In [70]:
x_tokenizer.fit_on_texts(x_tr)

In [71]:
x_tokenizer.word_index

{'iphone': 1,
 'http': 2,
 'com': 3,
 'apple': 4,
 'p': 5,
 'i': 6,
 'my': 7,
 'instagram': 8,
 'the': 9,
 'to': 10,
 'a': 11,
 'samsung': 12,
 'it': 13,
 'and': 14,
 's': 15,
 'for': 16,
 'new': 17,
 'twitter': 18,
 'me': 19,
 'you': 20,
 'phone': 21,
 'is': 22,
 'am': 23,
 'sony': 24,
 'instagr': 25,
 'follow': 26,
 'in': 27,
 't': 28,
 'on': 29,
 'of': 30,
 'this': 31,
 'pic': 32,
 'with': 33,
 'https': 34,
 'ipad': 35,
 'like': 36,
 'so': 37,
 'www': 38,
 'love': 39,
 'just': 40,
 'at': 41,
 'have': 42,
 'ios': 43,
 'android': 44,
 'life': 45,
 'm': 46,
 'rt': 47,
 'now': 48,
 'that': 49,
 'all': 50,
 'your': 51,
 'day': 52,
 'an': 53,
 'can': 54,
 'ly': 55,
 'not': 56,
 'photo': 57,
 'cute': 58,
 'gain': 59,
 'get': 60,
 'galaxy': 61,
 'today': 62,
 'case': 63,
 'back': 64,
 'be': 65,
 'photography': 66,
 'news': 67,
 'got': 68,
 'fun': 69,
 'from': 70,
 'music': 71,
 'd': 72,
 'app': 73,
 'bit': 74,
 'instagood': 75,
 'out': 76,
 'happy': 77,
 'time': 78,
 'who': 79,
 'beautiful'

In [72]:
len(x_tokenizer.word_index)

18964

In [73]:
# Keep words with freq > 3
thresh = 3

cnt=0
for key,value in x_tokenizer.word_counts.items():
  if value>=thresh:
    cnt=cnt+1

print(cnt)

3824


In [74]:
x_tokenizer = Tokenizer(num_words=cnt,oov_token='unk')
x_tokenizer.fit_on_texts(x_tr)

In [75]:
max_len = 100

x_tr_seq = x_tokenizer.texts_to_sequences(x_tr) 
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

x_tr_seq = pad_sequences(x_tr_seq,  padding='post', maxlen=max_len)
x_val_seq = pad_sequences(x_val_seq, padding='post', maxlen=max_len)

In [76]:
x_voc_size = x_tokenizer.num_words + 1
x_voc_size

3825

In [77]:
x_tr_seq[0]

array([ 105,   98,  790,   78,   34,   14,  633, 2351,    1, 1260,    5,
         35,    9,    4,    6, 1975,    1,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

# MODEL BUILDING

In [84]:
model = Sequential()
model.add(Embedding(x_voc_size, 50, input_shape=(max_len,), mask_zero=True))
model.add(SimpleRNN(128,activation='relu'))
model.add(Dense(128,activation='relu')) 
model.add(Dense(1,activation='sigmoid'))

In [85]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           191250    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               22912     
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 230,803
Trainable params: 230,803
Non-trainable params: 0
_________________________________________________________________


In [86]:
model.compile(optimizer='adam',loss='binary_crossentropy')

In [87]:
mc = ModelCheckpoint("weights.best.hdf5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [88]:
model.fit(x_tr_seq, y_tr, batch_size=128, epochs=10, verbose=1, validation_data=(x_val_seq, y_val), callbacks=[mc])

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.32992, saving model to weights.best.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 0.32992 to 0.27620, saving model to weights.best.hdf5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.27620
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.27620
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.27620
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.27620
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.27620
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.27620
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.27620
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.27620


<tensorflow.python.keras.callbacks.History at 0x7fb40cf41978>

In [89]:
model.load_weights("weights.best.hdf5")

In [90]:
pred_prob = model.predict(x_val_seq)

In [91]:
pred_prob[0]

array([0.00255732], dtype=float32)

In [94]:
#define candidate threshold values
threshold  = np.arange(0,0.5,0.01)
threshold

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49])

In [95]:
# convert probabilities into classes or tags based on a threshold value
def classify(pred_prob,thresh):
  y_pred = []

  for i in pred_prob:
    if i>=thresh:
      y_pred.append(1)
    else:
      y_pred.append(0)

  return y_pred

In [100]:
score=[]

for thresh in threshold:
    y_pred = classify(pred_prob,thresh) 
    score.append(metrics.f1_score(y_val,y_pred))

In [101]:
opt = threshold[score.index(max(score))]
opt

0.44

# MODEL EVALUATION

In [102]:
y_pred = classify(pred_prob,opt)

In [103]:
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      1178
           1       0.72      0.90      0.80       406

    accuracy                           0.89      1584
   macro avg       0.84      0.89      0.86      1584
weighted avg       0.90      0.89      0.89      1584



# PREDICT INDIVIDUAL

In [133]:
def predict_tag(comment):  

  text = [cleaner(comment)]

  seq = x_tokenizer.texts_to_sequences(text)

  pad_seq = pad_sequences(seq,  padding='post', maxlen=max_len)

  pred_prob = model.predict(pad_seq)[0]
  classes = classify(pred_prob,opt)[0]

  if classes == 0:
    return "HAPPY"
  else:
    return "UNHAPPY"

In [134]:
predict_tag("what a disgusting service")

'UNHAPPY'

# TEST DATA

In [135]:
df_test = pd.read_csv('drive/MyDrive/Sentiment-Detection/test.csv',encoding='latin-1')

In [136]:
df_test.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/
2,7923,"I'd like to puts some CD-ROMS on my iPad, is that possible?' â Yes, but wouldn't that block the screen?\n"
3,7924,"My ipod is officially dead. I lost all my pictures and videos from the 1D and 5sos concert,and from Vet Camp #hatinglife #sobbing"
4,7925,Been fighting iTunes all night! I only want the music I $&@*# paid for


In [139]:
df_test["cleaned_text"] = df_test["tweet"].apply(cleaner)

In [141]:
seq = x_tokenizer.texts_to_sequences(df_test["cleaned_text"])

In [143]:
pad = pad_sequences(seq,  padding='post', maxlen=max_len)

In [145]:
pred_prob = model.predict(pad)

In [147]:
classes = classify(pred_prob,opt)

# SAVE DATA

In [152]:
submit = pd.DataFrame()

In [153]:
submit["id"] = df_test.id

In [154]:
submit['label'] = classes

In [156]:
submit.set_index('id')

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
7921,1
7922,0
7923,1
7924,1
7925,1
...,...
9869,0
9870,0
9871,1
9872,1


In [158]:
submit.to_csv('submit.csv', index=False)