In [1]:
!pip install kaggle



In [2]:
from IPython.display import clear_output
!pip install --upgrade kaggle
clear_output()
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 67 bytes


In [3]:
!kaggle datasets download -d kazanova/sentiment140

Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:04<00:00, 23.5MB/s]
100% 80.9M/80.9M [00:04<00:00, 17.9MB/s]


In [4]:
import json
import tensorflow as tf
import csv
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm

In [5]:
!unzip -qx /content/sentiment140.zip

In [6]:
!head -5 /content/training.1600000.processed.noemoticon.csv

"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","_TheSpecialOne_","@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"
"0","1467810672","Mon Apr 06 22:19:49 PDT 2009","NO_QUERY","scotthamilton","is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"
"0","1467810917","Mon Apr 06 22:19:53 PDT 2009","NO_QUERY","mattycus","@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds"
"0","1467811184","Mon Apr 06 22:19:57 PDT 2009","NO_QUERY","ElleCTF","my whole body feels itchy and like its on fire "
"0","1467811193","Mon Apr 06 22:19:57 PDT 2009","NO_QUERY","Karoli","@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "


In [7]:
# Names for the six fields of each CSV record, in file order. They are passed
# to pandas explicitly when the file is read.
columns = [
    'target',
    'ids',
    'date',
    'flag',
    'user',
    'text',
]

In [8]:
# Location of the extracted Sentiment140 training file.
data_path = '/content/training.1600000.processed.noemoticon.csv'

# Column names are supplied via `names`, so the first line of the file is read
# as data rather than as a header. The file is decoded as latin-1.
df = pd.read_csv(
    data_path,
    header=None,
    names=columns,
    encoding='latin-1',
)

In [9]:
# Display the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
# Display the tweet text column (pandas truncates the repr automatically).
df.loc[:, 'text']

0          @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          is upset that he can't update his Facebook by ...
2          @Kenichan I dived many times for the ball. Man...
3            my whole body feels itchy and like its on fire 
4          @nationwideclass no, it's not behaving at all....
                                 ...                        
1599995    Just woke up. Having no school is the best fee...
1599996    TheWDB.com - Very cool to hear old Walt interv...
1599997    Are you ready for your MoJo Makeover? Ask me f...
1599998    Happy 38th Birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @theNSPCC @SparksCharity...
Name: text, Length: 1600000, dtype: object

In [11]:
# Raw tweet texts, one string per row.
sentences = df['text'].tolist()

# Recode the label value 4 to 1 so the target is binary (0 / 1).
# The assignment is restricted to the `target` column: assigning to
# `df[mask]` would overwrite every column of the matching rows
# (ids, date, flag, user, text) with 1.
df.loc[df['target'] == 4, 'target'] = 1

labels = df['target'].tolist()
samples_size = len(labels)
print(samples_size)

1600000


In [12]:
# Text-encoding hyper-parameters shared by the tokenizer, the padding step
# and the embedding layers.
vocab_size = 100_000   # passed as `num_words` / embedding `input_dim`
max_length = 32        # 32 words in each sentence
oov_token = '<OOV>'    # token handed to the tokenizer for out-of-vocabulary words

In [13]:
# Import from `tensorflow.keras`, the same namespace the notebook uses for
# `pad_sequences` and the layers, instead of the standalone `keras` package.
from tensorflow.keras.preprocessing.text import Tokenizer

# `num_words` caps the vocabulary used when texts are converted to sequences;
# `oov_token` is the placeholder for words outside that vocabulary.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

# Build the word index from the full tweet corpus.
tokenizer.fit_on_texts(sentences)

In [14]:
# Lookup tables exposed by the fitted tokenizer: word -> id and id -> word.
word_index, index_word = tokenizer.word_index, tokenizer.index_word

In [15]:
# Spot-check: show the integer id the tokenizer assigned to one word.
word_index['father']

2148

In [16]:
# Encode every tweet as a list of integer word ids (one list per sentence).
sequences = tokenizer.texts_to_sequences(sentences)

In [17]:
# Sanity check: print one raw tweet followed by its encoded id sequence.
i = 0
print(sentences[i], sequences[i], sep='\n')

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
[20683, 40, 148, 57, 1, 474, 145, 5, 1222, 8, 3660, 49, 829, 10317, 13, 1956, 31, 3, 42, 10, 386]


In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every sequence to exactly `max_length` ids: shorter ones are
# zero-padded at the end, longer ones are cut at the end.
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [19]:
# Inspect the first padded sequence (trailing zeros are the padding).
padded[0]

array([20683,    40,   148,    57,     1,   474,   145,     5,  1222,
           8,  3660,    49,   829, 10317,    13,  1956,    31,     3,
          42,    10,   386,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    labels,
    test_size=0.2,
    random_state=101,
)

# `labels` was a plain Python list, so convert the label splits to NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

In [21]:
# Confirm the sizes of the feature and label splits, one shape per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(1280000, 32)
(320000, 32)
(1280000,)
(320000,)


## LSTM Model

In [22]:
# Take both the model class and the layers from `tensorflow.keras`; mixing the
# standalone `keras` package with `tensorflow.keras` layers is avoided.
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential

embedding_dim = 100  # output size of the embedding layer

# Embedding -> LSTM(64) -> Dense(64, relu) -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the 0/1 labels and the sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 100)           10000000  
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 10,046,465
Trainable params: 10,046,465
Non-trainable params: 0
_________________________________________________________________


In [24]:
num_epochs = 5

# `validation_data` is given as a tuple (x_val, y_val), the form documented
# for `Model.fit`.
# NOTE(review): the held-out test split doubles as validation data here, so
# the per-epoch validation metrics are not independent of the later
# `model.evaluate(X_test, y_test)` result.
history = model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the trained model on the held-out split. With the compile settings
# used in this notebook the result is [loss, accuracy].
model.evaluate(x=X_test, y=y_test)



[0.4285949170589447, 0.8193968534469604]

In [None]:
# Two hand-written probes: one clearly negative, one clearly positive.
text1, text2 = (
    "I hate my life and you and everybody else, just kill me",
    "I love everything, I could not be happier",
)

In [None]:
# Push both probe sentences through the same tokenize -> pad steps used for
# the training data, then print the model's sigmoid output for each.
seq1 = tokenizer.texts_to_sequences([text1])
seq2 = tokenizer.texts_to_sequences([text2])

pad_kwargs = dict(maxlen=max_length, padding='post', truncating='post')
pad1 = pad_sequences(seq1, **pad_kwargs)
pad2 = pad_sequences(seq2, **pad_kwargs)

for probe in (pad1, pad2):
  print(model.predict(probe))

[[0.02769829]]
[[0.9891567]]


In [27]:
# Take both the model class and the layers from `tensorflow.keras`; mixing the
# standalone `keras` package with `tensorflow.keras` layers is avoided.
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN
from tensorflow.keras.models import Sequential

embedding_dim = 100  # output size of the embedding layer

# Same architecture as the LSTM model, with the recurrent layer swapped for
# a SimpleRNN so the two can be compared.
# NOTE: this rebinds `model` (and `history` below), so the trained LSTM is no
# longer reachable through those names after this cell runs.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    SimpleRNN(64),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# A single epoch is trained for the comparison.
num_epochs = 1

# `validation_data` is given as a tuple (x_val, y_val), the form documented
# for `Model.fit`.
history = model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    verbose=1,
)



The differences between SimpleRNN and LSTM in training time, accuracy, and loss are already clear after the first epoch.