In [None]:
#References
#https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
#https://pasaentuciudad.com.mx/data-to-model-to-api-an-end-to-end-approach/
#https://github.com/The-AI-Summer/Deep-Learning-In-Production/tree/master/2.%20Writing%20Deep%20Learning%20code:%20Best%20Practises

# Install and import the necessary dependencies

In [None]:

from google.colab import drive

#This will prompt for authorization.
drive.mount('/content/drive')

#Create a symbolic link, in order no to be able to save weights on drive. Otherwise it gives an error. 
!ln -s /content/drive/My\ Drive /content/mydrive


#Install requirements
%cd mydrive/SentimentAnalysis
!pip install -r requirements.txt 


import os
import sys
import requests, zipfile, io

import pandas as pd
# import re
# import wordninja, contractions, emoji

import seaborn as sns #visualization.py
import nltk #visualization.py
from nltk.corpus import stopwords #visualization.py

import tensorflow as tf

from sklearn.model_selection import train_test_split

# Project-local modules are not installed as a package. Each source folder is
# pushed to the front of sys.path right before the modules living in it are
# imported, so the plain `import <name>` statements below can resolve.
# The insert/import order is kept as-is because every insert goes to
# position 0 and therefore changes which folder is searched first.

# Utility helpers (configuration handling and tweet preprocessing).
sys.path.insert(0,'/content/mydrive/SentimentAnalysis/src/utils')
import Config, preprocess

# Dataset download / unzip helpers.
sys.path.insert(0,'/content/mydrive/SentimentAnalysis/src/data')
import dataloader

# Model definitions.
sys.path.insert(0,'/content/mydrive/SentimentAnalysis/src/models/')
import BaseModel,LSTM

# Main configuration dictionary used throughout the notebook.
sys.path.insert(0,'/content/mydrive/SentimentAnalysis/configs')
from config_main import CFG


# Load data

In [None]:

# Folder on Drive that holds the unzipped dataset.
input_path = os.path.join('/content/mydrive/SentimentAnalysis/datasets', CFG['data']['name'])

# File name of the archive: the last path component of the download URL.
archive_name = CFG['data']['url'].split('/')[-1]

# Download the archive only when it is not already present in the current
# working directory, then unzip it into the dataset folder.
if not os.path.isfile(archive_name):
  dataloader.download_dataset_from_url(CFG['data']['url'])
dataloader.unzip_data_to_flder(input_path, archive_name)

# Read unzipped data, keeping only column 0 (sentiment) and column 5 (tweet).
train_data_df = pd.read_csv(
    os.path.join(input_path, 'training.1600000.processed.noemoticon.csv'),
    encoding='latin-1',
    usecols=[0, 5],
    names=['sentiment', 'tweet'],
)

# Print dataset info.
# DataFrame.info() writes the summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
train_data_df.info()


# Preprocess data

In [None]:

# Preprocessing tweets data
print("Cleaning and parsing the tweets...\n")

# TO-DO: Remove this debugging subsample and train on the full dataset.
# NOTE(review): the training CSV appears to be ordered by label, in which case
# taking the first rows (`iloc[0:10000]`) yields a single-class subset; confirm
# with the value_counts() printed below. A seeded random sample keeps the
# subset reproducible while covering both classes. sample() also returns a new
# frame, so the column assignments below do not act on a slice of the original.
DEBUG_SAMPLE_SIZE = 10000
train_data_df = (
    train_data_df
    .sample(n=min(DEBUG_SAMPLE_SIZE, len(train_data_df)), random_state=42)
    .reset_index(drop=True)
)

# `datacleaner` is never imported in this notebook (NameError); the module
# imported for tweet cleaning is `preprocess`.
# NOTE(review): assumes `preprocess` exposes `preprocess_tweet` - confirm.
train_data_df['tweet'] = train_data_df['tweet'].apply(preprocess.preprocess_tweet)
print("Finished!\n")

# Preprocessing labels to have classes 0 and 1: the value 4 becomes 1, every
# other value is left unchanged.
train_data_df['sentiment'] = train_data_df['sentiment'].replace(4, 1)

print("Sentiment values: ")
print(train_data_df.sentiment.value_counts())

print(train_data_df.head(5))



# Analyze data
Estimating the vocabulary size and the length of the vector sequence fed to the model for every instance is a crucial step in building a good model.

This is done by analyzing the training dataset: we plot the distribution of tweet lengths across the training data.


In [None]:
def tweet_length(tweet):
  """
  Return the length of a tweet, counted in whitespace-separated tokens.
  Input: tweet(str)
  Output: length(int)
  """
  # str.split() with no argument splits on any run of whitespace and drops
  # empty strings, so an empty or blank tweet has length 0.
  return len(tweet.split())

# Distribution of tweet lengths (in tokens) across the training data.
tweet_lengths = [tweet_length(tweet) for tweet in train_data_df.tweet.tolist()]
# histplot replaces the deprecated distplot; kde=True with a density scale
# keeps the histogram-plus-density-curve view that distplot drew by default.
sns.histplot(tweet_lengths, kde=True, stat="density")

# Tokenise the corpus once and reuse the token list for every count below.
total_words = [token for tweet in train_data_df.tweet for token in tweet.split()]

# Unique words
unique_words = set(total_words)
print("Total Unique Words:", len(unique_words))

# Counting Total Words and Stop Words
nltk.download("stopwords")
# A set makes each membership test O(1) instead of scanning a list per token.
stop_words = set(stopwords.words("english"))
total_stop_words = [token for token in total_words if token in stop_words]
print('Total words', len(total_words))
print('Total stop words', len(total_stop_words))
# Guard against ZeroDivisionError when the data contains no stop words
# (for example an empty frame or tweets already stripped of stop words).
if total_stop_words:
  print('Ratio of total words to total stop words:', len(total_words)/len(total_stop_words))
else:
  print('Ratio of total words to total stop words:', 'n/a (no stop words found)')

# Train model


In [None]:
# Development helper: re-execute the already imported LSTM module so that
# edits made to its source file are picked up without restarting the kernel.
from importlib import reload  # Python 3 only; in Python 2 reload() is a builtin.
# reload() returns the module object; it is bound to `foo` here and `foo` is
# not used by any other cell shown in this notebook.
foo = reload(LSTM)

In [None]:
# Vectorization settings, named instead of being passed as bare numbers.
# NOTE(review): the names are inferred from the recorded model summary
# (TextVectorization output of width 50, embedding with 75001 * 128
# parameters) - confirm against LSTM.data_vectorization's signature.
VOCAB_SIZE = 75000
SEQUENCE_LENGTH = 50

# Build and train the model: split the data, fit the text vectorizer,
# assemble the network, then train it.
model = LSTM.LSTM(CFG)
model.split_training_data(train_data_df)
model.data_vectorization(VOCAB_SIZE, SEQUENCE_LENGTH)
model.build_model()
# Last expression of the cell, so the value returned by train() is displayed.
model.train()

[1]
<keras.layers.preprocessing.text_vectorization.TextVectorization object at 0x7eff3c4d8b10>
Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (TextV  (None, 50)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 50, 128)           9600128   
                                                                 
 LSTM_1 (LSTM)               (None, 50, 256)           394240    
                                                                 
 LSTM_2 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_2 (Dropout)         (

([0.35688239336013794, 3.422956433496438e-05, 2.689753841877973e-07],
 [0.00012949264782946557, 5.164381491340464e-07, 4.3917900427459244e-08])

# Save/Serialize model

In [None]:
# Saving Model
MODEL_DIR = "/content/mydrive/SentimentAnalysis/models"
version = 1
export_path = os.path.join(MODEL_DIR, str(version))
print('export_path = {}\n'.format(export_path))

tf.keras.models.save_model(
    model,
    export_path,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None
)

# Check the path
print('\nSaved model:')
!ls -l {export_path}

# Using SavedModelCLI to check if model is persisted properly
!saved_model_cli show --dir {export_path} --all

In [None]:
# Evaluate the model object created in the training cell, called without
# arguments. Still marked as TO-DO by the author.
model.evaluate() #TO-DO

# Evaluate model

In [None]:
# Loading and Evaluation of Model

# Read the manually labelled test set, keeping column 0 (sentiment) and
# column 5 (tweet), and show the class proportions before any filtering.
test_data_df = pd.read_csv(
    os.path.join(input_path, 'testdata.manual.2009.06.14.csv'),
    encoding='latin-1',
    usecols=[0, 5],
    names=['sentiment', 'tweet'],
)
print(test_data_df.sentiment.value_counts()/test_data_df.shape[0])

# Preprocessing tweets data
print("Cleaning and parsing the tweets...\n")
# .copy() makes the subset an independent frame, so the column assignments
# below do not act on a slice of the original (SettingWithCopyWarning).
test_data_df = test_data_df.iloc[0:1000].copy() #TO-DO: Remove
# The bare name `preprocess_tweet` is not defined in this notebook; the
# cleaning helpers live in the imported `preprocess` module.
# NOTE(review): assumes `preprocess` exposes `preprocess_tweet` - confirm.
test_data_df['tweet'] = test_data_df['tweet'].apply(preprocess.preprocess_tweet)
print("Finished!\n")

# Remove intermediate polarities (sentiment == 2).
test_data_df = test_data_df[test_data_df.sentiment != 2].copy()

# Preprocessing labels to have classes 0 and 1: the value 4 becomes 1.
# (The original cell applied this remap twice and contained a bare
# value_counts expression that produced no output; both were redundant.)
test_data_df['sentiment'] = test_data_df['sentiment'].replace(4, 1)

print(test_data_df.sentiment.value_counts()/test_data_df.shape[0])

# Create data pipeline for test
# FIXME(review): `create_data_pipeline` is neither defined nor imported in the
# cells shown; it must be imported (or module-qualified) before this cell can
# run. Its home module cannot be determined from this notebook.
test_dataset = create_data_pipeline(test_data_df.tweet, test_data_df.sentiment, batch_size=128, is_training=False)

# Load the exported model under its own name so the trained `model` object
# from the training cell is not overwritten.
loaded_model = tf.keras.models.load_model(export_path)
loaded_model.evaluate(test_dataset)