In [1]:
#References
#https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
#https://pasaentuciudad.com.mx/data-to-model-to-api-an-end-to-end-approach/

# Install and import the necessary dependencies

In [2]:
#Dependencies
!pip install wordninja contractions emoji

import os
import requests, zipfile, io
from google.colab import drive

import pandas as pd
import re

import wordninja, contractions, emoji #preprocessing.py

import seaborn as sns #visualization.py
import nltk #visualization.py
from nltk.corpus import stopwords #visualization.py

import tensorflow as tf

from sklearn.model_selection import train_test_split


#This will prompt for authorization.
drive.mount('/content/drive')

#Create a symbolic link, in order no to be able to save weights on drive. Otherwise it gives an error. 
!ln -s /content/drive/My\ Drive /content/mydrive


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ln: failed to create symbolic link '/content/mydrive/My Drive': Input/output error


In [3]:
#Configuration file

# dataset_name = 'Sentiment140'
# input_path = os.path.join('/content/mydrive', dataset_name)
# dataset_url = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'


# Load data

In [16]:
#Download and unzip dataset
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
!mkdir '/content/mydrive/SentimentAnalysis'
!mkdir '/content/mydrive/SentimentAnalysis/Input'
!unzip trainingandtestdata.zip -d '/content/mydrive/SentimentAnalysis/Input'

input_path = '/content/mydrive/SentimentAnalysis/Input'

#Read unzipped data
train_data_df = pd.read_csv(os.path.join(input_path, 'training.1600000.processed.noemoticon.csv'),  encoding='latin-1', usecols=[0,5], names=['sentiment','tweet'])

#Print dataset info
print(train_data_df.info())  


URL transformed to HTTPS due to an HSTS policy
--2022-04-03 18:42:17--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip.4’


2022-04-03 18:42:19 (47.1 MB/s) - ‘trainingandtestdata.zip.4’ saved [81363704/81363704]

mkdir: cannot create directory ‘/content/mydrive/SentimentAnalysis’: File exists
mkdir: cannot create directory ‘/content/mydrive/SentimentAnalysis/Input’: File exists
Archive:  trainingandtestdata.zip
replace /content/mydrive/SentimentAnalysis/Input/testdata.manual.2009.06.14.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: <class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------   

# Preprocess data

In [None]:
def strip_emoji(tweet):
  """Return `tweet` with every emoji removed and surrounding whitespace stripped.

  Args:
    tweet: Text to clean.

  Returns:
    The cleaned string.
  """
  if hasattr(emoji, "replace_emoji"):
    # Newer releases of the emoji package no longer provide get_emoji_regexp();
    # replace_emoji() is the supported way to remove emoji there.
    new_tweet = emoji.replace_emoji(tweet, replace="")
  else:
    # Older releases: fall back to the regular expression shipped with the package.
    new_tweet = re.sub(emoji.get_emoji_regexp(), r"", tweet)
  return new_tweet.strip()

def strip_urls(tweet):
  """Return `tweet` with URLs removed and surrounding whitespace stripped.

  A URL is taken to be a run of non-whitespace characters that starts with
  "://" (optionally preceded by "http" or "https") or with "www." at a word
  boundary. Matching up to the next whitespace removes the whole link, including
  characters such as "-", "#", "~" or ":" that would otherwise be left behind
  as fragments.

  Args:
    tweet: Text to clean.

  Returns:
    The cleaned string.
  """
  new_tweet = re.sub(r'(?:(?:https?)?://|\bwww\.)\S+', '', tweet)
  return new_tweet.strip()

def remove_tags(tweet):
  """Drop every whitespace-separated token that begins with "@".

  The remaining tokens are joined back together with single spaces.
  """
  kept = []
  for word in tweet.split():
    if word.startswith("@"):
      continue
    kept.append(word)
  return " ".join(kept)

def preprocess_tweet(tweet):
  """Clean a raw tweet and return it as a lower-cased, space-joined string.

  Steps, in order: strip URLs, strip emoji, drop "@" tokens, re-join the pieces
  returned by wordninja.split, expand contractions, then lower-case each token
  and discard tokens made up of a single distinct character.
  """
  cleaned = strip_urls(tweet)
  cleaned = strip_emoji(cleaned)
  cleaned = remove_tags(cleaned)

  segmented = " ".join(wordninja.split(cleaned))
  expanded = contractions.fix(segmented)

  kept = []
  for word in expanded.split():
    # len(set(word)) counts distinct characters: "a", "!!" or "zzz" are dropped.
    if len(set(word)) > 1:
      kept.append(word.lower())
  return " ".join(kept)

# Clean the text of every training tweet
print("Cleaning and parsing the tweets...\n")
train_data_df["tweet"] = train_data_df["tweet"].apply(preprocess_tweet)
print("Finished!\n")

# Map label 4 to 1 so the classes are 0 and 1; every other value is left unchanged
train_data_df["sentiment"] = train_data_df["sentiment"].replace(4, 1)

# Class distribution after the remap
print("Sentiment values: ")
sentiment_counts = train_data_df["sentiment"].value_counts()
print(sentiment_counts)

# Peek at the first rows of the cleaned frame
print(train_data_df.head(5))



Cleaning and parsing the tweets...



  


# Analyze data
Choosing the vocabulary size and the length of the token sequence fed to the model for each tweet is a crucial step in building a good model.

This is done by analyzing the training dataset: plotting the distribution of
tweet lengths and counting the unique words across the training data.


In [None]:
# Estimating vocab size and max sequence length to allow in vectorization layer.
def tweet_length(tweet):
  """Return the number of whitespace-separated tokens in `tweet`."""
  tokens = tweet.split()
  return len(tokens)

tweet_lengths = [tweet_length(tweet) for tweet in train_data_df.tweet.tolist()]
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its documented replacement.
sns.histplot(tweet_lengths, kde=True, stat="density")

# Flatten the corpus into one token list once; every statistic below reuses it
# instead of re-tokenising all tweets again.
total_words = [token for tweet in train_data_df.tweet for token in tweet.split()]

# Unique words
unique_words = set(total_words)
print("Total Unique Words:", len(unique_words))

# Counting Total Words and Stop Words
nltk.download("stopwords")
stop_words = stopwords.words("english")
# Membership tests against a set avoid scanning the whole stop-word list for every token.
stop_word_set = set(stop_words)
total_stop_words = [token for token in total_words if token in stop_word_set]
print('Total words', len(total_words))
print('Total stop words', len(total_stop_words))
# Report NaN instead of raising ZeroDivisionError when no stop word was found.
words_per_stop_word = (
    len(total_words) / len(total_stop_words) if total_stop_words else float("nan"))
print('Ratio of total words to total stop words:', words_per_stop_word)

# Building a Data Pipeline

In [None]:
"""
Data Pipeline Function using TF Dataset API
""" 
def data_input_fn(texts, labels, batch_size=32, is_training=True):
  """Build a batched tf.data pipeline over (text, label) pairs.

  Args:
    texts: Sequence of input texts, sliced along its first axis.
    labels: Sequence of labels aligned with `texts`.
    batch_size: Number of examples per batch.
    is_training: When True the data is shuffled (buffer of 1000, reshuffled on
      every pass) and repeated indefinitely, so the consumer must bound an epoch
      itself (e.g. with steps_per_epoch). When False the data is iterated once,
      in order.

  Returns:
    A tf.data.Dataset yielding (texts, labels) batches.
  """
  # Convert the inputs to a Dataset.
  dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
  # Cache the raw examples so later passes do not rebuild them.
  dataset = dataset.cache()
  if is_training:
    dataset = dataset.shuffle(1000, reshuffle_each_iteration=True)
    dataset = dataset.repeat()
  # Keep the final partial batch outside training: dropping it would silently
  # exclude up to batch_size - 1 examples from validation/test metrics.
  dataset = dataset.batch(batch_size, drop_remainder=is_training)
  # Prepare the next batch while the current one is being consumed.
  dataset = dataset.prefetch(tf.data.AUTOTUNE)
  # Return the dataset.
  return dataset

#Split training dataset into train and validation sets
# A fixed random_state makes the 80/20 split identical on every run, so results
# can be reproduced after a kernel restart.
train_df, val_df = train_test_split(train_data_df, test_size=0.2, random_state=42)

# Data pipelines for 2 different datasets
# Training: shuffled and repeated batches of 1024; validation: one ordered pass in batches of 128.
training_dataset = data_input_fn(train_df.tweet, train_df.sentiment, batch_size=1024)
validation_dataset = data_input_fn(val_df.tweet, val_df.sentiment, batch_size=128, is_training=False)


# Train model
### Text Vectorization Layer
Before feeding the text to the model, it is common practice to vectorize it first.

This can be achieved with the `TextVectorization` layer from `tf.keras`.

In [None]:
# Creating Vectorization Layer
max_features = 75000  # maximum vocabulary size kept by the layer
max_len = 50          # every tweet is padded/truncated to this many token ids

# Prefer the stable tf.keras.layers.TextVectorization and fall back to the
# deprecated experimental path only on TensorFlow releases that lack it.
try:
  TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
  TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

vectorization_layer = TextVectorization(
    max_tokens=max_features, output_sequence_length=max_len)
# Learn the vocabulary from the training split only.
vectorization_layer.adapt(train_df.tweet.values)

### Design

In [None]:
# Model factory
def create_model():
  """Build the uncompiled sentiment classifier.

  The network takes one raw string per example, turns it into token ids with
  the already adapted `vectorization_layer`, embeds them, runs two stacked
  LSTMs followed by dropout and a ReLU dense layer, and ends in a single
  sigmoid unit.

  Returns:
    A tf.keras Model mapping the string input to the sigmoid output.
  """
  layers = tf.keras.layers
  text_input = tf.keras.Input(shape=(1,), dtype=tf.string)

  # Layers are created and applied in order, first to last.
  stack = [
      vectorization_layer,
      layers.Embedding(input_dim=max_features+1, output_dim=128),
      layers.LSTM(256, return_sequences=True, name='LSTM_1'),
      layers.LSTM(256, name='LSTM_2'),
      layers.Dropout(0.3),
      layers.Dense(64, activation='relu', name='Dense_3'),
      layers.Dense(1, activation='sigmoid', name='Output'),
  ]

  tensor = text_input
  for layer in stack:
    tensor = layer(tensor)

  return tf.keras.models.Model(inputs=text_input, outputs=tensor)

In [None]:
# batch_size is only used to derive steps_per_epoch; it must match the batch
# size training_dataset was built with.
batch_size = 1024
epochs = 3
# Number of full batches in one pass over the training split.
steps_per_epoch = train_df.tweet.shape[0] // batch_size
model = create_model()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fitting the model
# training_dataset already yields batches, so batch_size is not passed to fit();
# the dataset decides the batch size and steps_per_epoch decides the epoch length.
model.fit(training_dataset, epochs=epochs,
          steps_per_epoch=steps_per_epoch, validation_data=validation_dataset)

In [None]:
# Saving Model
import os
MODEL_DIR = "/content/mydrive/SentimentAnalysis/Output"
version = 1
export_path = os.path.join(MODEL_DIR, str(version))
print('export_path = {}\n'.format(export_path))

tf.keras.models.save_model(
    model,
    export_path,
    overwrite=True,
    include_optimizer=True,
    save_format='h5',
    signatures=None,
    options=None
)

# Check the path
print('\nSaved model:')
!ls -l {export_path}

# Using SavedModelCLI to check if model is persisted properly
!saved_model_cli show --dir {export_path} --all

In [None]:
# Loading and Evaluation of Model

# Only columns 0 and 5 of the CSV are kept and named 'sentiment' and 'tweet'.
test_data_df = pd.read_csv(os.path.join(input_path, 'testdata.manual.2009.06.14.csv'),  encoding='latin-1', usecols=[0,5], names=['sentiment','tweet'])
# Class proportions of the raw test file
print(test_data_df.sentiment.value_counts()/test_data_df.shape[0])

# Preprocessing tweets data
print("Cleaning and parsing the tweets...\n")
test_data_df["tweet"] = test_data_df["tweet"].apply(preprocess_tweet)
print("Finished!\n")

# Remove intermediate polarities (label 2). .copy() gives an independent frame,
# so the column assignment below does not write to a slice of the original.
test_data_df = test_data_df[test_data_df.sentiment != 2].copy()

# Preprocessing labels to have classes 0 and 1 (4 becomes 1, other values are kept)
test_data_df["sentiment"] = test_data_df["sentiment"].apply(lambda value: 1 if value==4 else value)

# Class proportions after filtering and remapping
print(test_data_df.sentiment.value_counts()/test_data_df.shape[0])

#Create data pipeline for test
test_dataset = data_input_fn(test_data_df.tweet, test_data_df.sentiment, batch_size=128, is_training=False)

# Reload the exported model from disk so the evaluation covers the persisted artefact.
model = tf.keras.models.load_model(export_path)
model.evaluate(test_dataset)