In [None]:
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" start-up
# error some TensorFlow/MKL installs hit - confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import warnings
# Silences every warning for the rest of the notebook, including deprecation
# and pandas chained-assignment warnings.
warnings.filterwarnings("ignore")

In [None]:
import re
import matplotlib.pyplot as plt
import string
import pandas as pd
import numpy as np
import csv

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.tokenize.treebank import TreebankWordDetokenizer

from gensim.utils import simple_preprocess

import tensorflow as tf
import keras

import os.path
from os import path

In [None]:
from tensorflow.python.client import device_lib

# Only the last expression of a cell is displayed, so the GPU list returned by the
# first call is discarded; the visible output is the full local device list.
tf.config.list_physical_devices('GPU')
device_lib.list_local_devices()

# Load dataset & See stats

In [None]:
# Read the raw tweets, keep only the last occurrence of each duplicated text and
# retain the two columns used by the rest of the notebook.
dataset = (
    pd.read_csv("../data/raw/datasetBalanced.csv")
    .drop_duplicates(subset='text', keep="last")
    [['text', 'sentiment']]
)

dataset.head(5)

In [None]:
# Number of rows left after de-duplication
dataset.shape[0]

In [None]:
# Sanity check: the label column should only contain negative (0), neutral (1) and positive (2)
pd.unique(dataset['sentiment'])

In [None]:
# Class balance: for each sentiment value, count the distinct values of every other
# column (here: the number of distinct tweet texts per class).
dataset.groupby('sentiment').nunique()

# Data cleaning

Although the dataset is slightly imbalanced, we'll keep it as it is because the differences between the classes are not significant.

In [None]:
# Fill null values.
# Assign the result back to the column instead of calling fillna(inplace=True) on
# `dataset["text"]`: the chained in-place form operates on a temporary Series under
# pandas copy-on-write and would leave `dataset` unchanged (silently, because
# warnings are disabled at the top of the notebook).
dataset["text"] = dataset["text"].fillna("No content")

### The next steps about data cleaning will be:

* Remove URLs from the tweets
* Tokenize text
* Remove emails
* Remove new lines characters
* Remove distracting single quotes
* Remove all punctuation signs
* Lowercase all text
* Detokenize text
* Convert list of texts to Numpy array

In [None]:
# Patterns are compiled once and reused for every tweet.
url_pattern = re.compile(r'https?://\S+|www\.\S+')
TAG_RE = re.compile(r'<[^>]+>')
EMOJI_RE = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
# Any whitespace-delimited token containing "@": e-mail addresses and @mentions alike.
EMAIL_OR_MENTION_RE = re.compile(r'\S*@\S*\s?')
WHITESPACE_RE = re.compile(r'\s+')
# The capture group lets the hashtag text survive while the "#" is dropped.
HASHTAG_RE = re.compile(r'#([A-Za-z0-9]+)')


def depure_data(data):
    """Strip noise from a single tweet before tokenization.

    Steps, in order: remove URLs, remove every token containing "@" (e-mails
    and mentions), collapse whitespace runs (including new lines) into one
    space, drop single quotes, remove emojis, drop the "#" of hashtags while
    keeping their text, and remove HTML tags.

    Differences from the previous implementation:
      * the extra "(?:@|http?://|https?://|www)\\S+" substitution is gone. URLs
        and "@" tokens are already removed earlier, so its only remaining effect
        was deleting ordinary words containing "www" (e.g. "awwww" -> "a");
      * hashtags keep their text ("#python" -> "python"), as the step's comment
        always stated, instead of being deleted entirely;
      * the separate "@[A-Za-z0-9]+" substitution was dead code (no "@" survives
        the e-mail/mention step) and has been dropped.

    Args:
        data (str): raw tweet text.

    Returns:
        str: the cleaned text. Removals done after the whitespace collapse may
        leave double or trailing spaces.
    """
    # Removing URLs with a regular expression
    data = url_pattern.sub('', data)

    # Remove e-mails and @mentions
    data = EMAIL_OR_MENTION_RE.sub('', data)

    # Remove new line characters (and any other whitespace run)
    data = WHITESPACE_RE.sub(' ', data)

    # Remove distracting single quotes
    data = data.replace("'", "")

    # Remove emojis
    data = EMOJI_RE.sub('', data)

    # Remove hashtag sign but keep the text
    data = HASHTAG_RE.sub(r'\1', data)

    # Remove html tags
    data = TAG_RE.sub('', data)

    return data

In [None]:
# English stop words. The list is kept under its original name; the set mirror gives
# O(1) membership tests instead of scanning the list for every word of every tweet.
list_words = stopwords.words('english')
_stop_word_set = set(list_words)

tweets = dataset['text'].values.tolist()
tweets_len = len(tweets)

# Built once and reused, instead of instantiating a detokenizer for every tweet.
_treebank_detokenizer = TreebankWordDetokenizer()


def remove_stopwords(data):
    """Remove English stop words (like "will") from a whitespace-separated string.

    The comparison is case-insensitive: this step runs before the text is
    lower-cased, so a case-sensitive check would let "The", "And", ... through.
    """
    return " ".join(word for word in data.split() if word.lower() not in _stop_word_set)


def processor(sentence):
    """Tokenize with gensim's simple_preprocess: lower-case, strip punctuation and accents.

    NOTE(review): simple_preprocess also drops very short/long tokens by default
    - confirm against the gensim documentation.
    """
    return simple_preprocess(str(sentence), deacc=True)


def detokenizer(sequence):
    """Join a sequence of tokens back into a single string."""
    return _treebank_detokenizer.detokenize(sequence)


# Full cleaning pipeline: regex clean-up -> stop-word removal -> tokenize -> re-join.
data = [detokenizer(processor(remove_stopwords(depure_data(tweet)))) for tweet in tweets]

print(data[:5])

In [None]:
data = np.array(data)

# Persist the cleaned tweets so the (slow) cleaning step does not have to be repeated
cleaned_frame = pd.DataFrame(data)
cleaned_frame.to_csv('../data/pre_processing/pre_processed_tweets.csv')

# Label encoding

The sentiment labels are categorical (negative = 0, neutral = 1, positive = 2), so we need to convert them into a representation our model can be trained on. To achieve this, we'll use the `to_categorical` method from Keras, which one-hot encodes each label as a float vector of length 3.

In [None]:
# One-hot encode the three sentiment classes as float32 rows
labels = tf.keras.utils.to_categorical(
    dataset['sentiment'],
    num_classes=3,
    dtype="float32",
)
print(labels)

# Data sequencing and splitting

We'll use the Keras tokenizer together with its `pad_sequences` helper to transform each text into a fixed-length sequence of integer word indices (a 2D array of shape `(samples, max_len)`); otherwise our neural networks couldn't be trained on it.

In [None]:
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop,Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split

In [None]:
# max_words: vocabulary size kept by the tokenizer / embedding layer
# max_len:   length every sequence is padded or truncated to
max_words, max_len = 5000, 200

In [None]:
# Build the vocabulary on the cleaned tweets, limited to `max_words` by num_words.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
# Replace every tweet by its sequence of integer word indices.
sequences = tokenizer.texts_to_sequences(data)
# NOTE: `tweets` is rebound here - it previously held the raw tweet strings and
# from now on holds the sequences padded with maxlen=max_len.
tweets = pad_sequences(sequences, maxlen=max_len)
print(tweets)

In [None]:
# Hold out 20% of the samples; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.20,
    random_state=42,
)

print(len(X_train), len(X_test), len(y_train), len(y_test))

# Model building

Alright, in the next cells I'll guide you through the process of building 3 neural networks (two recurrent and one convolutional). I'll use sequential models from the Keras API to achieve this task. Essentially, I'll start with a single-layer **LSTM** network, which is known for achieving good results in NLP tasks when the dataset is relatively small (I could have started with a SimpleRNN, which is even simpler, but to be honest it's rarely deployed in production environments because it is too simple - however, I'll leave it commented in case you want to see how it's built). The next one will be a Bidirectional LSTM model, a more complex one that is known to achieve great metrics in text classification. To go beyond the classic NLP approach, finally we'll implement a less common model: a Convolutional 1D network, which is also known for delivering good metrics in NLP. If everything goes well, we should get the best results with the bidirectional LSTM; let's see what happens.

Let's get hands on:

## Bidirectional LSTM + LSTM model

In [None]:
# Embedding -> bidirectional LSTM (returns the full sequence) -> LSTM -> 3-way softmax
model = Sequential([
    layers.Embedding(max_words, 20),
    layers.Bidirectional(layers.LSTM(10, dropout=0.5, return_sequences=True)),
    layers.LSTM(units=10, dropout=0.5),
    layers.Dense(3, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Checkpoint callback: keep only the model with the best validation accuracy so it is
# not lost in later epochs.
# NOTE(review): validation_data below is the test split, so the "best" model is
# selected on the same data that is later used for evaluation.
checkpoint1 = ModelCheckpoint(
    "../models/0_bi_lstm_tweet_sentiment.hdf5",
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='auto',
    save_weights_only=False,
)

batch_size=100 # 688361 samples / batch_size = Number of iterations per epoch
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint1],
    batch_size=batch_size,
)

# Best model validation


In [None]:
# Reload the checkpoint written during training (best validation accuracy)
best_model_path = "../models/0_bi_lstm_tweet_sentiment.hdf5"
model = keras.models.load_model(best_model_path)

# evaluate() returns the loss followed by the compiled metrics (accuracy only)
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)

print('Model accuracy: ',test_acc)

In [None]:
# Per-class model outputs for every test sample.
# NOTE(review): `predictions` is not read by any later cell in this view.
predictions = model.predict(X_test)

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric: training curve first, validation curve second
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])

    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train','test'], loc='upper left')
    plt.show()

## Examples


In [None]:
# Human-readable class names, indexed by class id: negative (0), neutral (1), positive (2).
sentiment = ['Negative', 'Neutral', 'Positive']

In [None]:
sequence = tokenizer.texts_to_sequences(['this experience has been the best, want my money back'])

test = pad_sequences(sequence, maxlen=max_len)

# Take the argmax of the raw probabilities. Rounding them first turns any prediction
# without a probability above 0.5 into all zeros, which argmax reports as class 0
# ("Negative") regardless of the actual winner.
sentiment[model.predict(test).argmax(axis=1)[0]]

In [None]:
sequence = tokenizer.texts_to_sequences(['this data science article is the best ever'])

test = pad_sequences(sequence, maxlen=max_len)

# Take the argmax of the raw probabilities. Rounding them first turns any prediction
# without a probability above 0.5 into all zeros, which argmax reports as class 0
# ("Negative") regardless of the actual winner.
sentiment[model.predict(test).argmax(axis=1)[0]]

In [None]:
sequence = tokenizer.texts_to_sequences(['i hate youtube ads, they are fantastic'])

test = pad_sequences(sequence, maxlen=max_len)

# Take the argmax of the raw probabilities. Rounding them first turns any prediction
# without a probability above 0.5 into all zeros, which argmax reports as class 0
# ("Negative") regardless of the actual winner.
sentiment[model.predict(test).argmax(axis=1)[0]]

In [None]:
sequence = tokenizer.texts_to_sequences(['i really how the technician helped me with the issue that i had'])

test = pad_sequences(sequence, maxlen=max_len)

# Take the argmax of the raw probabilities (rounding first collapses low-confidence
# predictions to class 0) and show the class name, like the other example cells,
# instead of the bare class index.
sentiment[model.predict(test).argmax(axis=1)[0]]

In [None]:
import glob

In [None]:
# Collect the CSV files located exactly one directory below month_1.
# NOTE(review): recursive=True only affects patterns containing "**", so it has no
# effect on this pattern.
files = glob.glob('../../bitcoin_prediction/data/year_2022/month_1/*/*.csv', recursive=True)

In [None]:
# Show which files were matched before processing them.
print(files)

In [None]:
def prepare_prediction(text):
    """Predict the sentiment class id of a single raw text.

    The text goes through the same cleaning pipeline as the training data
    (depure_data -> remove_stopwords -> processor -> detokenizer), is converted
    to a padded index sequence with the fitted tokenizer and fed to `model`.

    Args:
        text: raw tweet text. Missing values (None/NaN) are replaced by
            "No content", as was done for the training set; other non-string
            values are converted with str().

    Returns:
        The index of the highest output: 0 (negative), 1 (neutral) or
        2 (positive).
    """
    if not isinstance(text, str):
        # Missing cells read from a CSV arrive as NaN and would crash the regex clean-up.
        text = "No content" if pd.isna(text) else str(text)

    clean = detokenizer(processor(remove_stopwords(depure_data(text))))

    sequence = tokenizer.texts_to_sequences([clean])

    padded = pad_sequences(sequence, maxlen=max_len)

    # argmax on the raw probabilities: rounding them first zeroes every class when no
    # probability exceeds 0.5, which made argmax fall back to class 0 (negative).
    return model.predict(padded).argmax(axis=1)[0]

In [None]:
def execution(file):
    """Add a "Sentiment" column with the predicted class id to a CSV file, in place.

    The file is read in chunks of 25 rows, every value of its "text" column is
    classified with prepare_prediction, and once the whole file has been read
    the labelled rows are written back to the same path.

    Fixes over the previous version: the `for` statement was missing its colon
    (SyntaxError); the labelled chunks were discarded and the training
    `dataset` was written over the input file; and the chained assignment
    `chunk["Sentiment"][i] = ...` is replaced by a single column assignment.

    Args:
        file (str): path of a CSV file that has a "text" column.

    Returns:
        None. The file at `file` is overwritten, unless it contained no rows.
    """
    print("Start file \"{}\"".format(file))

    labelled_chunks = []
    for chunk in pd.read_csv(file, lineterminator='\n', chunksize=25):
        chunk["Sentiment"] = chunk["text"].map(prepare_prediction)
        labelled_chunks.append(chunk)

    # Write only after the reader is exhausted: the output path is the input path.
    if labelled_chunks:
        pd.concat(labelled_chunks).to_csv(file, index=False)

    print("End file \"{}\"".format(file))

In [None]:
import time
from multiprocessing.pool import ThreadPool

start = time.perf_counter()

# Label every file using a pool of 32 worker threads.
pool = ThreadPool(processes=32)
try:
    # map() blocks until every file has been processed.
    pool.map(execution, files)
finally:
    # close() must come before join(); joining a pool that is still running
    # raises ValueError.
    pool.close()
    pool.join()

finish = time.perf_counter()

print("Finished in {} seconds".format(finish-start))

In [None]:
# Preview of `dataset`, the frame loaded from datasetBalanced.csv earlier in the notebook.
# NOTE(review): the per-file predictions live in the processed CSV files, not in
# `dataset` - confirm this is the frame meant to be inspected here.
dataset.head()