Delta Task 2 Project WIP

In [None]:
!pip install wordninja

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import sklearn
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import wordninja

In [None]:
# Read the train and test tweet sets from parquet files into pandas data frames.
# Both paths are relative, so the files are looked up in the current working directory.

btc_tweets_train = pd.read_parquet('btc_tweets_train.parquet.gzip')
btc_tweets_test = pd.read_parquet('btc_tweets_test.parquet.gzip')

### Exploratory Analysis to Understand the Data

We start with exploratory data analysis to understand the structure of the data. Both the training and the test set have 5 columns; the training set has 1500 observations and the test set has 500.

In [None]:
# Print the column overview of the train set, then of the test set
btc_tweets_train.info()
btc_tweets_test.info()

The content column holds the tweet text used for sentiment analysis. The tweets include hashtags, emojis and line breaks, which may need handling before we start tokenizing the data. The text is also a mix of uppercase and lowercase characters.

In [None]:
# Show the raw text of one training tweet, selected by its index label
btc_tweets_train['content'][1641861088483368964]

The data frame is indexed by tweet: the index value of each observation is its tweet_ID.

In [None]:
# Display the first 10 rows of the training set
btc_tweets_train.head(10)

The sentiment of the content is already a boolean, so we don't need to change that. The training set consists predominantly of tweets with a positive sentiment label.

In [None]:
# Summary statistics of the training set as produced by DataFrame.describe()
btc_tweets_train.describe()

### Preprocessing Function

In [None]:
# Download the NLTK resources the preprocessing functions below rely on:
# tokenizer data, the stopword lists, WordNet and the English POS tagger.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Creating a POS Tag (Part of Speech tagging) to use in lemmatization

def get_wordnet_pos(word:str)->str:
    """Return the WordNet part-of-speech constant to use when lemmatizing a word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of that tag selects the WordNet category.

    Parameters:
    -----------
    word: str
        The single token to tag.

    Returns:
    --------
    pos: str
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.ADV`` or ``wordnet.NOUN``.
        ``wordnet.NOUN`` is also the fallback when the tag starts with a
        letter other than J, N, V or R.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV

    # "N" and every unmapped tag are treated as nouns
    return wordnet.NOUN

In [None]:
def NLP_preprocessing_pipeline(sample:str)->str:
    '''
    Implements a NLP preprocessing pipeline specific for tweets from twitter.

    Steps, in order: remove URLs, lower-case, turn line breaks into spaces,
    split hashtags into their component words, drop every character that is
    not a letter, a digit, ',' or '.', tokenize, remove English stopwords and
    stand-alone '.' / ',' tokens, lemmatize each token with its POS tag and
    join the tokens back into one string.

    Parameters:
    -----------
    sample:str
        The input text that requires preprocessing

    Returns:
    --------
    preprocessed_textual_sample:str
        The cleaned version of the tweet as a string after preprocessing steps have been applied.

    '''

    # Removing of URLs (raw string so that \S is a regex class, not a string escape)
    preprocessed_textual_sample = re.sub(r"http\S+", " ", sample)

    # Changing all tokens to lower case
    preprocessed_textual_sample = preprocessed_textual_sample.lower()

    # Removing the line breaks
    preprocessed_textual_sample = preprocessed_textual_sample.replace('\n', " ")

    # Replacing every hashtag with the space-separated words it is made of.
    # Each match is substituted on its own in a single pass; replacing tag by
    # tag with str.replace would let a short tag such as '#btc' corrupt a
    # longer one such as '#btcusd' before the longer one is split.
    preprocessed_textual_sample = re.sub(
        r'#(\w+)',
        lambda match: ' '.join(wordninja.split(match.group(1))),
        preprocessed_textual_sample)

    # Removing of non-alphabetic characters except numbers;
    # ',' and '.' are kept so that numbers like 100,000 are not split
    preprocessed_textual_sample = re.sub(r"[^a-zA-Z0-9,.]", " ", preprocessed_textual_sample)

    # Tokenization
    tokens = nltk.word_tokenize(preprocessed_textual_sample)

    # Stopwords removal, together with stand-alone '.' and ',' tokens.
    # The stopword list is loaded once per call and turned into a set,
    # instead of being reloaded and scanned for every token.
    english_stopwords = set(stopwords.words("english"))
    tokens = [token for token in tokens
              if token not in english_stopwords and token not in ('.', ',')]

    # Lemmatize with POS Tag (Parts of Speech tagging)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

    # Join the tokens to create a cleaned version of strings to use in Keras Tokenizer
    preprocessed_textual_sample = ' '.join(tokens)

    return preprocessed_textual_sample


In [None]:
# Apply the preprocessing pipeline to every tweet's text in the train and test sets
x_tweets_clean_train =  btc_tweets_train['content'].apply(NLP_preprocessing_pipeline)
x_tweets_clean_test =  btc_tweets_test['content'].apply(NLP_preprocessing_pipeline)

# Take the 'sentiment' column of each set as the target labels
y_tweets_train = btc_tweets_train['sentiment']
y_tweets_test = btc_tweets_test['sentiment']

In [None]:
# Inspect the cleaned test tweets
x_tweets_clean_test

### Dictionary Based Sentiment Analysis

In [None]:
!pip install pysentiment2

In [None]:
import pysentiment2 as ps
# Sentiment dictionary object (pysentiment2's HIV4) used by get_sentiment_score below
dictionary = ps.HIV4()

#Creating a function for sentiment score assesment using a dictionary
def get_sentiment_score(text):
    """Score one text with the module-level ``dictionary``.

    The text is tokenized by the dictionary's own tokenizer, and the
    'Polarity' entry of the resulting score is returned rounded to two
    decimals.
    """
    tokens = dictionary.tokenize(text)
    polarity = dictionary.get_score(tokens)['Polarity']
    return round(polarity, 2)

# Assessing polarity scores of the cleaned test tweets using the sentiment dictionary
dc_test_sentiment_scores = x_tweets_clean_test.apply(get_sentiment_score)


In [None]:
# Histogram of the dictionary polarity scores of the test tweets
dc_test_sentiment_scores.hist()

### Function for Assessing Sentiment Classifier

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

def assess_sentiment_classifier(y_test, y_pred_prob, cut_off=0.5, plot_roc=True):
    """
        Function to assess the classification results from the model.
        Calculates accuracy score, roc auc score and confusion matrix.
        Plots a roc curve when plot_roc is true.

        Parameters:
        -----------
        y_test:
            True binary labels.
        y_pred_prob:
            Predicted scores, one per observation. Accepts a list, a pandas
            object or a numpy array; a single-column 2-D array of shape (n, 1)
            is flattened to 1-D.
        cut_off: float
            Scores strictly greater than this value are classified as 1, all others as 0.
        plot_roc: bool
            Whether to draw the ROC curve.

        Returns:
        --------
        (auc, acc, cmat):
            ROC AUC and accuracy, each rounded to 4 decimals, and the confusion matrix.
    """
    # Bring the scores into a 1-D numpy array so that the comparison below also
    # works for plain lists and the metrics receive the same shape as the labels
    y_score = np.asarray(y_pred_prob)
    if y_score.ndim == 2 and y_score.shape[1] == 1:
        y_score = y_score.ravel()

    # Calculate discrete class predictions
    y_pred_discrete = np.where(y_score > cut_off, 1, 0)

    # Accuracy: share of correctly classified observations, (TP + TN) / all
    acc = round(accuracy_score(y_test, y_pred_discrete), 4)
    # AUC: area under the ROC curve, computed from the continuous scores
    auc = round(roc_auc_score(y_test, y_score), 4)

    # Confusion matrix
    cmat = confusion_matrix(y_test, y_pred_discrete)

    # ROC analysis
    if plot_roc:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        plt.plot(fpr, tpr, label="AUC={:.4}".format(auc))
        plt.plot([0, 1], [0, 1], "r--")  # diagonal reference line
        plt.ylabel('True positive rate')
        plt.xlabel('False positive rate')
        plt.legend(loc='lower right')
        plt.show()

    return (auc, acc, cmat)

### Assessing Dictionary Based Sentiment Classifier

In [None]:
# cut_off is set to 0 because the dictionary returns scores in the range (-1, 1),
# where a score < 0 is negative and a score > 0 is positive
assess_sentiment_classifier(y_tweets_test, dc_test_sentiment_scores, cut_off = 0,plot_roc=True)

In [None]:
# Run the dictionary evaluation with cut_off = 0 and keep AUC and accuracy; the confusion matrix is discarded
auc_dict, acc_dict, _ = assess_sentiment_classifier(y_tweets_test, dc_test_sentiment_scores, cut_off = 0,plot_roc=True)

In [None]:
# Results table: one row per metric (ACC, AUC); each model adds one column
df_scores = pd.DataFrame(index=['ACC', 'AUC'])
df_scores['Dictionary'] = [acc_dict,auc_dict]

### Document Level Sentiment Analysis Using TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def dummy_fun(content):
    """Identity function: hand back the argument unchanged.

    Passed to the TF-IDF vectorizer as its ``preprocessor``.
    """
    unchanged = content
    return unchanged

# Word-level TF-IDF vectorizer that tokenizes with NLTK's word_tokenize.
# dummy_fun is passed as the preprocessor so the text is handed on unchanged;
# token_pattern is set to None because a custom tokenizer is supplied.
tfidf_vectorizer = TfidfVectorizer(
    analyzer = 'word',
    tokenizer = nltk.word_tokenize,
    preprocessor = dummy_fun,
    token_pattern = None)

# Feature extraction using TF-IDF: the vocabulary is learned from the training set only
x_tweets_clean_train_tfidf = tfidf_vectorizer.fit_transform(x_tweets_clean_train) #fit training data to create the vocabulary and transform
x_tweets_clean_test_tfidf = tfidf_vectorizer.transform(x_tweets_clean_test) #transform test data

# Fit a logistic regression on the training features and score the test set
logit = LogisticRegression()
logit.fit(x_tweets_clean_train_tfidf,y_tweets_train)
tfidf_test_prob = logit.predict_proba(x_tweets_clean_test_tfidf)[:,1] #Probability of being positive
# NOTE(review): tfidf_test_discrete is not referenced again in the visible cells
tfidf_test_discrete = np.where(tfidf_test_prob>0.5, 1, 0)

In [None]:
# Evaluate the TF-IDF + logistic regression probabilities with the default cut-off
assess_sentiment_classifier(y_test=y_tweets_test,
                            y_pred_prob = tfidf_test_prob)

In [None]:
# Run the TF-IDF evaluation and keep AUC and accuracy; the confusion matrix is discarded
auc_tfidf, acc_tfidf, _ = assess_sentiment_classifier(y_test=y_tweets_test,y_pred_prob = tfidf_test_prob)

In [None]:
# Add the TF-IDF model's accuracy and AUC as a new column and show the table
df_scores['TFIDF'] = [acc_tfidf,auc_tfidf]
df_scores

### Processing Train & Test Sets to use in RNN

In [None]:
#Building vocabulary using Keras Tokenizer:
from tensorflow.keras.preprocessing.text import Tokenizer
# Size limit passed to the tokenizer as num_words
vocab_word_number = 6000

# Create tokenizer object. The out-of-vocabulary marker is a string:
# an integer marker would put a non-string key into tok.word_index,
# which later code iterates over as words.
tok = Tokenizer(num_words=vocab_word_number, oov_token='<OOV>')

# We fit the tokenizer to build vocabulary from the training set tweets
tok.fit_on_texts(x_tweets_clean_train)

In [None]:
# Number of distinct words the tokenizer counted in the training tweets
len(tok.word_counts)

In [None]:
# Convert training set tweets to sequences of integer values
x_tweets_clean_train_int = tok.texts_to_sequences(x_tweets_clean_train)

# Determine the length of the longest encoded tweet in the training set
# (the variable is called "review" length, but the items are tweets)
max_review_length = max([len(review) for review in x_tweets_clean_train_int])
print('The longest tweet of the training set has {} words.'.format(max_review_length))

In [None]:
#Padding to create equal length tweets to ensure a consistent sequence length
from keras.preprocessing.sequence import pad_sequences

# Pad the encoded training tweets to the common length max_review_length
x_tweets_train_pad = pad_sequences(x_tweets_clean_train_int, max_review_length)

In [None]:
# Encode and pad the test set with the tokenizer fitted on the training set
# and the sequence length determined from the training set
x_tweets_clean_test_int = tok.texts_to_sequences(x_tweets_clean_test)
x_tweets_test_pad = pad_sequences(x_tweets_clean_test_int, max_review_length)

### RNN Language Classifier - Using Own Embeddings

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, Flatten, Dropout, LSTM, Bidirectional
from keras.initializers import Constant

# Create an embedding layer without pre-trained weights:
# one 30-dimensional vector per index in the vocab_word_number-sized vocabulary
number_of_words = vocab_word_number
embedding_dim = 30
emb_layer = Embedding(input_dim = number_of_words,
                      output_dim= embedding_dim,
                      input_length= max_review_length)

In [None]:
# GRU text classifier (the GRU layer below is not wrapped in Bidirectional):
# embedding -> GRU -> dropout -> single sigmoid output unit
number_hidden_nodes = 20    # number of hidden nodes

gru = Sequential()
gru.add(emb_layer)
gru.add(GRU(number_hidden_nodes))
gru.add(Dropout(0.2))
gru.add(Dense(1, activation = 'sigmoid'))

gru.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])
print(gru.summary())

In [None]:
# Fit the GRU model on the padded training tweets;
# validation_split holds out a quarter of the training data for validation
nr_epoch = 10
batch_sz = 64
val_split = 0.25
story = gru.fit(x_tweets_train_pad, y_tweets_train, batch_size= batch_sz, epochs= nr_epoch, validation_split=val_split)

In [None]:
# Predict with the GRU model (sigmoid output) on the padded test tweets
yhat_gru = gru.predict(x_tweets_test_pad)

In [None]:
# Evaluate the GRU predictions with the default cut-off; the confusion matrix is discarded
auc_gru_own, acc_gru_own, _ = assess_sentiment_classifier(y_test=y_tweets_test,
                            y_pred_prob = yhat_gru)


In [None]:
# Add the GRU model's accuracy and AUC as a new column and show the table
df_scores['GRU_own_embeddings'] = [acc_gru_own,auc_gru_own]
df_scores

### GRU with Pre-trained Embeddings

In [None]:
!pip install gensim

In [None]:
import gensim
from gensim.models.keyedvectors import Word2VecKeyedVectors

#imdb_index = Word2VecKeyedVectors.load_word2vec_format('w2v_imdb_full_d100_e500.model', binary=False)

In [None]:
from gensim.models.keyedvectors import KeyedVectors

# Load pre-trained word vectors in binary word2vec format.
# Despite the variable name, the file loaded here is the GoogleNews vectors file.
imdb_index = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)


In [None]:
def get_embedding_matrix(tokenizer, pretrain, vocab_size, verbose=0):
    '''
        Helper function to construct an embedding matrix for
        the focal corpus based on some pre-trained embeddings.

        Parameters:
        -----------
        tokenizer:
            Object exposing ``word_index``, a mapping from token to integer index.
        pretrain:
            Pre-trained embeddings: either a dict mapping words to 1-D vectors
            or a ``Word2VecKeyedVectors`` instance.
        vocab_size: int
            Number of rows of the embedding matrix. Tokens whose index is
            ``vocab_size`` or larger do not fit into the matrix and are skipped.
        verbose: int
            When greater than 0, a progress line is printed at every index
            that is a multiple of 5000.

        Returns:
        --------
        (emb_mat, oov_words):
            ``emb_mat`` has shape (vocab_size, embedding dimension); row i holds
            the pre-trained vector of the token with index i, or zeros when no
            vector is available. ``oov_words`` lists the tokens inside the
            vocabulary range for which no pre-trained vector was found.

        Raises:
        -------
        TypeError
            If ``pretrain`` is neither a dict nor a ``Word2VecKeyedVectors``.
        ValueError
            If ``pretrain`` is an empty dict, so no dimension can be inferred.
    '''
    import time  # local import: only needed for the progress message

    # The embedding dimension is determined differently per embedding type.
    if isinstance(pretrain, dict):
        if not pretrain:
            raise ValueError('pretrain is an empty dict; cannot infer the embedding dimension')
        dim = next(iter(pretrain.values())).shape[0]  # get embedding of an arbitrary word
    elif isinstance(pretrain, Word2VecKeyedVectors):
        dim = pretrain.vector_size
    else:
        raise TypeError('{} is not supported'.format(type(pretrain)))

    # Rows stay zero for every token without a pre-trained embedding
    emb_mat = np.zeros((vocab_size, dim))

    # Tokens without a pre-trained embedding, kept for debugging
    oov_words = []

    # The tokenizer's own index is used so that row i of the matrix corresponds
    # to integer i in the encoded input data
    v = len(tokenizer.word_index)
    start = time.time()

    for word, i in tokenizer.word_index.items():
        # Tokens beyond the matrix cannot be stored; they are skipped
        # and are not counted as out-of-vocabulary
        if i >= vocab_size:
            continue

        if isinstance(word, str):
            try:
                emb_mat[i] = pretrain[word]
            except KeyError:
                oov_words.append(word)
        else:
            # A non-string token (e.g. a numeric OOV marker) is never looked up:
            # some embedding containers may resolve an integer key as a row
            # index and return an unrelated word's vector
            oov_words.append(word)

        # Some output that the method is still alive
        if i % 5000 == 0 and verbose > 0:
            print('{}/{} words in {} sec'.format(i, v, (time.time()-start)), flush=True)

    print('Encountered {} out-of-vocabulary words.'.format(len(oov_words)))
    return (emb_mat, oov_words)

In [None]:
# Create the embedding weight matrix for the tokenizer's vocabulary from the loaded
# pre-trained vectors; oov_words collects the tokens without a pre-trained vector
imdb_embeddings, oov_words = get_embedding_matrix(tok, imdb_index, vocab_word_number)

In [None]:
# Embedding layer initialized with the pre-trained embedding matrix.
# trainable = False keeps these weights fixed during model fitting.
pre_trained_emb_layer = Embedding(
    input_dim = vocab_word_number,
    output_dim = imdb_embeddings.shape[1],
    input_length = max_review_length,
    embeddings_initializer = Constant(imdb_embeddings),
    trainable = False
)

In [None]:
# GRU text classifier on top of the pre-trained embedding layer:
# embedding -> GRU -> dropout -> single sigmoid output unit
# NOTE(review): this GRU sets activation = 'relu', while the GRU of the first
# model passes no activation argument — confirm the difference is intended
number_hidden_nodes = 20    # number of hidden nodes

pre_trained_gru = Sequential()
pre_trained_gru.add(pre_trained_emb_layer)
pre_trained_gru.add(GRU(number_hidden_nodes, activation = 'relu'))
pre_trained_gru.add(Dropout(0.2))
pre_trained_gru.add(Dense(1, activation = 'sigmoid'))

pre_trained_gru.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])
print(pre_trained_gru.summary())

In [None]:
# Fit the pre-trained-embedding GRU with the same settings as the first GRU model.
# Note that `story` is reassigned here, replacing the first model's fit result.
nr_epoch = 10
batch_sz = 64
val_split = 0.25
story = pre_trained_gru.fit(x_tweets_train_pad, y_tweets_train, batch_size= batch_sz, epochs= nr_epoch, validation_split=val_split)

In [None]:
# Predict with the pre-trained-embedding GRU on the padded test tweets
yhat_pre_trained_gru = pre_trained_gru.predict(x_tweets_test_pad)

In [None]:
# Evaluate the predictions with the default cut-off; the confusion matrix is discarded
auc_pre_trained_gru, acc_pre_trained_gru, _ = assess_sentiment_classifier(y_test=y_tweets_test,
                            y_pred_prob = yhat_pre_trained_gru)

In [None]:
# Add the pre-trained-embedding GRU's accuracy and AUC as a new column and show the table
df_scores['Pre_Trained_GRU'] = [acc_pre_trained_gru,auc_pre_trained_gru]
df_scores

### Sentiment Analysis Using Pre-Trained Transformer Model

In [None]:
import torch
from transformers import pipeline

# Text-classification pipeline using the DistilBERT checkpoint named below;
# truncation = True is passed on to the pipeline
classifier = pipeline(
    task="text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    truncation = True
)

In [None]:
def pipeline_classify(data):
    ''' Run the module-level sentiment `classifier` on every row of `data`.

    For each row the first prediction returned by the classifier is used.
    Its score is kept as is when the label is 'POSITIVE' and negated for any
    other label. Returns the list of these signed scores, one per row and in
    input order. '''

    def signed_score(prediction):
        # Positive predictions keep their score, all others get a negative sign
        confidence = prediction['score']
        if prediction['label'] == 'POSITIVE':
            return confidence
        return -confidence

    return [signed_score(classifier(row)[0]) for row in data]


In [None]:
# Peek at the first cleaned test tweets that will be fed to the pipeline
x_tweets_clean_test.head()

In [None]:
# Signed classifier scores for the cleaned test tweets (negative sign = not labelled POSITIVE)
yhat_distilbert = pipeline_classify(x_tweets_clean_test)

In [None]:
# Inspect the list of signed scores, one per test tweet
yhat_distilbert

In [None]:
# The same scores shown as a one-column data frame
pd.DataFrame(yhat_distilbert)

In [None]:
# Signed scores as a 1-D array: positive for POSITIVE predictions, negative otherwise
yhat_distilbert_scores = np.asarray(yhat_distilbert)

# Discrete class predictions for the accuracy
yhat_distilbert_discrete = np.where(yhat_distilbert_scores > 0.5, 1, 0)
acc_distilbert = accuracy_score(y_tweets_test, yhat_distilbert_discrete)

# AUC is a ranking metric, so it is computed from the continuous scores;
# feeding it the 0/1 predictions would discard the ranking information
auc_distilbert = roc_auc_score(y_tweets_test, yhat_distilbert_scores)

In [None]:
# Discrete class predictions, kept as a 1-D array
yhat_distilbert_discrete = np.where(np.asarray(yhat_distilbert) > 0.5, 1, 0)

# Evaluate on the continuous signed scores so that AUC and the ROC curve use the
# full ranking. The scores are negative for non-POSITIVE predictions, hence cut_off = 0.
auc_distilbert, acc_distilbert, _ = assess_sentiment_classifier(y_test=y_tweets_test,
                            y_pred_prob = np.asarray(yhat_distilbert),
                            cut_off = 0)

In [None]:
# Add the DistilBERT pipeline's accuracy and AUC as a new column and show the table
df_scores['DistilBert'] = [acc_distilbert,auc_distilbert]
df_scores

### Fine-Tuning Pre-trained Transformer Model

In [None]:
# Importing the DistilBERT tokenizer that belongs to the 'distilbert-base-uncased' checkpoint
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Peek at the first training labels
y_tweets_train.head()

In [None]:
# Peek at the first cleaned training tweets
x_tweets_clean_train.head()


In [None]:
# Tokenize the cleaned training tweets with the DistilBERT tokenizer,
# with truncation enabled and padding set to 'max_length'
x_train_tokenized_distilbert = tokenizer(x_tweets_clean_train.tolist(), truncation=True, padding='max_length')

In [None]:
#Creating a torch dataset for training

class Tweets_Dataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenized tweets with their sentiment labels.

    ``content`` is a mapping from field name to a per-sample sequence of
    values; ``sentiment`` is the sequence of labels. Both are indexed with the
    same position.
    """

    def __init__(self, content, sentiment):
        self.sentiment = sentiment
        self.content = content

    def __len__(self):
        # The number of samples is the number of labels
        return len(self.sentiment)

    def __getitem__(self, idx):
        # One tensor per field of the tokenized content, then the label under 'labels'
        sample = {}
        for field, values in self.content.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.sentiment[idx])
        return sample

# Build the training dataset from the tokenized training tweets and their labels
train_dataset = Tweets_Dataset(x_train_tokenized_distilbert, y_tweets_train.tolist())

In [None]:
from transformers import AutoModelForSequenceClassification

# Load 'distilbert-base-uncased' with a sequence-classification head for 2 labels
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

In [None]:
# load trainer and set arguments for training
from transformers import TrainingArguments, Trainer

# Training configuration for fine-tuning; outputs go to ./results and logs to ./logs
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs = 1,
    per_device_train_batch_size=16,
    learning_rate= 5e-5,
    # NOTE(review): 500 warmup steps may exceed the number of optimizer steps of a
    # single epoch on a training set of this size — confirm this is intended
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    seed=111,
    data_seed=111

)

# Trainer wiring the model, the training arguments and the training dataset together.
# No evaluation dataset is passed, and training is not started in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset

)

### Data Frame for Result Comparison

In [None]:
# Final comparison table: accuracy and AUC per model
df_scores