<a href="https://colab.research.google.com/github/ale-camer/Data-Science/blob/Finance/Tweets_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook compares the predictive performance of a Machine Learning algorithm (Decision Tree) and a Deep Learning algorithm (LSTM) on text data. More precisely, we predict whether or not a tweet is inbound, i.e. whether it was sent to a company. For this purpose we use a dataset from [Kaggle](https://www.kaggle.com/datasets/thoughtvector/customer-support-on-twitter?select=sample.csv).

If you want to download the dataset directly into your Colab, see the following [YouTube video](https://www.youtube.com/watch?v=T8xEQI8XXGs&ab_channel=AI-SPECIALS).

## Packages

In [None]:
!pip install unidecode --quiet
import re
import nltk
import sklearn
import warnings
import unidecode
import numpy as np
import pandas as pd
import en_core_web_sm
from nltk.corpus import stopwords
from prettytable import PrettyTable
from nltk.stem import SnowballStemmer
from nltk.tokenize.casual import TweetTokenizer
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential#, Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, SpatialDropout1D
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

## Data

In [None]:
# The dataset contains millions of rows; only the first `nRows` rows and the two columns that are
# actually used are parsed, instead of loading the whole file and discarding most of it afterwards.
nRows, keepCols = 10000, ['text','inbound']
data = pd.read_csv('/content/twcs.csv', usecols=keepCols, nrows=nRows) # reading data
data = data[keepCols] # `usecols` keeps the file's column order, so the expected order is restored here
data['inbound'] = OrdinalEncoder().fit_transform(data[['inbound']]).ravel().astype('int') # target encoded as integer codes
print(data.head(),f"\n\n Distribution of labels to predict: \n{round(data['inbound'].value_counts(normalize=True),2)}",
      f"\n\n Number of rows: {format(data.shape[0],',d')}")

                                                text  inbound
0  @115712 I understand. I would like to assist y...        0
1      @sprintcare and how do you propose we do that        1
2  @sprintcare I have sent several private messag...        1
3  @115712 Please send us a Private Message so th...        0
4                                 @sprintcare I did.        1 

 Distribution of labels to predict: 
1    0.55
0    0.45
Name: inbound, dtype: float64 

 Number of rows: 10,000


## Functions

In [None]:
class Normalizer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):

    """
    Scikit-learn style transformer that normalizes tweets by:
      - lower-casing the text and (optionally) deleting every character that is not an
        ASCII letter, a digit or whitespace,
      - tokenizing with NLTK's TweetTokenizer,
      - (optionally) deleting URLs and stopwords,
      - deleting tokens whose length is not greater than `minWordLen`,
      - replacing diacritical marks,
      - (optionally) lemmatizing and/or stemming each token.

    Parameters
      language       : language used for the stopword list and the stemmer.
      minWordLen     : tokens with `len(token) <= minWordLen` are dropped.
      lemmatize      : lemmatize each token with the `en_core_web_sm` pipeline.
      stem           : stem each token with a Snowball stemmer.
      stripSpeChar   : delete special characters before tokenizing.
      stripUrls      : delete tokens that start with 'http'.
      stripStopwords : delete stopwords.
      text2Seq       : if True `transform` yields lists of tokens, otherwise it yields
                       the tokens joined by a single space.

    Every constructor argument is stored under its own name so that `get_params`,
    `set_params`, `clone` and `repr` work. The helper objects (stopword set, stemmer,
    parser) are built once in `__init__`; they are not rebuilt by `set_params`.
    """

    def __init__(self, language='english', minWordLen=2, lemmatize=True, stem=False, stripSpeChar=True, stripUrls=True, stripStopwords=True,
                 text2Seq=False):

        # constructor arguments, stored unchanged under their own names (scikit-learn estimator contract)
        self.language = language
        self.minWordLen = minWordLen # tokens of this length or shorter are deleted
        self.lemmatize = lemmatize
        self.stem = stem
        self.stripSpeChar = stripSpeChar # boolean to delete special characters
        self.stripUrls = stripUrls # boolean to delete URLs
        self.stripStopwords = stripStopwords # boolean to delete stopwords
        self.text2Seq = text2Seq # boolean to yield token lists instead of strings

        nltk.download('stopwords', quiet=True) # download stopwords
        self.stopwords = set(stopwords.words(language)) # select the language of the stopwords

        self.numAndChar = r'[^a-zA-Z0-9\s]' # RegEx matching everything except letters, numbers and whitespace
        self.urlRegex = re.compile(r'http\S+') # RegEx to delete URLs

        self.tweetTokenizer = TweetTokenizer() # tokenizer instantiation
        self.parser = en_core_web_sm.load() # parser instantiation

        # `stemmer` / `lemmatizer` are either a usable object or False, so they double as on/off switches
        self.stemmer = nltk.stem.SnowballStemmer(language=language) if stem else False
        # a bound method is used instead of a lambda so that the instance can be pickled
        self.lemmatizer = self._lemmatizeToken if lemmatize else False

    @property
    def _text2Seq(self):
        # backward-compatible alias of `text2Seq`
        return self.text2Seq

    @_text2Seq.setter
    def _text2Seq(self, value):
        self.text2Seq = value

    def _lemmatizeToken(self, word):
        # run the parser on a single token and join the lemmas of whatever pieces it produces
        return " ".join([token.lemma_ for token in self.parser(word)])

    def textProcessor(self, text):
        """Normalize one tweet and return its list of tokens."""

        # Lower-casing is done unconditionally: the stopword list is lower-case, and the text has to be
        # defined for the tokenizer even when `stripSpeChar` is False (previously that case crashed).
        text = text.lower()

        if self.stripSpeChar: # delete special characters
            text = re.sub(self.numAndChar, '', text)

        tokens = self.tweetTokenizer.tokenize(text) # tweet tokenizer

        if self.stripUrls: # delete URLs
            tokens = [token for token in tokens if not self.urlRegex.match(token)]

        tokens = [token for token in tokens if len(token) > self.minWordLen] # keep only words longer than `minWordLen`

        # NOTE(review): with stripSpeChar=True accented letters were already deleted by the RegEx above,
        # so this step only has an effect when stripSpeChar=False - confirm this order is intended.
        tokens = [unidecode.unidecode(token) for token in tokens] # replace diacritical marks

        if self.stripStopwords: # delete stopwords
            tokens = [token for token in tokens if token not in self.stopwords]

        if self.lemmatizer: # lemmatization
            tokens = [self.lemmatizer(token) for token in tokens]
        if self.stemmer: # stemming
            tokens = [self.stemmer.stem(token) for token in tokens]

        return tokens

    def fit(self, X, y=None):
        # nothing is learned from the data
        return self

    def transform(self, X):
        """Lazily yield one normalized document per element of X (a token list if `text2Seq`, else a string)."""
        for doc in X:
            tokens = self.textProcessor(text=doc)
            yield tokens if self.text2Seq else ' '.join(tokens)

class PadSeqTransf(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):

    """
    Transformer that turns an iterable of sequences (one per sample) into a single padded array
    by delegating to Keras' `pad_sequences`.

    Parameters
      max_len : forwarded to `pad_sequences` as `maxlen`.
      padding : forwarded to `pad_sequences` as `padding`.
    """

    def __init__(self, max_len=None, padding="pre"):

        self.padding = padding
        self.max_len = max_len

    def fit(self, X, y=None):
        # there is nothing to fit
        return self

    def transform(self, X, y=None):
        sequences = list(X) # materialize the iterable before handing it to Keras
        padded = pad_sequences(sequences, maxlen=self.max_len, padding=self.padding)
        return padded

## Models

In [None]:
# Constants
maxSeqLen, emdeddingsize, testSize, nClasses, table, nWords, activation, outputName = 50, 100, 0.2, data['inbound'].nunique(), PrettyTable(), 10000, "softmax", "output"
loss, optimizer, metric, mask_zero, dropout, batchSize, epochs = 'sparse_categorical_crossentropy', 'adam', "accuracy", True, 0.2, 124, 3
nRuns = 10 # number of times both models are trained from scratch and evaluated

# Shared Data Split
# One stratified split of row positions is used by both models, so that their accuracies are
# measured on exactly the same test tweets.
DLtrain, DLtest = train_test_split(range(len(data)),test_size=testSize,stratify=data['inbound'])

# Machine Learning Pipeline and Data Split
MLnormalizer, MLvectorizer, MLfeaturizer, MLestimator = Normalizer(), TfidfVectorizer(), LatentDirichletAllocation(), DecisionTreeClassifier()
MLpipeline = Pipeline(steps=[('normalizer', MLnormalizer),('vectorizer', MLvectorizer),('featurizer', MLfeaturizer),('estimator', MLestimator)])
MLX_train, MLX_test = data['text'].iloc[DLtrain], data['text'].iloc[DLtest]
MLy_train, MLy_test = data['inbound'].iloc[DLtrain], data['inbound'].iloc[DLtest]

# Deep Learning Pipeline and Data Split
DLnormalizer = Normalizer(text2Seq=True)
DLnormalizer, DLtokenizer = list(DLnormalizer.transform(data["text"])), Tokenizer(num_words=nWords) # from here on `DLnormalizer` holds the tokenized tweets
DLtokenizer.fit_on_texts(DLnormalizer)
inputTweets = DLtokenizer.texts_to_sequences(DLnormalizer)
vocabSize, padder = DLtokenizer.num_words + 1, PadSeqTransf(max_len=maxSeqLen)
inputTweetsPadded = padder.transform(inputTweets)
inputTweets =  np.array(inputTweetsPadded).astype('int32')

label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(data["inbound"])
nClasses2 = len(label_encoder.classes_)
DLlabels = np.array(label_encoder.transform(data["inbound"])) # encoded once, then selected by position
DLX_train, DLy_train = inputTweets[DLtrain], DLlabels[DLtrain]
DLX_test, DLy_test = inputTweets[DLtest], DLlabels[DLtest]

def buildDLmodel():
    # Return a freshly initialized, compiled Embedding -> SpatialDropout1D -> LSTM -> Dense network.
    model = Sequential([Embedding(input_dim=vocabSize,input_length=maxSeqLen,output_dim=emdeddingsize,mask_zero=mask_zero),SpatialDropout1D(dropout),
                        LSTM(emdeddingsize),Dense(nClasses2, activation=activation, name=outputName)])
    model.compile(loss=loss,optimizer=optimizer,metrics=[metric])
    return model

# Prediction and Evaluation
table.field_names = ['Machine Learning Accuracy', 'Deep Learning Accuracy', 'Which one is better?']
table.title = 'Model Comparison'

for _ in range(nRuns):

    MLmodel = MLpipeline.fit(X=MLX_train, y=MLy_train)
    MLpreds = MLmodel.predict(MLX_test)
    MLacc = round(accuracy_score(MLy_test, MLpreds)*100,2)

    # The network is rebuilt on every run. Re-fitting one model object would continue training from the
    # previous run's weights, so later rows would reflect more epochs than earlier ones, whereas the
    # Machine Learning pipeline is refit from scratch each time.
    tf.keras.backend.clear_session() # release the previous run's model
    DLmodel = buildDLmodel()
    DLmodelFit = DLmodel.fit(DLX_train,DLy_train,batch_size=batchSize,epochs=epochs,validation_data=(DLX_test, DLy_test),verbose=0)
    DLpreds = DLmodel.predict(DLX_test).argmax(axis=-1)
    DLacc = round(accuracy_score(DLy_test, DLpreds)*100,2)

    statement = "Machine Learning" if MLacc > DLacc else "Deep Learning" # ties are attributed to Deep Learning

    table.add_row([f"{MLacc}%",f"{DLacc}%",statement])
print(table)

+---------------------------------------------------------------------------+
|                              Model Comparison                             |
+---------------------------+------------------------+----------------------+
| Machine Learning Accuracy | Deep Learning Accuracy | Which one is better? |
+---------------------------+------------------------+----------------------+
|           60.15%          |         96.5%          |    Deep Learning     |
|           61.15%          |         96.6%          |    Deep Learning     |
|           58.2%           |         96.35%         |    Deep Learning     |
|           58.65%          |         96.4%          |    Deep Learning     |
|           58.15%          |         96.1%          |    Deep Learning     |
|           63.8%           |         96.05%         |    Deep Learning     |
|           65.15%          |         96.05%         |    Deep Learning     |
|           60.85%          |         95.95%         |    Deep L