# Sentiment Analysis and Classification of Twitter Data (ResNet and LSTM)
Andrés Ocabo (andoc277)


# Read the data

In [1]:
! unzip drive/MyDrive/project_text/data2.zip #Unzip the file

Archive:  drive/MyDrive/project_text/data2.zip
  inflating: Reddit_Data.csv         
  inflating: Twitter_Data.csv        


In [15]:
import pandas as pd
import numpy as np
import csv

df = pd.read_csv('Twitter_Data.csv')

#This line is for using the Reddit data file
#df = df.rename(columns = {'clean_comment': 'clean_text'}, inplace = False)

#Remove all samples with null in class or document
df = df[pd.notnull(df.clean_text) & pd.notnull(df.category)]

#Show the first rows of the data frame
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


## Text processing
Here, the dataset is divided anf the k-fold manager funtion too. Also here we have the 3 different processing of the data for extracting the features that will be used during the training and testing the models.

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

seed = 4321
x_train_data, x_test_data, y_train, y_test = train_test_split(df['clean_text'],df['category'],test_size=0.25, 
                                                    random_state = seed)
x_train_data = x_train_data.reset_index()['clean_text']
x_test_data = x_test_data.reset_index()['clean_text']
y_train = y_train.reset_index()['category']
y_test = y_test.reset_index()['category']

k = 5
kfold = KFold(n_splits=k, shuffle = True, random_state = seed)

### Processing with Keras
text_to_sequence: https://www.kdnuggets.com/2020/03/tensorflow-keras-tokenization-text-data-prep.html

In [17]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_fatures = 1000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(x_train_data.values)

X1_train = tokenizer.texts_to_sequences(x_train_data.values)
#X1_train = tokenizer.texts_to_matrix(x_train_data.values, mode = 'tfidf')
#print(type(X1_train))
X1_test = tokenizer.texts_to_sequences(x_test_data.values)
#X1_test = tokenizer.texts_to_matrix(x_test_data.values, mode = 'tfidf')


n_test = len(X1_test)
n_train = len(X1_train)
X = X1_train + X1_test

X = pad_sequences(X)
x_train = X[:n_train,:]
x_test = X[-n_test:,:]

#x_train = pad_sequences(X1_train)
#x_test = pad_sequences(X1_test)
y_train = pd.get_dummies(y_train).values
y_test = pd.get_dummies(y_test).values
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(122226, 47) (122226, 3)
(40743, 47) (40743, 3)


### Processing with Sklearn

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import spacy
from nltk.corpus import stopwords
# The data in english is loaded and "tagger, "parser", "ner" and "textcat" are removed from the pipeline.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner","textcat"])
stop_words = stopwords.words('english')
#Preprocess function is defined
def preprocess(text):
    doc = nlp(text)
    out = [word.lemma_ for word in doc if not(word.lemma_ in stop_words) and nlp(word.lemma_)[0].is_alpha]
    #out = [word.lemma_ for word in doc if not(word.is_stop) and nlp(word.lemma_)[0].is_alpha]
    #out = [word.lemma_ for word in doc if nlp(word.lemma_)[0].is_alpha]
    #out = [word.lower_ for word in doc]
    
    return out

#### Tfidf-vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer = preprocess, max_features = 1000, ngram_range = (1,2))
x_train = vectorizer.fit_transform(x_train_data) #It takes less than 1 min
x_test = vectorizer.transform(x_test_data) # It takes less than 30 sec

x_train = x_train.toarray()
x_test = x_test.toarray()
y_train = pd.get_dummies(y_train).values
y_test = pd.get_dummies(y_test).values

print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(122226, 1000) (122226, 3)
(40743, 1000) (40743, 3)


#### Count-vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer = preprocess, max_features = 1000, ngram_range = (1,2))
x_train = vectorizer.fit_transform(x_train_data) #It takes less than 1 min
x_test = vectorizer.transform(x_test_data) # It takes less than 30 sec

x_train = x_train.toarray()
x_test = x_test.toarray()
y_train = pd.get_dummies(y_train).values
y_test = pd.get_dummies(y_test).values

print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(122226, 1000) (122226, 3)
(40743, 1000) (40743, 3)


# Training and Evaluation

In [5]:
# Class defined to save and show the results during the k-fold cross validation
class Results:
    def __init__(self, k):
        self.results = {'Negative':{'precision': 0, 'recall': 0, 'f1-score': 0},
           'Neutral':{'precision': 0, 'recall': 0, 'f1-score': 0},
           'Positive':{'precision': 0, 'recall': 0, 'f1-score': 0},
           'Accuracy':0
          }
        self.k = k
    def save(self, report):
        #Calculation and esults
        self.results['Negative']['precision'] = self.results['Negative']['precision'] + report['0']['precision']/self.k
        self.results['Neutral']['precision'] = self.results['Neutral']['precision'] + report['1']['precision']/self.k
        self.results['Positive']['precision'] = self.results['Positive']['precision'] + report['2']['precision']/self.k

        self.results['Negative']['recall'] = self.results['Negative']['recall'] + report['0']['recall']/self.k
        self.results['Neutral']['recall'] = self.results['Neutral']['recall'] + report['1']['recall']/self.k
        self.results['Positive']['recall'] = self.results['Positive']['recall'] + report['2']['recall']/self.k

        self.results['Negative']['f1-score'] = self.results['Negative']['f1-score'] + report['0']['f1-score']/self.k
        self.results['Neutral']['f1-score'] = self.results['Neutral']['f1-score'] + report['1']['f1-score']/self.k
        self.results['Positive']['f1-score'] = self.results['Positive']['f1-score'] + report['2']['f1-score']/self.k

        self.results['Accuracy'] = self.results['Accuracy'] + report['accuracy']/self.k
        
        return self.results
    
    def show(self):
        import pprint
        pprint.pprint(self.results)

## LSTM

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

def get_lstm(input_len, show = False):
  embed_dim = 150
  lstm_out = 200
  max_fatures = 1000
  model = Sequential()
  model.add(Embedding(max_fatures, embed_dim,input_length = input_len))
  model.add(LSTM(lstm_out, dropout=0.2))
  model.add(Dense(3,activation='softmax'))
  model.compile(loss = 'categorical_crossentropy', optimizer='nadam',metrics = ['accuracy'])
  if show:
    print(model.summary())
  return model


In [8]:
# Plotting a summary of the model
model = get_lstm(input_len = x_train.shape[1], show = True)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 47, 150)           150000    
_________________________________________________________________
lstm (LSTM)                  (None, 200)               280800    
_________________________________________________________________
dense (Dense)                (None, 3)                 603       
Total params: 431,403
Trainable params: 431,403
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
from sklearn.metrics import classification_report
results = Results(k)
i = 0
for train, val in kfold.split(x_train):
  i = i + 1
  print('Fold ', i, '/', k)
  X_train, Y_train = x_train[train], y_train[train]
  X_val, Y_val = x_train[val], y_train[val]
  
    
  model = get_lstm(input_len = X_train.shape[1])
  batch_size = 32
  print('Training')
  model.fit(X_train, Y_train, epochs = 3, batch_size=batch_size, verbose = 1)
  print('Validation')
  Y_pred = model.predict_classes(X_val, verbose = 1)

  Y_val = np.argmax(Y_val, axis=1)
  report = classification_report(Y_val, Y_pred, output_dict= True)
  results.save(report)
results.show()

Fold  1 / 5
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation




Fold  2 / 5
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation
Fold  3 / 5
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation
Fold  4 / 5
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation
Fold  5 / 5
Training
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation
{'Accuracy': 0.8399931239616182,
 'Negative': {'f1-score': 0.7331127416913231,
              'precision': 0.8685614297279498,
              'recall': 0.6343457982124983},
 'Neutral': {'f1-score': 0.8548322198785359,
             'precision': 0.7582181261267271,
             'recall': 0.9797135641738867},
 'Positive': {'f1-score': 0.8737140313411055,
              'precision': 0.9181581025336456,
              'recall': 0.8334397226283516}}


In [10]:
batch_size = 32
model = get_lstm(input_len = X_train.shape[1], show = False)
model.fit(x_train, y_train, epochs = 3, batch_size=batch_size, verbose = 1)

print('TRAIN RESULTS:')
results = Results(1)
y_pred = model.predict_classes(x_train, verbose = 1)
y_train = np.argmax(y_train, axis=1)
report = classification_report(y_train, y_pred, output_dict= True)
results.save(report)
results.show()

print('TEST RESULTS:')
results = Results(1)
y_pred = model.predict_classes(x_test, verbose = 1)
y_test = np.argmax(y_test, axis=1)
report = classification_report(y_test, y_pred, output_dict= True)
results.save(report)
results.show()

Epoch 1/3
Epoch 2/3
Epoch 3/3
TRAIN RESULTS:




{'Accuracy': 0.8440675470030926,
 'Negative': {'f1-score': 0.7446734799930712,
              'precision': 0.8740469655382739,
              'recall': 0.6486608826857789},
 'Neutral': {'f1-score': 0.8562692412406543,
             'precision': 0.7578639085061818,
             'recall': 0.9840429372548076},
 'Positive': {'f1-score': 0.8773961272744966,
              'precision': 0.9276176367713928,
              'recall': 0.8323333394871416}}
TEST RESULTS:
{'Accuracy': 0.8372726603342906,
 'Negative': {'f1-score': 0.7373460139145976,
              'precision': 0.8662267546490702,
              'recall': 0.641849094343816},
 'Neutral': {'f1-score': 0.8497222923504166,
             'precision': 0.7468656385221347,
             'recall': 0.985434050651442},
 'Positive': {'f1-score': 0.8715830183703964,
              'precision': 0.9267867156832201,
              'recall': 0.8225859971242119}}


## ResNet
explicacion resnet: https://towardsdatascience.com/an-overview-of-resnet-and-its-variants-5281e2f56035

In [18]:
from keras.layers import Input, Conv1D, Activation, BatchNormalization, GlobalAveragePooling1D, Dense, Dropout, MaxPool1D
from keras.layers.merge import add
from keras.activations import relu, softmax
from keras.models import Model
from keras import regularizers


def block(n_output, upscale=False):
    # n_output: number of feature maps in the block
    # upscale: should we use the 1x1 conv2d mapping for shortcut or not
    
    # keras functional api: return the function of type
    # Tensor -> Tensor
    def f(x):
        
        
        # first convolution
        h = Conv1D(kernel_size=3, filters=n_output, strides=1, padding='same', kernel_regularizer=regularizers.l2(0.01))(x)
        h = BatchNormalization()(h)
        h = Activation(softmax)(h)
        
        # second convolution
        h = Conv1D(kernel_size=3, filters=n_output, strides=1, padding='same', kernel_regularizer=regularizers.l2(0.01))(h)
        h = BatchNormalization()(h)
        h = Activation(softmax)(h)
        #f = Conv1D(kernel_size=1, filters=n_output, strides=1, padding='same')(x)
        # f(x):
        if upscale:
            # 1x1 conv1d
            f = Conv1D(kernel_size=1, filters=n_output, strides=1, padding='same')(x)
        else:
            # identity
            f = x
        
        return add([f, h])
    
    return f



In [19]:
def get_ResNet(input_len, show = False):
  input_tensor = Input((input_len, 1))
  x = Conv1D(kernel_size=7, filters=32, strides=1, padding='same', kernel_regularizer=regularizers.l2(0.01))(input_tensor)
  x = MaxPool1D(pool_size=2)(x)
  x = block(32)(x)
  x = block(32)(x)
  x = block(32)(x)
  x = block(64, upscale=True)(x)
  x = block(64)(x)
  x = block(64)(x)
  x = block(128, upscale=True)(x)
  x = block(128)(x)
  x = block(128)(x)
  x = block(512, upscale=True)(x)
  x = block(512)(x)
  x = GlobalAveragePooling1D()(x)
  x = Dropout(0.2)(x)
  x = Dense(3)(x)
  x = Activation(softmax)(x)
  model = Model(inputs=input_tensor, outputs=x)
  model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
  if show:
    print(model.summary())
  return model



In [20]:
model = get_ResNet(x_train.shape[1], show=True)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 47, 1)]      0                                            
__________________________________________________________________________________________________
conv1d_52 (Conv1D)              (None, 47, 32)       256         input_3[0][0]                    
__________________________________________________________________________________________________
max_pooling1d_2 (MaxPooling1D)  (None, 23, 32)       0           conv1d_52[0][0]                  
__________________________________________________________________________________________________
conv1d_53 (Conv1D)              (None, 23, 32)       3104        max_pooling1d_2[0][0]            
____________________________________________________________________________________________

In [21]:
results = Results(k)
i = 0
for train, val in kfold.split(x_train):
  i = i + 1
  print('Fold ', i, '/', k)
  X_train, Y_train = x_train[train], y_train[train]
  X_val, Y_val = x_train[val], y_train[val]
  
  model = get_ResNet(X_train.shape[1])
  batch_size = 32
  model.fit(X_train[:,:,np.newaxis], Y_train, epochs = 3, batch_size=batch_size, verbose = 1)
  Y_pred = model.predict(X_val[:,:,np.newaxis], verbose = 1)

  Y_pred = Y_pred.argmax(axis = 1)
  Y_val = Y_val.argmax(axis = 1)
  report = classification_report(Y_val, Y_pred, output_dict= True)
  results.save(report)
results.show()

Fold  1 / 5
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold  2 / 5


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold  3 / 5
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold  4 / 5
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold  5 / 5
Epoch 1/3
Epoch 2/3
Epoch 3/3
{'Accuracy': 0.5208712216742235,
 'Negative': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
 'Neutral': {'f1-score': 0.5724426058510513,
             'precision': 0.5235517524896603,
             'recall': 0.6326415476048377},
 'Positive': {'f1-score': 0.5925520107880711,
              'precision': 0.5194637039738093,
              'recall': 0.6901617954598345}}


In [None]:
from sklearn.metrics import classification_report
report = classification_report(Y_val, Y_pred, output_dict= True)
results.save(report)
results.show()

{'Accuracy': 0.52163953202978,
 'Negative': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
 'Neutral': {'f1-score': 0.5916855631141346,
             'precision': 0.5064262917277668,
             'recall': 0.711463887542414},
 'Positive': {'f1-score': 0.5821981555123107,
              'precision': 0.5353613942270287,
              'recall': 0.638015762633287}}


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
model = get_ResNet(x_train.shape[1])
batch_size = 32
model.fit(x_train[:,:,np.newaxis], y_train, epochs = 2, batch_size=batch_size, verbose = 1)

print('TRAIN RESULTS:')
results = Results(1)
y_pred = model.predict(x_train[:,:,np.newaxis], verbose = 1)
y_pred = y_pred.argmax(axis = 1)
y_train = y_train.argmax(axis = 1)
report = classification_report(y_train, y_pred, output_dict= True)
results.save(report)
results.show()


print('TEST RESULTS:')
results = Results(1)
y_pred = model.predict(x_test[:,:,np.newaxis], verbose = 1)
y_pred = y_pred.argmax(axis = 1)
y_test = y_test.argmax(axis = 1)
report = classification_report(y_test, y_pred, output_dict= True)
results.save(report)
results.show()



Epoch 1/2
Epoch 2/2
TRAIN RESULTS:
{'Accuracy': 0.5220329553450166,
 'Negative': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
 'Neutral': {'f1-score': 0.5766834257230029,
             'precision': 0.5208110992529349,
             'recall': 0.6459842595489663},
 'Positive': {'f1-score': 0.5921286581556648,
              'precision': 0.5229237102318541,
              'recall': 0.6824450311075009}}
TEST RESULTS:
   1/1274 [..............................] - ETA: 25s

  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.5212183687995484,
 'Negative': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0},
 'Neutral': {'f1-score': 0.5797214376156617,
             'precision': 0.5209172063714336,
             'recall': 0.6534914361001317},
 'Positive': {'f1-score': 0.5905100033584416,
              'precision': 0.5214370445687172,
              'recall': 0.6806769162703241}}
