<a href="https://colab.research.google.com/github/ZhiyaoShu/Kaggle-Lists/blob/main/tweet_sentiment_extraction_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)



## Intro: A practice project on social media sentiment extraction

In [None]:
import tensorflow as tf
# Print the installed TensorFlow version so the results below can be tied to a runtime
print(tf.__version__)

2.14.0


In [None]:
# Load the competition CSVs. The directory is hard-coded to this Colab session's
# upload location; change DATA_DIR when running anywhere else.
DATA_DIR = "/content/sample_data/0.kaggel"
test_data = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

# Only the final expression of a cell is rendered automatically, so the test
# preview has to be displayed explicitly or it is silently discarded.
display(test_data.head())
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
# Check basic information: per-column summary of the training data
# (count / unique / top / freq, as shown in the output below)
train_data.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [None]:
# Same per-column summary for the test data
test_data.describe()

Unnamed: 0,textID,text,sentiment
count,3534,3534,3534
unique,3534,3534,3
top,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
freq,1,1,1430


In [None]:
# Count the null values in each column of the training data
train_data.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [None]:
# Drop every training row that contains a null value.
# inplace=True mutates train_data itself, so the raw frame is no longer available afterwards.
train_data.dropna(inplace=True)

In [None]:
# Count the null values in each column of the test data
test_data.isna().sum()

textID       0
text         0
sentiment    0
dtype: int64

In [None]:
# Drop every test row that contains a null value (in place).
# The null check above reports zero nulls, so this removes nothing for the data shown.
test_data.dropna(inplace=True)

In [None]:
# Count how many training rows carry each sentiment label
train_data['sentiment'].value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [None]:
# Analyze text length: store each tweet's character count in a new column,
# then show the mean length per sentiment class.
train_data['text_length'] = train_data['text'].map(len)
train_data.groupby('sentiment')['text_length'].mean()

sentiment
negative    70.488112
neutral     65.206800
positive    70.419133
Name: text_length, dtype: float64

In [None]:
# Analyze selected-text length: store the character count of each selected span,
# then show the mean length per sentiment class.
train_data['selected_text_length'] = train_data['selected_text'].map(len)
train_data.groupby('sentiment')['selected_text_length'].mean()


sentiment
negative    19.970698
neutral     62.765134
positive    18.124680
Name: selected_text_length, dtype: float64

In [None]:
# ANOVA results analysis
from scipy import stats

# One-way ANOVA on tweet length across the three sentiment groups.
# The groups are gathered in the order positive, negative, neutral.
length_samples = [
    train_data.loc[train_data['sentiment'] == label, 'text_length']
    for label in ('positive', 'negative', 'neutral')
]
f_val, p_val = stats.f_oneway(*length_samples)

print("ANOVA Test Results:")
print(f"F-statistic: {f_val}")
print(f"P-value: {p_val}")

# Interpret the results at the 5% significance level
alpha = 0.05
verdict = (
    "The means of at least two groups are significantly different."
    if p_val < alpha
    else "There is no significant difference in the means of the groups."
)
print(verdict)

ANOVA Test Results:
F-statistic: 72.2127709711816
P-value: 5.254438748898152e-32
The means of at least two groups are significantly different.


## Data Processing

In [None]:
import re
import string

def clean_text(text):
  """Lower-case a tweet and expand a handful of English contractions.

  The tweets in this dataset write apostrophes as backticks (e.g. "couldn`t"),
  so backticks and typographic apostrophes are first normalized to a plain
  "'"; without that step none of the contraction patterns below would match.

  Args:
    text: The raw tweet as a string.

  Returns:
    The cleaned, lower-cased string. Replacements insert padding spaces, so the
    result may contain doubled or trailing spaces.
  """
  text = text.lower()
  # Normalize apostrophe variants (backtick, right single quotation mark) to "'".
  text = re.sub("[`\u2019]", "'", text)
  # Order matters: specific forms are expanded before the generic suffix rules.
  text = re.sub(r"what's", "what is ", text)
  text = re.sub(r"\'s", " ", text)
  text = re.sub(r"\'ve", " have ", text)
  text = re.sub(r"can't", "cannot ", text)
  text = re.sub(r"n't", " not ", text)
  return text

# Clean the tweet text of both splits with the same function so that
# training and test inputs are normalized identically.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(clean_text)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize the text
# Vocabulary cap passed to the tokenizer. It must stay in sync with the
# Embedding `input_dim` (500) used by the model builders.
VOCAB_SIZE = 500
token = Tokenizer(num_words = VOCAB_SIZE)
# The vocabulary is learned from the training text only
token.fit_on_texts(train_data['text'])

# Convert texts to sequence of integers
train_sequences = token.texts_to_sequences(train_data['text'])
test_sequences = token.texts_to_sequences(test_data['text'])

# Convert labels to categorical one-hot encoding.
# Both splits are encoded against ONE shared, sorted class list so that column i
# means the same sentiment in train and test, even if a split is missing a class.
sentiment_classes = sorted(train_data['sentiment'].unique())
train_labels = pd.get_dummies(
    pd.Categorical(train_data['sentiment'], categories=sentiment_classes)
).values
test_labels = pd.get_dummies(
    pd.Categorical(test_data['sentiment'], categories=sentiment_classes)
).values


In [None]:
# Vectorize the text
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad (or truncate) every sequence at its end to the length of the longest
# training sequence, using identical settings for both splits.
max_len = max(map(len, train_sequences))
pad_kwargs = dict(maxlen = max_len, padding = "post", truncating = "post")
train_padded = pad_sequences(train_sequences, **pad_kwargs)
test_padded = pad_sequences(test_sequences, **pad_kwargs)

# Report the sequence length and the resulting array shapes
for value in (max_len, train_padded.shape, test_padded.shape):
    print(value)


32
(27480, 32)
(3534, 32)


## Build the model

In [None]:
# Run configuration.
# `args` is not defined anywhere in this notebook, so reading it directly raises
# NameError on a fresh kernel. Use it when something has injected it, otherwise
# fall back to fixed defaults.
try:
    SEED = args.seed
    EPOCHS = args.epochs
except NameError:
    SEED = 42
    EPOCHS = 10  # same epoch budget as the hard-coded fit() calls

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialization and dropout are repeatable across runs.
tf.keras.utils.set_random_seed(SEED)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
def lstm_model(max_len, vocab_size=500):
  """Build an uncompiled stacked-LSTM classifier for the three sentiment classes.

  Architecture: Embedding -> LSTM(64, full sequence) -> Dropout -> LSTM(32)
  -> Dropout -> Dense(3, softmax).

  Args:
    max_len: Length of the padded input sequences.
    vocab_size: Size of the embedding vocabulary; must be greater than the
      largest token index fed to the model. Defaults to 500, the value the
      original code hard-coded.

  Returns:
    A `Sequential` model. The caller is responsible for compiling it.
  """
  return Sequential([
      # Map token indices to dense 32-dimensional vectors
      Embedding(input_dim = vocab_size, output_dim = 32, input_length = max_len),
      # First recurrent layer returns the whole sequence so another LSTM can follow
      LSTM(64, return_sequences=True),
      Dropout(0.5),
      # Second recurrent layer keeps only its final state
      LSTM(32),
      Dropout(0.5),
      # The classes are mutually exclusive and the notebook trains with
      # categorical cross-entropy, so the head must emit a probability
      # distribution: softmax rather than independent sigmoids.
      Dense(3, activation='softmax'),
  ])

def gru_model(max_len, vocab_size=500):
  """Build an uncompiled stacked-GRU classifier for the three sentiment classes.

  Architecture: Embedding -> GRU(128, full sequence) -> Dropout -> GRU(32)
  -> Dropout -> Dense(3, softmax).

  Args:
    max_len: Length of the padded input sequences.
    vocab_size: Size of the embedding vocabulary; must be greater than the
      largest token index fed to the model. Defaults to 500, the value the
      original code hard-coded.

  Returns:
    A `Sequential` model. The caller is responsible for compiling it.
  """
  return Sequential([
      # Map token indices to dense 32-dimensional vectors
      Embedding(input_dim = vocab_size, output_dim = 32, input_length = max_len),
      # First recurrent layer returns the whole sequence so another GRU can follow
      GRU(128, return_sequences=True),
      Dropout(0.5),
      # Second recurrent layer keeps only its final state
      GRU(32),
      Dropout(0.5),
      # The classes are mutually exclusive and the notebook trains with
      # categorical cross-entropy, so the head must emit a probability
      # distribution: softmax rather than independent sigmoids.
      Dense(3, activation='softmax'),
  ])

In [None]:
# Build the LSTM model and show its layer summary
lstm = lstm_model(max_len)
lstm.summary()
# Compile the model.
# Only `lstm` is compiled here: the original cell also built and compiled an
# anonymous gru_model(max_len) whose result was discarded immediately.
lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_28 (Embedding)    (None, 32, 32)            16000     
                                                                 
 lstm_40 (LSTM)              (None, 32, 64)            24832     
                                                                 
 dropout_56 (Dropout)        (None, 32, 64)            0         
                                                                 
 lstm_41 (LSTM)              (None, 32)                12416     
                                                                 
 dropout_57 (Dropout)        (None, 32)                0         
                                                                 
 dense_27 (Dense)            (None, 3)                 99        
                                                                 
Total params: 53347 (208.39 KB)
Trainable params: 533

In [None]:
# Build the GRU model and show its layer summary
gru = gru_model(max_len)
gru.summary()
# Compile the model with the same optimizer, loss and metric as the LSTM
gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "sequential_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_31 (Embedding)    (None, 32, 32)            16000     
                                                                 
 gru_22 (GRU)                (None, 32, 128)           62208     
                                                                 
 dropout_62 (Dropout)        (None, 32, 128)           0         
                                                                 
 gru_23 (GRU)                (None, 32)                15552     
                                                                 
 dropout_63 (Dropout)        (None, 32)                0         
                                                                 
 dense_30 (Dense)            (None, 3)                 99        
                                                                 
Total params: 93859 (366.64 KB)
Trainable params: 938

## Train the models

In [None]:
# Train the models with early stopping.
# patience=3: stop after 3 epochs without improvement of the monitored quantity
# (no `monitor` is passed, so the Keras default applies; val_loss per the Keras docs).
# restore_best_weights=True: roll the model back to its best epoch when stopping.
# This single callback instance is passed to both fit() calls.
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

In [None]:
# Train the LSTM for at most 10 epochs, holding out 20% of the training arrays
# for validation; the returned History object is kept for later inspection.
lstm_history = lstm.fit(train_padded, train_labels, epochs=10, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [None]:
# Train the GRU with the same settings as the LSTM (at most 10 epochs,
# 20% validation hold-out, shared early-stopping callback).
gru_history= gru.fit(train_padded, train_labels, epochs=10, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on test data.
# Each result is labelled with its model name; the original printed both under
# the identical text "Test accuracy:", so the two lines could not be told apart.
# After the loop test_loss / test_acc hold the GRU values, as they did before.
for model_name, model in (("LSTM", lstm), ("GRU", gru)):
    test_loss, test_acc = model.evaluate(test_padded, test_labels)
    print(f"{model_name} test accuracy:", test_acc)

Test accuracy: 0.6723259687423706
Test accuracy: 0.6726089119911194


In [None]:
# Model accuracy: best validation accuracy reached in any epoch of each run.
# NOTE(review): this is the maximum over epochs, which is not necessarily the epoch
# whose weights early stopping restored (that depends on its monitored quantity).
lstm_val_acc = max(lstm_history.history['val_accuracy'])
gru_val_acc = max(gru_history.history['val_accuracy'])

print("LSTM validation accuracy:", lstm_val_acc)
print("GRU validation accuracy:", gru_val_acc)

LSTM validation accuracy: 0.6819505095481873
GRU validation accuracy: 0.6768559217453003


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no visible cell reads from or writes to /content/drive; confirm
# whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Classification report (per-class precision / recall / F1) on the test set.
# NOTE: confusion_matrix is imported below but never called in this cell.
from sklearn.metrics import classification_report, confusion_matrix
lstm_pred = lstm.predict(test_padded)
gru_pred = gru.predict(test_padded)

# argmax converts the one-hot labels and the per-class scores into class indices.
# The first report is the LSTM and the second is the GRU; the output is not labelled.
# Class indices follow the pd.get_dummies column order, presumably alphabetical
# (negative, neutral, positive); confirm before interpreting.
print(classification_report(test_labels.argmax(axis=1), lstm_pred.argmax(axis=1)))
print(classification_report(test_labels.argmax(axis=1), gru_pred.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.68      0.54      0.60      1001
           1       0.59      0.74      0.66      1430
           2       0.78      0.67      0.72      1103

    accuracy                           0.66      3534
   macro avg       0.69      0.65      0.66      3534
weighted avg       0.68      0.66      0.66      3534

              precision    recall  f1-score   support

           0       0.73      0.50      0.59      1001
           1       0.58      0.81      0.68      1430
           2       0.84      0.65      0.73      1103

    accuracy                           0.67      3534
   macro avg       0.72      0.65      0.67      3534
weighted avg       0.70      0.67      0.67      3534



In [None]:
from datetime import datetime

def save_model(model, prefix =''):
  """Save a Keras model under a timestamped filename and report where it went.

  The file is named "<prefix>model_<YYYY-MM-DD-HHMMSS>.keras". The original code
  used a ".csv" suffix, which mislabels the saved artifact: a serialized model
  is not CSV data.

  Args:
    model: Any object exposing `save(path)`, e.g. a Keras model.
    prefix: Optional text prepended to the filename, e.g. the model type.
  """
  # The timestamp keeps repeated saves from overwriting each other
  current_time = datetime.now().strftime("%Y-%m-%d-%H%M%S")
  filename = f"{prefix}model_{current_time}.keras"
  # Save the model
  model.save(filename)
  print(f"Model saved to {filename}")

# Persist both trained models; the prefix records the model type in the filename
save_model(lstm, prefix = "lstm")
save_model(gru, prefix = "gru")

Model saved to lstmmodel_2023-12-09-072737.csv
Model saved to grumodel_2023-12-09-072745.csv
