In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

In [2]:
# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# Load the tweets, swap the numeric sentiment codes for readable labels,
# and discard every row that still contains a missing value afterwards
# (codes outside the mapping become NaN and are dropped as well).
sentiment_names = {0: 'Neutral', -1: 'Negative', 1: 'Positive'}
df = (
    pd.read_csv('Twitter_Data.csv')
    .assign(category=lambda frame: frame['category'].map(sentiment_names))
    .dropna()
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Text cleaning function
def clean_text(text):
    """Return `text` reduced to ASCII letters and whitespace, lower-cased.

    Every character that is neither a-z/A-Z nor whitespace (digits,
    punctuation, accented letters, emoji, ...) is deleted outright, i.e.
    it is not replaced by a space. Expects `text` to be a string.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower()

In [5]:
# Normalise every tweet with clean_text (letters and whitespace only, lower-cased)
df['clean_text'] = df['clean_text'].apply(clean_text)

# Number of whitespace-separated tokens in each cleaned tweet
df['sentence_length'] = df['clean_text'].str.split().str.len()

# Target (dependent variable): the sentiment label of each tweet.
# The features X (independent variable) are the padded token sequences built below.
y = df['category']

# Tokenization: learn the vocabulary from the cleaned tweets and turn each
# tweet into a list of integer word indices
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])

# Padding: padding='pre' puts the padding in front of each sequence; no maxlen is passed
X = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='pre')

In [6]:
# Encode the sentiment labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])

# Assemble the LSTM classifier layer by layer (it is compiled separately,
# right before training). The embedding table has one row more than the
# tokenizer's vocabulary -- presumably to leave room for the padding index 0.
vocab_size = len(tokenizer.word_index) + 1
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64))
model.add(tf.keras.layers.LSTM(64))
# Three output units with softmax: one probability per sentiment class
model.add(tf.keras.layers.Dense(3, activation='softmax'))

In [None]:
# Tunable settings for the split and the training run
TEST_FRACTION = 0.2
SPLIT_SEED = 42
EPOCHS = 10
BATCH_SIZE = 64

# The targets are integer class ids, hence the sparse categorical cross-entropy loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split the data into train and test. Stratifying on the labels keeps the
# class proportions the same in both parts, so the per-class metrics
# computed on the test split are representative.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=y
)

# Train the model
# NOTE(review): the test split doubles as validation data here. That is harmless
# as long as nothing (early stopping, checkpoint selection, tuning) is driven by
# the validation metrics; otherwise carve a separate validation set out of the
# training data.
model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test),
    verbose=2,
)

Epoch 1/10


In [7]:
# Persist the trained network
model.save('sentiment_model.h5')

# Pickle the fitted preprocessing objects next to it so that new text can be
# encoded, and predictions decoded, the same way later on
for path, artifact in (('tokenizer.pkl', tokenizer), ('label_encoder.pkl', label_encoder)):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)

In [8]:
# Predict class probabilities for the test split and keep the most likely class per row
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Take the class names and ids from the fitted encoder instead of hard-coding
# them: the report rows then always match the encoder's label order, and the
# report still works if a class happens to be absent from the test split.
class_names = list(label_encoder.classes_)
class_ids = np.arange(len(class_names))

# Measure performance metrics and accuracy
accuracy = accuracy_score(y_test, y_pred_classes)
classification_rep = classification_report(
    y_test, y_pred_classes, labels=class_ids, target_names=class_names
)

print("Model Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)

# Save evaluation metrics
with open('metrics.txt', 'w') as f:
    f.write(f"Model Accuracy: {accuracy}\n")
    f.write("Classification Report:\n")
    f.write(classification_rep)
