<a href="https://colab.research.google.com/github/ajaypavuluri87/final-exam/blob/main/final%20exam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer # Changed import statement
from tensorflow.keras.preprocessing.sequence import pad_sequences # Changed import statement
from tensorflow.keras.models import Sequential, load_model # Changed import statement
from tensorflow.keras.layers import Dense, Embedding, LSTM # Changed import statement
from tensorflow.keras.utils import to_categorical # Changed import statement
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")


# 1. Load and preprocess the dataset
# The CSV is read from the Colab upload location; adjust the path if the file
# lives elsewhere (e.g. 'Data (1).csv' for the current directory).
data = pd.read_csv('/content/Data (1).csv')

# Keep only the two columns used below. Rows missing either value are dropped
# (string cleaning and label encoding cannot handle NaN), and `.copy()` makes
# the frame independent of the one returned by `read_csv`, so the column
# assignment below does not trigger SettingWithCopyWarning.
data = data[['text', 'sentiment']].dropna().copy()

# Cleaning steps, applied in this order:
#   1) lowercase,
#   2) remove every character that is not a letter, digit or whitespace,
#   3) replace 'rt' with a space.
# The same steps have to be applied to any text that is later passed to the
# fitted tokenizer for prediction.
# NOTE(review): step 3 is a plain substring replacement, so it also rewrites
# words that merely contain "rt" (e.g. "part" -> "pa "). It is kept unchanged
# because the prediction code uses the same substring rule; change both
# together if only the retweet marker should be removed.
data['text'] = (
    data['text']
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)  # raw string: '\s' is not a valid str escape
    .str.replace('rt', ' ', regex=False)
)

# 2. Tokenization
# `max_features` caps the vocabulary the tokenizer uses when converting texts
# to integer sequences; it is also the Embedding layer's vocabulary size.
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
# No `maxlen` is given, so the padded width (X.shape[1]) is determined by the data.
X = pad_sequences(X)

# 3. Encode target
# Sentiment strings -> integer ids -> one-hot rows. `labelencoder` is kept so
# predicted class indices can be mapped back to the original labels.
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)

# 4. Train/Test Split
# 33% of the rows are held out; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 5. Model creation function
def create_model(units=196, dropout_rate=0.2, num_classes=3, embedding_dim=128):
    """Build and compile the Embedding -> LSTM -> softmax classifier.

    Args:
        units: Number of units in the LSTM layer.
        dropout_rate: Value used for both the LSTM's `dropout` and its
            `recurrent_dropout`.
        num_classes: Width of the softmax output layer. Must equal the number
            of columns in the one-hot targets passed to `fit`. Defaults to 3,
            the value that used to be hard-coded.
        embedding_dim: Size of each word-embedding vector. Defaults to 128,
            the value that used to be hard-coded.

    Returns:
        A compiled `Sequential` model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).

    Reads the module-level `max_features` as the embedding vocabulary size.
    """
    model = Sequential()
    # `input_length` is intentionally not passed: the argument is deprecated
    # in Keras 3 and the sequence length is taken from the data at fit time.
    # Omitting it also keeps this function independent of the module-level
    # `X`, which a later cell rebinds to a pandas Series.
    model.add(Embedding(max_features, embedding_dim))
    model.add(LSTM(units, dropout=dropout_rate, recurrent_dropout=dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# 6. Train and save the model
model = create_model(units=196, dropout_rate=0.2)
model.fit(X_train, Y_train, epochs=3, batch_size=32, verbose=2)
model.save("sentiment_lstm_model.keras")
print("Model saved.")

# 7. Load and predict on new input
# The model is reloaded from disk so the prediction below exercises the saved
# file rather than the in-memory object that was just trained.
model = load_model("sentiment_lstm_model.keras")
new_text = ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing .@realDonaldTrump"]
# Clean in the same order as the training text: lowercase, strip everything
# that is not a letter/digit/whitespace, then replace 'rt' with a space.
# (Replacing 'rt' before stripping punctuation, as was done previously, gives a
# different result whenever removing punctuation joins an 'r' and a 't'.)
new_text = [
    re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()).replace('rt', ' ')
    for x in new_text
]
new_seq = tokenizer.texts_to_sequences(new_text)
# Pad/truncate to the width of the training matrix.
new_pad = pad_sequences(new_seq, maxlen=X.shape[1])
pred = model.predict(new_pad)
# One argmax per input row; a bare np.argmax(pred) would flatten the whole
# prediction matrix and is only correct when exactly one text is passed.
pred_class = labelencoder.inverse_transform(np.argmax(pred, axis=1))
print("Predicted sentiment:", pred_class[0])

Epoch 1/3
291/291 - 37s - 128ms/step - accuracy: 0.6379 - loss: 0.8363
Epoch 2/3
291/291 - 34s - 117ms/step - accuracy: 0.7062 - loss: 0.6917
Epoch 3/3
291/291 - 41s - 140ms/step - accuracy: 0.7382 - loss: 0.6221
Model saved.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 392ms/step
Predicted sentiment: Positive


In [5]:
# Try different values:
# Train a fresh model for every combination of LSTM units, dropout rate and
# batch size, score each one on the held-out split, and report the best.
from tensorflow.keras import backend as K

units_list = [128, 196]
dropouts = [0.2, 0.3]
batches = [32, 64]

results = []  # one (accuracy, units, dropout, batch) tuple per configuration

for units in units_list:
    for dropout in dropouts:
        for batch in batches:
            print(f"\nTraining with units={units}, dropout={dropout}, batch={batch}")
            # Drop Keras' global state from the previous iteration; without
            # this, building many models in a loop keeps consuming memory.
            K.clear_session()
            model = create_model(units=units, dropout_rate=dropout)
            model.fit(X_train, Y_train, epochs=2, batch_size=batch, verbose=2)
            loss, acc = model.evaluate(X_test, Y_test, verbose=0)
            print(f"Validation Accuracy: {acc:.4f}")
            results.append((acc, units, dropout, batch))

# Summarise the sweep so the best setting does not have to be read off the log.
best_acc, best_units, best_dropout, best_batch = max(results, key=lambda r: r[0])
print(
    f"\nBest configuration: units={best_units}, dropout={best_dropout}, "
    f"batch={best_batch} (accuracy={best_acc:.4f})"
)



Training with units=128, dropout=0.2, batch=32
Epoch 1/2
291/291 - 29s - 100ms/step - accuracy: 0.6363 - loss: 0.8347
Epoch 2/2
291/291 - 25s - 86ms/step - accuracy: 0.7058 - loss: 0.6878
Validation Accuracy: 0.6741

Training with units=128, dropout=0.2, batch=64
Epoch 1/2
146/146 - 17s - 115ms/step - accuracy: 0.6321 - loss: 0.8488
Epoch 2/2
146/146 - 20s - 138ms/step - accuracy: 0.7005 - loss: 0.6983
Validation Accuracy: 0.6697

Training with units=128, dropout=0.3, batch=32
Epoch 1/2
291/291 - 29s - 99ms/step - accuracy: 0.6368 - loss: 0.8418
Epoch 2/2
291/291 - 25s - 87ms/step - accuracy: 0.7063 - loss: 0.6964
Validation Accuracy: 0.6684

Training with units=128, dropout=0.3, batch=64
Epoch 1/2
146/146 - 17s - 114ms/step - accuracy: 0.6267 - loss: 0.8630
Epoch 2/2
146/146 - 13s - 86ms/step - accuracy: 0.6892 - loss: 0.7147
Validation Accuracy: 0.6765

Training with units=196, dropout=0.2, batch=32
Epoch 1/2
291/291 - 29s - 101ms/step - accuracy: 0.6440 - loss: 0.8329
Epoch 2/2
291

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# TF-IDF + logistic-regression baseline, tuned with a 3-fold grid search.
# NOTE: this cell rebinds X, y, X_train and X_test, which the LSTM cells above
# also define; re-run the first cell before re-running the LSTM code.

# Read the raw CSV
df = pd.read_csv("/content/Data (1).csv")

# Keep rows marked relevant whose sentiment confidence is above 0.6
is_relevant = df['relevant_yn'] == 'yes'
is_confident = df['sentiment_confidence'] > 0.6
df_filtered = df[is_relevant & is_confident]
X, y = df_filtered['text'], df_filtered['sentiment']

# Hold out 20% of the filtered rows (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Text -> TF-IDF features (English stop words removed) -> logistic regression
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000)),
])

# 2 n-gram ranges x 3 regularisation strengths = 6 candidates
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__C=[0.1, 1, 10],
)

# Search the grid with 3-fold cross-validation on the training split
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Per-class metrics on the held-out split
y_pred = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Persist the winning pipeline (vectorizer and classifier together)
joblib.dump(grid_search.best_estimator_, "best_sentiment_model.pkl")

# Sanity check on a single hand-written sentence
example = ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing."]
prediction = grid_search.predict(example)
print("Prediction for example text:", prediction[0])

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Classification Report:
              precision    recall  f1-score   support

    Negative       0.80      0.86      0.83      1577
     Neutral       0.53      0.45      0.49       521
    Positive       0.63      0.60      0.61       344

    accuracy                           0.73      2442
   macro avg       0.66      0.63      0.64      2442
weighted avg       0.72      0.73      0.73      2442

Prediction for example text: Positive
