In [3]:
from google.colab import files
uploaded = files.upload()
uploaded = files.upload()



Saving raw_sentiment_dataset.csv to raw_sentiment_dataset.csv


In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv("raw_sentiment_dataset.csv")

# Define stopwords
stop_words = set(stopwords.words('english'))

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    # Keep emotive punctuation (!, ?) and contractions
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)  # Remove URLs/hashtags
    text = re.sub(r'[^\w\s!?\']', '', text)  # Keep apostrophes for contractions (e.g., "don't")
    text = text.lower().strip()
    return text


    # text = text.lower()  # Lowercase
    # text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # text = re.sub(r'\d+', '', text)  # Remove numbers
    # tokens = text.split()  # Tokenize
    # tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    # return " ".join(tokens)

# Apply preprocessing
df['cleaned_sentence'] = df['sentence'].apply(preprocess_text)

# Save to new CSV
df[['cleaned_sentence', 'sentiment']].to_csv("preprocessed_dataset.csv", index=False)

# Preview first few rows
df[['sentence', 'cleaned_sentence', 'sentiment']].head()

# files.download("preprocessed_dataset.csv")


Unnamed: 0,sentence,cleaned_sentence,sentiment
0,I had a wonderful day at the park.,i had a wonderful day at the park,Positive
1,She smiled brightly at the surprise.,she smiled brightly at the surprise,Positive
2,The movie was absolutely amazing.,the movie was absolutely amazing,Positive
3,I love spending time with my family.,i love spending time with my family,Positive
4,The food tasted delicious and fresh.,the food tasted delicious and fresh,Positive


In [6]:


# Load your cleaned/preprocessed CSV
df = pd.read_csv("preprocessed_dataset.csv")

# Tokenization using simple split()
def tokenize_text(text):
    return text.split()

# Apply tokenization
df['tokenized_sentence'] = df['cleaned_sentence'].apply(tokenize_text)

# Keep only sentence, tokenized sentence, and actual sentiment
final_df = df[['cleaned_sentence', 'sentiment', 'tokenized_sentence']]
final_df.rename(columns={'cleaned_sentence': 'sentence'}, inplace=True)

# Save to new CSV
final_df.to_csv("final_tokenized_dataset.csv", index=False)

# Preview first few rows
final_df.head()
# files.download("final_tokenized_dataset.csv")



Unnamed: 0,sentence,sentiment,tokenized_sentence
0,i had a wonderful day at the park,Positive,"[i, had, a, wonderful, day, at, the, park]"
1,she smiled brightly at the surprise,Positive,"[she, smiled, brightly, at, the, surprise]"
2,the movie was absolutely amazing,Positive,"[the, movie, was, absolutely, amazing]"
3,i love spending time with my family,Positive,"[i, love, spending, time, with, my, family]"
4,the food tasted delicious and fresh,Positive,"[the, food, tasted, delicious, and, fresh]"


In [7]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip
!mkdir -p glove_data
!wget -P glove_data http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove_data/glove.6B.zip -d glove_data

--2025-05-05 10:29:50--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-05 10:29:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-05 10:29:51--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove_data/glove.6B

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# 1. Load data
df = pd.read_csv("final_tokenized_dataset.csv")

# 2. Load GloVe vectors - CHANGED PATH
embeddings = {}
with open('glove_data/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        word = parts[0]
        vector = np.array(parts[1:], dtype='float32')
        embeddings[word] = vector

# 3. Convert tokenized sentences to text
df['joined_sentence'] = df['tokenized_sentence'].apply(lambda x: " ".join(eval(x)))

# 4. Convert sentences to GloVe vectors
def sentence_to_vector(sentence):
    words = sentence.split()
    word_vectors = [embeddings[word] for word in words if word in embeddings]
    if len(word_vectors) == 0:
        return np.zeros(100)
    return np.mean(word_vectors, axis=0)

X_glove = np.array([sentence_to_vector(text) for text in df['joined_sentence']])

# 5. Encode labels
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_glove, y, test_size=0.2, random_state=42)

# 7. Train Logistic Regression
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# 8. Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# [Keep all your previous code until after y_pred is created]

# Simplified error display
print("\n❌ Incorrect Predictions:")
for i in range(len(X_test)):
    if y_test[i] != y_pred[i]:
        print(f"\nText: {df['joined_sentence'].iloc[i]}")
        print(f"Actual: {le.inverse_transform([y_test[i]])[0]}")
        print(f"Predicted: {le.inverse_transform([y_pred[i]])[0]}")

# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.preprocessing import LabelEncoder

# # Load tokenized dataset
# df = pd.read_csv("final_tokenized_dataset.csv")

# # Join tokenized words back into string
# df['joined_sentence'] = df['tokenized_sentence'].apply(lambda x: " ".join(eval(x)))

# # Encode sentiment labels
# le = LabelEncoder()
# df['label'] = le.fit_transform(df['sentiment'])

# # TF-IDF vectorization
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df['joined_sentence'])
# y = df['label']

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# # Train model
# model = LogisticRegression()
# model.fit(X_train, y_train)



# # Evaluate
# y_pred = model.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred, target_names=le.classes_))

# # Step 5: Show Wrong Predictions (BONUS!)
# print("\n❌ Incorrect Predictions:")
# for text, actual, predicted in zip(X_test, y_test, y_pred):
#     if actual != predicted:
#         print(f"\n📝 Text: {text}")
#         print(f"✅ Actual: {actual} | ❌ Predicted: {predicted}")


# print(df['sentiment'].value_counts())




Accuracy: 0.79
              precision    recall  f1-score   support

    Negative       0.86      0.60      0.71        10
     Neutral       1.00      1.00      1.00         6
    Positive       0.64      0.88      0.74         8

    accuracy                           0.79        24
   macro avg       0.83      0.83      0.81        24
weighted avg       0.82      0.79      0.79        24


❌ Incorrect Predictions:

Text: i had a wonderful day at the park
Actual: Negative
Predicted: Positive

Text: she smiled brightly at the surprise
Actual: Negative
Predicted: Positive

Text: the food tasted delicious and fresh
Actual: Positive
Predicted: Negative

Text: he did a fantastic job on the project
Actual: Negative
Predicted: Positive

Text: he made my day with a kind word
Actual: Negative
Predicted: Positive


In [9]:
# Get the training sentences and labels
import numpy as np

# Get the indices used in the train-test split
indices = np.arange(len(df))
_, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
train_indices = np.setdiff1d(indices, test_indices)

# Create training DataFrame
train_df = pd.DataFrame({
    'sentence': df.loc[train_indices, 'joined_sentence'].values,
    'sentiment': le.inverse_transform(y_train)
})

# Save to CSV
train_df.to_csv("training_data.csv", index=False)

# Download from Colab (optional)
# files.download("training_data.csv")

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("final_tokenized_dataset.csv")

# Join tokenized words back to a single string
df['joined_sentence'] = df['tokenized_sentence'].apply(lambda x: " ".join(eval(x)))

# Store original sentiments
df['actual_sentiment'] = df['sentiment']

# Label encode the original sentiment column
le = LabelEncoder()
df['label'] = le.fit_transform(df['actual_sentiment'])

# Convert sentences to GloVe vectors (using your existing function)
def sentence_to_vector(sentence):
    words = sentence.split()
    word_vectors = [embeddings[word] for word in words if word in embeddings]
    if len(word_vectors) == 0:
        return np.zeros(100)
    return np.mean(word_vectors, axis=0)

X_glove = np.array([sentence_to_vector(text) for text in df['joined_sentence']])

# Split data before training
X_train, X_test, y_train, y_test = train_test_split(X_glove, df['label'], test_size=0.2, random_state=42)

# Train model on training set only
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Predict on test set only
predicted_labels = model.predict(X_test)
predicted_sentiments = le.inverse_transform(predicted_labels)
actual_sentiments = le.inverse_transform(y_test)

# Get corresponding sentences from original dataframe
test_indices = y_test.index  # Get the original indices
test_sentences = df.loc[test_indices, 'joined_sentence'].values

# Function to remove stopwords
stop_words = set(stopwords.words('english'))
def clean_tokens(sentence):
    return [word for word in sentence.split() if word.lower() not in stop_words]

# Build new DataFrame for confidence report
confidence_df = pd.DataFrame({
    'sentence': test_sentences,
    'actual_sentiment': actual_sentiments,
    'predicted_sentiment': predicted_sentiments,
    'token': [clean_tokens(sent) for sent in test_sentences]  # Stopwords removed here
})

# Function to get confidence for each word (modified for GloVe)
def get_confidence_per_word(sentence):
    words = sentence.split()
    all_probs = []
    for word in words:
        if word in embeddings:
            # Get the word's GloVe vector
            word_vector = embeddings[word].reshape(1, -1)
            probs = model.predict_proba(word_vector)[0]
            probs = (probs * 100).round(2)
        else:
            # Word not in GloVe vocabulary
            probs = [0.0, 0.0, 0.0]
        all_probs.append(f"{{{probs[2]}, {probs[0]}, {probs[1]}}}")  # positive, negative, neutral
    return "\n".join(all_probs)

# Apply confidence function
# confidence_df['token'] = confidence_df['sentence'].apply(lambda x: x.split())
confidence_df['confidence(+ev, -ev, neutral)'] = confidence_df['sentence'].apply(get_confidence_per_word)
# Add this RIGHT BEFORE your CSV save
def get_avg_confidence(confidence_str):
    try:
        probs = []
        for p in confidence_str.split('\n'):
            if p:  # Skip empty lines
                p = p.strip('{}')
                pos, neg, neu = map(float, p.split(','))
                probs.append([pos, neg, neu])
        avg = np.mean(probs, axis=0) if probs else [0, 0, 0]
        return f"({avg[0]:.1f}, {avg[1]:.1f}, {avg[2]:.1f})"
    except:
        return "(0.0, 0.0, 0.0)"  # Fallback for errors

confidence_df['avg_confidence'] = confidence_df['confidence(+ev, -ev, neutral)'].apply(get_avg_confidence)


# Save to CSV
confidence_df.to_csv("sentence_confidence_report.csv", index=False)

In [11]:
# Get misclassified examples
misclassified = df.loc[y_test.index[y_test != y_pred]][['joined_sentence', 'sentiment']].copy()
misclassified['predicted_sentiment'] = le.inverse_transform(y_pred[y_test != y_pred])

# Add cleanly formatted confidence percentages
misclassified['class_confidence'] = [
    tuple(round(x*100, 1) for x in model.predict_proba(X_test[i].reshape(1, -1))[0])
    for i in np.where(y_test != y_pred)[0]
]

# Convert tuples to cleaner string representation if needed
misclassified['class_confidence'] = misclassified['class_confidence'].apply(
    lambda x: f"({x[0]:.1f}, {x[1]:.1f}, {x[2]:.1f})"
)

# Save to CSV
misclassified.to_csv("misclassified_examples.csv", index=False)
print("Saved misclassified_examples.csv")

Saved misclassified_examples.csv


In [12]:
# Get correct predictions
correct_indices = np.where(y_test == y_pred)[0]

# Create DataFrame with clean confidence formatting
correct_df = pd.DataFrame({
    'raw_sentence': [df['sentence'].values[i] for i in test_indices[correct_indices]],
    'actual_sentiment': actual_sentiments[correct_indices],
    'predicted_sentiment': predicted_sentiments[correct_indices],
    'confidence': [np.round(model.predict_proba(X_test[i].reshape(1, -1)).max(), 3)
                 for i in correct_indices],
    # Clean formatted confidence percentages (Positive, Negative, Neutral)
    'class_confidence': [
        f"({prob[2]*100:.1f}, {prob[0]*100:.1f}, {prob[1]*100:.1f})"
        for prob in model.predict_proba(X_test[correct_indices])
    ]
})

# Save to CSV
correct_df.to_csv("correct_predictions.csv", index=False)

In [13]:
from joblib import dump

# Save the model and label encoder (2 lines of code)
dump(model, 'sentiment_model.joblib')          # Saves trained model
dump(le, 'label_encoder.joblib')               # Saves label encoder

print("✅ Model saved as 'sentiment_model.joblib'")
print("✅ Label encoder saved as 'label_encoder.joblib'")

✅ Model saved as 'sentiment_model.joblib'
✅ Label encoder saved as 'label_encoder.joblib'


In [14]:

# files.download("preprocessed_dataset.csv")
# files.download("final_tokenized_dataset.csv")
# files.download("training_data.csv")
# files.download("sentence_confidence_report.csv")
# files.download("misclassified_examples.csv")
# files.download("correct_predictions.csv")
# files.download("sentiment_model.joblib")
# files.download("label_encoder.joblib")
files.download('sentimentanalysis_model.ipynb')

FileNotFoundError: Cannot find file: sentimentanalysis_model.ipynb

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import cross_val_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import LabelEncoder

# # Reload your data (if needed)
# df = pd.read_csv("final_tokenized_dataset.csv")
# df['joined_sentence'] = df['tokenized_sentence'].apply(lambda x: " ".join(eval(x)))

# # Encode sentiment labels
# le = LabelEncoder()
# df['label'] = le.fit_transform(df['sentiment'])

# # TF-IDF with n-grams (1,2)
# vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# X = vectorizer.fit_transform(df['joined_sentence'])
# y = df['label']

# # Logistic Regression Model
# model = LogisticRegression(max_iter=1000)

# # Step 3: 5-Fold Cross-validation
# scores = cross_val_score(model, X, y, cv=5)
# print("Cross-validation Scores:", scores)
# print("Average Accuracy:", scores.mean())


Cross-validation Scores: [0.375      0.5        0.45833333 0.45833333 0.83333333]
Average Accuracy: 0.525
