In [25]:
import pandas as pd
import re
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [26]:
# Load the raw sentiment dataset; the path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('sentiment_analysis.csv')
# Last expression of the cell: renders a preview of the first five rows.
df.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


In [27]:
# Trim leading/trailing whitespace from every Platform value (overwrites the column in place).
# NOTE(review): presumably the raw file has padded labels such as " Twitter " — confirm against the CSV.
df["Platform"] = df["Platform"].str.strip()

In [28]:
# Persist the frame without the index column. At this point the only cleaning
# applied is the Platform whitespace strip; the 'clean_text' column is added in
# a later cell and is therefore not part of this file.
df.to_csv("cleaned_sentiment_dataset.csv", index=False)

# Confirmation message for the reader of the notebook.
print("✅ Cleaned dataset saved as 'cleaned_sentiment_dataset.csv'")

✅ Cleaned dataset saved as 'cleaned_sentiment_dataset.csv'


In [29]:
# Fetch the NLTK data packages needed by the preprocessing step below.
# The return value of the last call is what the cell displays.
nltk.download('stopwords') # English stop-word list read through nltk.corpus.stopwords
nltk.download('wordnet')   # WordNet data backing WordNetLemmatizer
nltk.download('omw-1.4')   # Open Multilingual Wordnet; presumably required by the WordNet reader in this NLTK version — confirm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aqeel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aqeel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aqeel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
# English stop words materialised once as a set, so membership tests are
# constant-time instead of a scan over the underlying list.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance; preprocess_text calls lemmatizer.lemmatize on each kept token.
lemmatizer = WordNetLemmatizer()

In [31]:
def preprocess_text(text):
    """Normalize one raw text value into a space-joined string of lemmas.

    Steps, in order: lowercase, remove punctuation, remove digit runs, split on
    whitespace, drop English stop words, lemmatize each remaining token.

    Args:
        text: Raw text. Any non-string value (for example the NaN pandas
            produces for an empty CSV field, or None) is treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; '' when nothing remains.
    """
    # Missing values arrive as float NaN / None when this is applied to a
    # Series; calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so "don't" becomes "dont".
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Filter against the module-level `stop_words` set. Calling
    # stopwords.words('english') here would rebuild the whole list for every
    # token and then scan it linearly.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Run the text normaliser over every row and keep the result next to the raw text.
df['clean_text'] = df['text'].map(preprocess_text)
# Side-by-side preview of raw text, cleaned text and label (cell display value).
df.loc[:, ['text', 'clean_text', 'sentiment']].head()

Unnamed: 0,text,clean_text,sentiment
0,What a great day!!! Looks like dream.,great day look like dream,positive
1,"I feel sorry, I miss you here in the sea beach",feel sorry miss sea beach,positive
2,Don't angry me,dont angry,negative
3,We attend in the class just for listening teac...,attend class listening teacher reading slide n...,negative
4,"Those who want to go, let them go",want go let go,negative


In [32]:
# Features are the preprocessed texts; targets are the sentiment labels.
X, y = df['clean_text'], df['sentiment']

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The TF-IDF vocabulary is learned from the training fold only; the test fold
# is projected onto that same vocabulary.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(f"Training shape: {X_train_vec.shape}")
print(f"Test shape: {X_test_vec.shape}")

Training shape: (399, 1093)
Test shape: (100, 1093)


In [33]:
# Logistic Regression: fit on the TF-IDF training matrix, predict the hold-out fold.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_vec, y_train)
lr_pred = lr.predict(X_test_vec)

# Heading and per-class report on separate lines, then overall accuracy.
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, lr_pred),
    sep="\n",
)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.44      0.57        27
     neutral       0.65      0.93      0.76        40
    positive       0.82      0.70      0.75        33

    accuracy                           0.72       100
   macro avg       0.76      0.69      0.70       100
weighted avg       0.75      0.72      0.71       100

Logistic Regression Accuracy: 0.72


In [34]:
# Multinomial Naive Bayes: fit on the TF-IDF training matrix, predict the hold-out fold.
nb = MultinomialNB().fit(X_train_vec, y_train)
nb_pred = nb.predict(X_test_vec)

# Heading and per-class report on separate lines, then overall accuracy.
print(
    "Naive Bayes Classification Report:",
    classification_report(y_test, nb_pred),
    sep="\n",
)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.37      0.53        27
     neutral       0.63      0.90      0.74        40
    positive       0.75      0.73      0.74        33

    accuracy                           0.70       100
   macro avg       0.76      0.67      0.67       100
weighted avg       0.75      0.70      0.68       100

Naive Bayes Accuracy: 0.7


In [35]:
# Linear Support Vector Machine with class weights balanced by label frequency.
svm = LinearSVC(random_state=42, class_weight='balanced').fit(X_train_vec, y_train)
svm_pred = svm.predict(X_test_vec)

# Heading and per-class report on separate lines, then overall accuracy.
print(
    "SVM Classification Report:",
    classification_report(y_test, svm_pred),
    sep="\n",
)
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))

SVM Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.52      0.61        27
     neutral       0.77      0.93      0.84        40
    positive       0.82      0.82      0.82        33

    accuracy                           0.78       100
   macro avg       0.78      0.75      0.76       100
weighted avg       0.78      0.78      0.77       100

SVM Accuracy: 0.78


In [36]:
# Persist the fitted TF-IDF vectorizer and the SVM classifier as two separate
# files; both are needed together at inference time, because the model expects
# features produced by this exact vectorizer.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
# Last expression of the cell: joblib.dump returns the list of written filenames, which is displayed.
joblib.dump(svm, 'svm_model.pkl')

['svm_model.pkl']