In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
# Fetch the NLTK stop-word corpus (quiet=True keeps the downloader from printing)
# and hold the English list as a set, so the per-word membership test used in
# clean_text is a hash lookup rather than a list scan.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [3]:
# Location of the sentiment dataset CSV. The original machine-specific path is
# kept as the default so existing runs are unaffected, but it can be overridden
# without editing the notebook by setting the SENTIMENT_CSV environment
# variable before the kernel starts.
DATA_PATH = os.environ.get("SENTIMENT_CSV", "E:\\nlp_project\\sentiment_analysis.csv")

# The file is decoded as ISO-8859-1 (Latin-1) rather than the UTF-8 default.
df = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")

In [4]:
# Quick sanity check of the loaded frame: displays (rows, columns).
df.shape

(800000, 6)

In [5]:
def clean_text(text, stopword_set=None):
    """Normalise one raw text entry for bag-of-words style vectorisation.

    Steps, in order: lowercase, strip URLs, strip ``@user`` references, strip
    every character that is neither a word character nor whitespace, then drop
    stop words and collapse whitespace to single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, float ``NaN`` from a missing
        CSV cell, ...) is treated as empty text instead of raising.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives or the input was not a
        string.
    """
    # Missing cells arrive as float NaN / None; .lower() would raise on them.
    if not isinstance(text, str):
        return ''

    if stopword_set is None:
        stopword_set = stop_words

    # Convert to lowercase
    text = text.lower()

    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove user @ references
    text = re.sub(r'@\w+', '', text)

    # Remove punctuation (underscores survive because \w includes '_')
    text = re.sub(r'[^\w\s]', '', text)

    # Remove stopwords.
    # NOTE(review): punctuation is stripped before this step, so any stop-word
    # entry containing an apostrophe can never match here -- confirm that this
    # is intended.
    return ' '.join(word for word in text.split() if word not in stopword_set)


In [6]:
# Binarise the label column: 4 (presumably the positive class) becomes 1 and
# every other value becomes 0.
# The mapping is deliberately idempotent: a label that is already 1 stays 1, so
# re-running this cell no longer collapses every row to 0 (which is what a plain
# `1 if x == 4 else 0` does on its second execution).
# NOTE(review): assumes the raw file never uses 1 as a non-positive label -- confirm.
df['target'] = df['target'].isin([4, 1]).astype('int64')

In [7]:
# Run clean_text over every entry of the 'text' column. The result is stored in
# a separate 'cleaned_text' column, so the raw text stays available and the cell
# can be re-run safely.
df['cleaned_text'] = df['text'].apply(clean_text)

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [9]:
# Report how many rows landed in each split and that they add up to the full set.
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Total dataset size: {len(X_train) + len(X_test)}")


Training samples: 640000
Test samples: 160000
Total dataset size: 800000


In [10]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), capped at
# 10,000 features.
# The vectorizer is fitted on the training split only; the test split is just
# transformed with that fitted vectorizer, so test data never influences it.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [11]:
# Show (rows, features) for both matrices; the feature dimension should match.
print(f"Shape of X_train_tfidf: {X_train_tfidf.shape}")
print(f"Shape of X_test_tfidf: {X_test_tfidf.shape}")


Shape of X_train_tfidf: (640000, 10000)
Shape of X_test_tfidf: (160000, 10000)


In [9]:
# Candidate classifiers, keyed by the display name used in the training output.
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0),
    # Seeded like the forest below so repeated runs of the notebook are comparable.
    'SVM': LinearSVC(C=1.0, max_iter=1000, random_state=42),
    # n_jobs=-1 builds the trees on all available cores; the fixed random_state
    # keeps the resulting forest reproducible.
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
}

In [10]:
# Fit each candidate on the training features, score it on the held-out test
# features, and keep the fitted estimator together with its accuracy.
results = {}

for name, model in models.items():
    print(f"\nTraining {name} Model:")

    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

    # Overall accuracy on the test split
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Keyed by the model's display name
    results[name] = dict(model=model, accuracy=accuracy)

    # Per-class precision / recall / F1 on the test split
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))


Training Naive Bayes Model:
Naive Bayes Accuracy: 0.7677
Classification Report for Naive Bayes:
              precision    recall  f1-score   support

           0       0.77      0.76      0.77     79960
           1       0.76      0.78      0.77     80040

    accuracy                           0.77    160000
   macro avg       0.77      0.77      0.77    160000
weighted avg       0.77      0.77      0.77    160000


Training Logistic Regression Model:
Logistic Regression Accuracy: 0.7816
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.79      0.76      0.78     79960
           1       0.77      0.80      0.79     80040

    accuracy                           0.78    160000
   macro avg       0.78      0.78      0.78    160000
weighted avg       0.78      0.78      0.78    160000


Training SVM Model:
SVM Accuracy: 0.7810
Classification Report for SVM:
              precision    recall  f1-score   support

