In [None]:
# Import necessary libraries
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

# Fetch the NLTK stopword corpus (if not already downloaded), then load the English list
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
# Stored as a set because the cleaning step tests every token for membership
stop_words = set(english_stopwords)

In [None]:
# Step 1: Load the Dataset
# Point DATASET_PATH at wherever the downloaded CSV actually lives.
DATASET_PATH = 'sentiment140.csv'
COLUMN_NAMES = ['target', 'ids', 'date', 'flag', 'user', 'text']

# header=None: the first line of the file is read as data, so the column
# labels are assigned explicitly afterwards.
df = pd.read_csv(DATASET_PATH, header=None, encoding='latin-1')
df.columns = COLUMN_NAMES

# Display the first few rows of the dataset
print(df.head())

In [None]:
# Step 2: Clean the Text Data
# Compiled once up front because clean_text is applied to every row.
# URLs must start at a word boundary so elongated words such as "awwww"
# are not mistaken for a "www..." link.
_URL_RE = re.compile(r'\bhttps?://\S+|\bwww\.\S+')
_MENTION_OR_HASHTAG_RE = re.compile(r'[@#]\w+')
_NON_WORD_RE = re.compile(r'\W')


def clean_text(text, stop_word_set=None):
    """Normalise one tweet into a space-separated string of lowercase tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and #hashtags,
    replace every remaining non-word character with a space, then drop
    stopwords.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV field) produce ``''``; any other non-string value is
            converted with ``str()`` first.
        stop_word_set: Optional container of lowercase words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text; ``''`` when nothing survives the filtering.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if stop_word_set is None:
        stop_word_set = stop_words

    # Lowercase first so the case-sensitive patterns also catch "HTTP://..." / "WWW...".
    text = text.lower()
    # Substituting a space (not '') guarantees neighbouring tokens never fuse.
    text = _URL_RE.sub(' ', text)
    text = _MENTION_OR_HASHTAG_RE.sub(' ', text)
    text = _NON_WORD_RE.sub(' ', text)
    # split() already collapses runs of spaces, so no separate whitespace pass is needed.
    return ' '.join(word for word in text.split() if word not in stop_word_set)

# Run the cleaner over every tweet and keep the result alongside the raw text
df['cleaned_text'] = df['text'].map(clean_text)

In [None]:
# Step 3: Tokenize and Vectorize the Text
# Use TF-IDF for vectorization, capped at the 5000 highest-frequency terms
tfidf = TfidfVectorizer(max_features=5000)
# fit_transform returns a SciPy sparse matrix. It is deliberately NOT densified
# with .toarray(): a dense float64 array of n_rows x 5000 needs ~40 KB per row,
# which runs to tens of gigabytes on a corpus with over a million rows
# (TODO confirm the row count of your file). train_test_split, MultinomialNB
# and the SVM estimators all accept sparse input directly.
X = tfidf.fit_transform(df['cleaned_text'])
y = df['target'].replace(4, 1)  # Convert target to binary (0 = negative, 1 = positive)

In [None]:
# Step 4: Split the Data
# Hold out a fixed fraction for testing; the seed makes the shuffle repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Step 5: Train Classification Models
# Naive Bayes: fit() returns the estimator itself, so construction and
# training can be chained into a single expression.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Support Vector Machine (SVM)
# LinearSVC replaces SVC(kernel='linear'): both learn a linear decision
# boundary, but SVC is backed by libsvm, whose fit time grows at least
# quadratically with the number of samples, making it impractical on large
# training sets. LinearSVC (liblinear) scales far better and accepts sparse
# input.
# NOTE: LinearSVC defaults to squared hinge loss, so scores can differ
# slightly from what SVC(kernel='linear') would give.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [None]:
# Step 6: Evaluate Model Performance
def _print_evaluation(heading, y_true, y_pred):
    """Print the heading, accuracy, F1 score, confusion matrix and classification report.

    Args:
        heading: Line printed first to identify the model.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model for the same samples.
    """
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"F1 Score: {f1_score(y_true, y_pred)}")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))


# Naive Bayes Evaluation
_print_evaluation("Naive Bayes:", y_test, y_pred_nb)

# SVM Evaluation
_print_evaluation("SVM:", y_test, y_pred_svm)