In [1]:
import pandas as pd
import re
import nltk

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Load datasets
df = pd.read_csv('master_twitter_sentiment.csv')
df.head()

#before cleaning
df["sentiment"].value_counts()


Negative      22808
Positive      21109
Neutral       18603
Irrelevant    13162
Name: sentiment, dtype: int64

In [2]:
# Cleaning

df = df[df['sentiment'] != 'Irrelevant']
df = df.dropna()
df.isnull().sum()


def datacleaning(text):
    cleaned_text = re.sub(r"[^\w\s]|[\d]"," ",text)
    cleaned_text = re.sub(r"[_]"," ",cleaned_text)
    return cleaned_text

df["text"] = df["text"].apply(datacleaning)

#after cleaning
df["sentiment"].value_counts()

Negative    22624
Positive    20932
Neutral     18393
Name: sentiment, dtype: int64

In [3]:
df["text"] = df["text"].str.lower()
wordnet = WordNetLemmatizer()

In [6]:
def dataprocessing(text):
    
    words = re.sub('[^ a-zA-Z]',' ', text)
    words = nltk.word_tokenize(text)
    words = [wordnet.lemmatize(word) for word in words if word not in set(stopwords.words('english')) ]
    return " ".join(words)

df["text"] = df["text"].apply(dataprocessing)

In [5]:
# Split the dataset into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)


# Convert text data to numerical features using TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_features = tfidf_vectorizer.fit_transform(train_data)
test_features = tfidf_vectorizer.transform(test_data)


# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(train_features, train_labels)

# Make predictions on the test set
test_predictions = classifier.predict(test_features)

# Evaluate the model on the test set
test_accuracy = accuracy_score(test_labels, test_predictions)
test_report = classification_report(test_labels, test_predictions)

print('Test Set Results:')
print(f'Accuracy: {test_accuracy}')
print('Classification Report:\n', test_report)

Test Set Results:
Accuracy: 0.7258272800645682
Classification Report:
               precision    recall  f1-score   support

    Negative       0.71      0.82      0.76      4518
     Neutral       0.75      0.57      0.65      3635
    Positive       0.73      0.76      0.74      4237

    accuracy                           0.73     12390
   macro avg       0.73      0.72      0.72     12390
weighted avg       0.73      0.73      0.72     12390

