In [None]:
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

In [None]:
# Make sure the NLTK stop-word corpus is available before any text cleaning runs.
# NLTK looks resources up relative to its data directories, and the downloader
# installs this corpus under "corpora/", so the lookup must include that prefix.
# With a bare "stopwords" the lookup misses in a standard install and a download
# is attempted on every run.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

In [None]:
from pandas import DataFrame


def load_and_explore_data(file_path) -> DataFrame | None:
    """
    Loads the tweet analysis data from a CSV file, performs initial exploration,
    and returns a Pandas DataFrame.
    """
    try:
        df = pd.read_csv(file_path, usecols=["Review Text", "Review Color", "Rating"])
        df.columns = ["review", "color", "rating"]
        df = df.dropna()
        df["color"] = df["color"].map({"Black": 0, "White": 1})
        df.info()
        print("\nValue Counts for 'rating':")
        print(df["rating"].value_counts())
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
def clean_text(text, stop_words=None):
    """
    Strip punctuation from a piece of text and drop stop words.

    Punctuation characters (``string.punctuation``) are removed first, the
    remaining text is split on whitespace, and every word whose lowercase form
    is a stop word is discarded. The surviving words keep their original case.

    Args:
        text: The string to clean.
        stop_words: Optional collection of lowercase words to discard. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        A list of the remaining words, in their original order.
    """
    if stop_words is None:
        # Load the list once per call and keep it in a set, instead of
        # re-reading it for every single membership test.
        stop_words = set(stopwords.words("english"))

    # Re-join the surviving characters so the stop-word filter below sees whole
    # words; filtering the character list directly would compare single letters.
    without_punctuation = "".join(
        char for char in text if char not in string.punctuation
    )
    return [
        word for word in without_punctuation.split() if word.lower() not in stop_words
    ]

In [None]:
def create_features(df):
    """
    Add a "length" column holding the character count of each review.

    Values in the "review" column are converted to strings before being
    measured. The column is written onto ``df`` itself, and that same
    DataFrame is returned.
    """
    review_as_text = df["review"].astype(str)
    df["length"] = review_as_text.apply(len)
    return df

In [None]:
def create_wordcloud(df, column) -> None:
    """
    Draw a word cloud built from every value in one dataframe column.

    All values of ``column`` are converted to strings and joined with single
    spaces into one text, which is handed to ``WordCloud`` and shown on a new
    10x10 matplotlib figure.
    """
    values_as_text = df[column].astype(str).tolist()
    combined_text = " ".join(values_as_text)
    # Open the figure before generating the cloud, then draw the cloud onto it.
    plt.figure(figsize=(10, 10))
    cloud = WordCloud().generate(combined_text)
    plt.imshow(cloud)

In [None]:
def extract_text_features(df):
    """
    Turn the "review" column into a matrix of token counts.

    A ``CountVectorizer`` that uses ``clean_text`` as its analyzer is fitted on
    the reviews (converted to strings), and the resulting document-term matrix
    is returned. The fitted vectorizer itself is not returned.
    """
    bag_of_words = CountVectorizer(analyzer=clean_text)  # tokens come from clean_text
    reviews = df["review"].astype(str)
    return bag_of_words.fit_transform(reviews)

In [None]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    """
    Train a Multinomial Naive Bayes classifier and report how it performs.

    The classifier is fitted on the training split and used to predict the
    test split. The outcome is shown as a confusion-matrix heatmap on a new
    figure and as a printed classification report. Nothing is returned.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for prediction.
        y_train: Target labels matching ``X_train``.
        y_test: Target labels matching ``X_test``.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Spell the label order out so the heatmap ticks can show the real class
    # values instead of positional indices 0..k-1.
    labels = sorted(set(y_test) | set(y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Draw on a dedicated axes so the heatmap never lands on a leftover figure.
    _, ax = plt.subplots()
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",  # integer counts; the default format abbreviates large numbers
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion matrix")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
DATA_PATH = "path"  # placeholder: set this to the location of the reviews CSV
dataframe = load_and_explore_data(DATA_PATH)
# load_and_explore_data signals failure by returning None. Stop here with a
# clear message instead of letting the cells below fail on a None dataframe.
if dataframe is None:
    raise RuntimeError(f"Could not load review data from {DATA_PATH!r}")

In [None]:
# Render a word cloud from the text of the "review" column.
create_wordcloud(dataframe, "review")

In [None]:
# Add the "length" column (character count of each review) to the dataframe.
dataframe = create_features(dataframe)

In [None]:
# NOTE(review): this repeats the earlier word-cloud call. create_features only
# adds the "length" column, so the "review" text fed to the cloud is unchanged.
# Confirm whether this second plot is intended.
create_wordcloud(dataframe, "review")

In [None]:
# Vectorise the reviews into a matrix of token counts (tokens come from clean_text).
X = extract_text_features(dataframe)

In [None]:
# Hold out 20% of the rows for testing, with "rating" as the target.
# Stratifying on "rating" keeps the rating proportions similar in both splits,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, dataframe["rating"], test_size=0.2, random_state=42, stratify=dataframe["rating"]
)

In [None]:
# Fit the Naive Bayes classifier, then show its confusion matrix and classification report.
train_and_evaluate_model(X_train, X_test, y_train, y_test)