In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import words

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import TruncatedSVD
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import VotingClassifier

In [4]:
# Ensure necessary nltk resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [5]:
items_25 = pd.read_csv("/Users/sambickel-barlow/Desktop/PP422/Final Project/items_2025.csv")

In [6]:
pts = items_25['product_title'].tolist()
pts_df = pd.DataFrame(pts, columns=['title'])

In [7]:
stop_words = stopwords.words('english')
english_words = set(words.words())

In [8]:
# Define gendered words
male_words = {'man','mans','men', 'mens', 'male', 'males', 'masculine', 'boy', 'boys', 'him', 'his', 'he', 'hes'}
female_words = {'woman','womans', 'women', 'womens' 'female','females', 'feminine', 'girl', 'girls', 'her', 'hers', 'she', 'shes'}

In [9]:
english_words.update(male_words)
english_words.update(female_words)

In [10]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [word for word in tokens if word in english_words]  # Remove non-english words
    return " ".join(tokens)

In [11]:
# Apply preprocessing
pts_df['processed'] = pts_df['title'].apply(preprocess_text)

In [12]:
# Function to check if any word in `keywords` exists in the text
def contains_keyword(text, keywords):
    words = set(text.split())  # Convert text into a set of words
    return int(bool(words & keywords))  # Check for intersection

# Function to remove keywords from text
def remove_keywords(text, keywords):
    words = text.split()  # Convert text to list of words
    filtered_words = [word for word in words if word not in keywords]  # Remove keywords
    return " ".join(filtered_words)  # Join words back into a string


In [13]:
# Apply functions to create new columns
pts_df['male_row'] = pts_df['processed'].apply(lambda x: contains_keyword(x, male_words))
pts_df['female_row'] = pts_df['processed'].apply(lambda x: contains_keyword(x, female_words))

In [14]:
# Create categorical variable
pts_df['gend_cat'] = 0
for i in range(len(pts_df)):
    if (pts_df.loc[i,'female_row'] == 0) & (pts_df.loc[i,'male_row'] == 0):
        pts_df.loc[i,'gend_cat'] = 0
    if (pts_df.loc[i,'female_row'] == 1) & (pts_df.loc[i,'male_row'] == 0):
        pts_df.loc[i,'gend_cat'] = 1
    if (pts_df.loc[i,'female_row'] == 0) & (pts_df.loc[i,'male_row'] == 1):
        pts_df.loc[i,'gend_cat'] = 2
    if (pts_df.loc[i,'female_row'] == 1) & (pts_df.loc[i,'male_row'] == 1):
        pts_df.loc[i,'gend_cat'] = 0

In [15]:
# Remove male/female words from processed data
pts_df['processed_filt'] = pts_df['processed'].apply(lambda x: remove_keywords(remove_keywords(x, male_words), female_words))

In [16]:
# Convert text to a document-term matrix using TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
X = tfidf_vectorizer.fit_transform(pts_df['processed_filt'])

X_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [17]:
# Set output
y = pts_df['gend_cat']

In [18]:
# Step 2: Apply TruncatedSVD to reduce dimensions (let's keep 100 components for now)
svd = TruncatedSVD(n_components=100, random_state=42)
X_svd = svd.fit_transform(X)

In [19]:
# Check variance explained by each component (useful for tuning n_components)
explained_variance = svd.explained_variance_ratio_
#sorted(explained_variance)

In [20]:
# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=422)

In [21]:
# ChatGPT said this would be a good idea. I need to look into what it is more
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [22]:

# Naïve Bayes hyperparameter tuning
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
nb_params = {'alpha': alphas, 'fit_prior' : [True, False], 'class_prior' : [None, [.90,.05,.05], [.8,.1,.1],[.6,.2,.2]]}

# Logistic Regression hyperparameter tuning
lr_params = {
    'C': [0.1],# 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],# 'saga']
}

# XGBoost hyperparameter tuning. This one takes forever with different params
xgb_params = {
    'learning_rate': [0.2],
    'n_estimators': [100],
    'max_depth': [6]
}

In [23]:
# Set up GridSearchCV for each model
nb_model = MultinomialNB()
lr_model = LogisticRegression(max_iter=10000)
xgb_model = GradientBoostingClassifier()

In [25]:
# GridSearchCV for Naïve Bayes
nb_grid_search = GridSearchCV(nb_model, nb_params, cv=5, n_jobs=-1)
nb_grid_search.fit(X_train_smote, y_train_smote)

In [26]:
# GridSearchCV for Logistic Regression
lr_grid_search = GridSearchCV(lr_model, lr_params, cv=5, n_jobs=-1)
lr_grid_search.fit(X_train_smote, y_train_smote)

In [27]:
# GridSearchCV for XGBoost this one takes awhile
xgb_grid_search = GridSearchCV(xgb_model, xgb_params, cv=5, n_jobs=-1)
xgb_grid_search.fit(X_train_smote, y_train_smote)

KeyboardInterrupt: 

In [270]:
# Print best parameters for each model
print("Best parameters for Naïve Bayes:", nb_grid_search.best_params_)
print("Best parameters for Logistic Regression:", lr_grid_search.best_params_)
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

# Step 3: Evaluate models
nb_best = nb_grid_search.best_estimator_
lr_best = lr_grid_search.best_estimator_
xgb_best = xgb_grid_search.best_estimator_

Best parameters for Naïve Bayes: {'alpha': 0.1, 'class_prior': None, 'fit_prior': False}
Best parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200}


In [272]:
# Create a Voting Classifier with soft voting
voting_clf_soft = VotingClassifier(estimators=[
    ('naive_bayes', nb_best),
    ('logistic_regression', lr_best),
    ('xgboost', xgb_best)
], voting='soft')

# Fit the Voting Classifier
voting_clf_soft.fit(X_train_smote, y_train_smote)

# Make predictions
y_pred_soft = voting_clf_soft.predict(X_test)


In [273]:
print(f"\n{name} Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_soft):.4f}")
print(classification_report(y_test, y_pred_soft))


XGBoost Model Performance:
Accuracy: 0.9730
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     23297
           1       0.12      0.76      0.21        42
           2       0.27      0.76      0.40       177

    accuracy                           0.97     23516
   macro avg       0.46      0.83      0.53     23516
weighted avg       0.99      0.97      0.98     23516



In [274]:
y_pred_soft2 = voting_clf_soft.predict(X)

In [275]:
pts_df['predictions'] = y_pred_soft2

In [2]:
pd.set_option('display.max_rows',100)
pts_df[pts_df['predictions'] == 1]

NameError: name 'pts_df' is not defined

In [226]:
pts_df[pts_df['predictions'] == 2]

Unnamed: 0,title,processed,male_row,female_row,processed_filt,predictions,gend_cat
4,Bulldog Skincare for Men Original Face Wash,bulldog men original face wash,1,0,bulldog original face wash,2,2
60,Lynx Black Body Spray,lynx black body spray,0,0,lynx black body spray,2,0
115,Venus Embrace 5 Blade Razor Blades,embrace blade razor,0,0,embrace blade razor,2,0
149,ASDA Protective Incontinence Pads NIGHT for Se...,protective incontinence night sensitive,0,0,protective incontinence night sensitive,2,0
191,Venus Swirl Flexiball Womens Razor,swirl razor,0,0,swirl razor,2,0
192,Venus Swirl Women's 5 Blade Razor Blades Refill,swirl blade razor refill,0,0,swirl blade razor refill,2,0
193,Veet Face Hair Removal Kit,face hair removal kit,0,0,face hair removal kit,2,0
197,Colgate Total Advanced Deep Clean Toothpaste,total advanced deep clean,0,0,total advanced deep clean,2,0
234,Mitchum Ultimate Men Clean Control Anti-Perspi...,ultimate men clean control deodorant,1,0,ultimate clean control deodorant,2,2
235,Bulldog Skincare For Men Age Defence Moisturiser,bulldog men age defence,1,0,bulldog age defence,2,2


In [129]:
# Step 6: Print and Plot Confusion Matrix
cm = confusion_matrix(y_test, y_pred_soft)
cm

array([[34311,   736],
       [   26,   201]])

In [90]:
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
p_grid_NB = {'alpha': alphas, 'fit_prior' : [True, False], 'class_prior' : [None, [.05,.95], [.1,.9],[.2, .8],[.3,.7]]}

NB_cls= MultinomialNB()

grid = GridSearchCV(estimator = NB_cls, param_grid = p_grid_NB, scoring = 'f1', cv = 5)
grid.fit(X_train, y_train)

In [91]:
y_pred = grid.predict(X_test)

In [70]:
# Step 4: Make predictions
y_pred = nb_model.predict(X_test)

In [92]:
# Step 5: Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9939
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     35047
           1       0.57      0.23      0.33       227

    accuracy                           0.99     35274
   macro avg       0.78      0.61      0.66     35274
weighted avg       0.99      0.99      0.99     35274



In [93]:
# Step 6: Print and Plot Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[35008,    39],
       [  175,    52]])

In [73]:
# Step 6: Check most important words for each class
feature_names = tfidf_vectorizer.get_feature_names_out()
log_prob = nb_model.feature_log_prob_

print("\nTop words per class:")
for i, class_label in enumerate(nb_model.classes_):
    top_words = [feature_names[j] for j in log_prob[i].argsort()[-10:]]  # Top 10 words
    print(f"Class {class_label}: {', '.join(top_words)}")


Top words per class:
Class 0: free, sauce, milk, home, white, cheese, original, pack, chocolate, chicken
Class 1: shower, face, sensitive, loreal, sure, gel, dog, expert, deodorant, good
