In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import words

In [9]:
# Ensure necessary nltk resources are available.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases; 'punkt' is kept so older releases keep working too.
# 'stopwords', 'wordnet' and 'words' back the stopword filter, the
# WordNetLemmatizer and the English-vocabulary filter used during preprocessing.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'words'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/sambickel-
[nltk_data]     barlow/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [5]:
# Load the 2025 items export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
items_csv_path = "/Users/sambickel-barlow/Desktop/PP422/Final Project/items_2025.csv"
items_25 = pd.read_csv(items_csv_path)

In [6]:
# Pull the product titles out as a plain Python list, then wrap them in a
# fresh single-column frame ('title') with a default integer index.
pts = items_25['product_title'].tolist()
pts_df = pd.DataFrame({'title': pts})

In [10]:
# Reference vocabulary: every entry of the NLTK 'words' corpus, as a set for
# fast membership tests.
english_words = set(words.words())
# NLTK's English stopword list (kept as the list NLTK returns).
stop_words = stopwords.words('english')

In [11]:
# Preprocessing function
def preprocess_text(text):
    """Normalise one title into a space-joined string of English lemmas.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a-z or whitespace (this removes
         punctuation, digits and non-ASCII letters, so "men's" becomes "mens");
      3. tokenize with ``word_tokenize``;
      4. lemmatize each token with ``WordNetLemmatizer`` (default settings);
      5. drop tokens found in the module-level ``stop_words`` list;
      6. drop tokens that are not in the module-level ``english_words`` set.

    Parameters
    ----------
    text : str
        Raw title. Any non-string value (for example a NaN produced by a
        missing CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; "" if none survive or
        if ``text`` is not a string.
    """
    # Missing titles arrive as float NaN and have no .lower(); treat them as
    # empty documents instead of raising.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only a-z and whitespace
    tokens = word_tokenize(text)  # Tokenize

    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once per call. The previous version re-read the
    # stopword corpus for every single token, which dominated the runtime.
    stop_set = set(stop_words)

    kept = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)  # Lemmatize
        # Stopword and vocabulary checks are applied to the lemma, matching
        # the original order (lemmatize -> stopwords -> english filter).
        if lemma not in stop_set and lemma in english_words:
            kept.append(lemma)
    return " ".join(kept)

In [12]:
# Run every raw title through preprocess_text and store the cleaned string
# alongside it in a new 'processed' column.
pts_df['processed'] = pts_df['title'].map(preprocess_text)

In [13]:
# Build the document-term matrix: one row per processed title, one column per
# vocabulary term, TF-IDF weighted.
# NOTE(review): this matrix is later fed to LatentDirichletAllocation, which is
# usually fit on raw term counts (CountVectorizer) - confirm TF-IDF is intended.
vectorizer = TfidfVectorizer()
processed_titles = pts_df['processed']
X = vectorizer.fit_transform(processed_titles)

In [14]:
# Define gendered words.
# Forms without apostrophes ('mens', 'hes', 'womens', 'shes') are listed because
# preprocessing strips every non a-z character, so "men's" becomes "mens".
# NOTE(review): pronouns such as 'he', 'him', 'his', 'she', 'her', 'hers' are
# presumably in NLTK's English stopword list and would then never reach the
# vocabulary - verify.
male_words = {
    'man', 'mans', 'men', 'mens', 'male', 'males', 'masculine',
    'boy', 'boys', 'him', 'his', 'he', 'hes',
}
# Fixed: a missing comma after 'womens' made Python concatenate the adjacent
# literals into the single bogus entry 'womensfemale', silently dropping both
# 'womens' and 'female' from the set.
female_words = {
    'woman', 'womans', 'women', 'womens', 'female', 'females', 'feminine',
    'girl', 'girls', 'her', 'hers', 'she', 'shes',
}

In [15]:
# Whitelist the gendered terms so the English-vocabulary filter in
# preprocess_text does not discard them. The set is modified in place.
# NOTE(review): preprocess_text reads english_words when it is called, and the
# 'processed' column is built in an earlier cell - titles preprocessed before
# this cell runs will not reflect these additions.
english_words |= male_words | female_words

In [16]:
# Function to evaluate topic separation
def evaluate_topics(model, feature_names, male_words, female_words):
    """Report whether male and female terms peak in different topics.

    Each row of ``model.components_`` is normalised to sum to 1 so that it can
    be read as a word distribution for that topic. For every topic the
    probability mass on the male terms and on the female terms is summed; the
    topic with the largest male mass is compared with the topic with the
    largest female mass.

    The previous version summed the raw ``components_`` values, which are
    unnormalised pseudo-counts, so a topic could "win" both genders simply by
    being large. It also compared argmax results when one gender had no term
    in the vocabulary at all (all-zero scores default to topic 0), which could
    report a separation that does not exist.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` as a
        (n_topics, n_features) array-like of non-negative weights.
    feature_names : sequence of str
        Vocabulary terms, positionally aligned with the columns of
        ``model.components_``.
    male_words, female_words : collection of str
        Terms that define each gender group.

    Returns
    -------
    int
        1 if the best male topic differs from the best female topic, else 0.
        0 is also returned when there are no topics or when either group has
        no term in ``feature_names``.
    """
    components = np.asarray(model.components_, dtype=float)
    if components.ndim != 2 or components.shape[0] == 0:
        return 0

    # Column positions of the gendered terms that actually occur in the vocabulary.
    male_cols = [i for i, term in enumerate(feature_names) if term in male_words]
    female_cols = [i for i, term in enumerate(feature_names) if term in female_words]
    if not male_cols or not female_cols:
        # Separation is undefined when one group is absent from the vocabulary.
        return 0

    # Normalise each topic row into a probability distribution over words.
    row_totals = components.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0  # leave all-zero rows as zeros, avoid 0/0
    topic_word_probs = components / row_totals

    # Compute total probability mass assigned to male and female words
    topic_male_scores = topic_word_probs[:, male_cols].sum(axis=1)
    topic_female_scores = topic_word_probs[:, female_cols].sum(axis=1)

    # Find the topic with highest male score and highest female score
    best_male_topic = int(np.argmax(topic_male_scores))
    best_female_topic = int(np.argmax(topic_female_scores))

    # If male and female words are in separate topics, return 1, else 0
    return 1 if best_male_topic != best_female_topic else 0

In [18]:
# Fit the LDA topic model on the document-term matrix X.
# n_topics is the number of topics to extract (adjust based on optimization);
# random_state is fixed so repeated runs give the same topics.
n_topics = 50
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
)
# lda_topics holds the per-document topic representation returned by the fit.
lda_topics = lda_model.fit_transform(X)

In [81]:
# Show top words per topic
def display_topics(model, feature_names, num_words=50):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` this prints a "Topic <index>:"
    header, then the ``num_words`` terms with the largest weights (heaviest
    first) joined by spaces, then a blank line.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of weights per topic).
    feature_names : sequence of str
        Vocabulary terms aligned with the columns of ``components_``.
    num_words : int, default 50
        How many top terms to print per topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending; the reversed slice walks back from the end to
        # take the num_words largest weights in descending order.
        top_positions = weights.argsort()[:-num_words - 1:-1]
        top_terms = [feature_names[position] for position in top_positions]
        print(f"Topic {topic_number}:")
        print(" ".join(top_terms))
        print()

# Vocabulary learned by the vectorizer, aligned with the columns of the
# document-term matrix; used to label each topic's top terms.
feature_names = vectorizer.get_feature_names_out()
display_topics(lda_model, feature_names, num_words=50)

Topic 0:
water bottle gold tonic macaroni aqua cucumber light essence refreshingly hill nitro start anchor goose burgess release home morocco spring excel mill guinea dining anticolic flying vale ginger raspberry park rabbit ale blank lemon fill trainer pig tiny pink peter sparkling chard bench black libra tippee premium discovery grey baby

Topic 1:
body wash skin gel lotion face sensitive shower expert men icing cream machine hand drained ring mist naked hair care dry junior health asparagus pistachio hardback shaped dove urban roll oil loreal cleanse ready baby glucosamine mid deep tate day super spray koko wonderland creamery gently elegant cleansing parker male

Topic 2:
ham rich pet tequila deluxe litter plush lotus figure brewery disinfectant cat smooth dog quinoa smoked crate especial nose bed toy famous tray vocation blocked panda cushion furniture spray one chile timer large maestro concentrated hygiene zing shopping aurora jute teat wood blind linen medium fresh tote scoop s

In [1]:
# Loading the package.
# Import the submodule by its dotted path instead of
# `from pyLDAvis import lda_model`: that form binds the bare name `lda_model`
# to the pyLDAvis module and overwrites the fitted LDA estimator of the same
# name when the notebook is run top to bottom.
import pyLDAvis
import pyLDAvis.lda_model
# Enable displaying our model in our notebook
pyLDAvis.enable_notebook()

In [19]:
# Build the interactive pyLDAvis view from the fitted LDA model, the
# document-term matrix and the vectorizer that produced it.
# sort_topics=False keeps the model's original topic order instead of
# re-sorting the topics.
lda_vis = pyLDAvis.lda_model.prepare(
    lda_model=lda_model,
    dtm=X,
    vectorizer=vectorizer,
    sort_topics=False,
)
lda_vis

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
