In [66]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np

# Load the Barbie listings; the path is relative to the notebook's working directory.
barbies_csv_path = 'DATA/Barbies_Data.csv'
df = pd.read_csv(barbies_csv_path)

In [60]:
# Drop 'Unnamed: 0', which is a duplicate of the index.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' makes
# the cell safe to re-run: a second execution no longer raises KeyError once the
# column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [55]:
# The stemmer does not depend on the corpus download, so create it first.
stemmer = PorterStemmer()

# Make sure the stop-word corpus is available locally, then keep the English
# list as a set so membership checks during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anastasiyaayala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
def preprocess_text(text):
    r"""Normalise a title into a space-joined string of stemmed tokens.

    The text is lower-cased, every run of non-word characters (``\W+``) is
    replaced by a single space, tokens present in the module-level
    ``stop_words`` set are dropped, and the remaining tokens are stemmed with
    the module-level ``stemmer``.

    Parameters
    ----------
    text : str or missing value
        Raw title. ``None``/``NaN`` produce an empty string instead of raising
        ``AttributeError``; any other non-string value is converted with
        ``str()`` first.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        # A missing title should yield "no tokens", not crash `.apply`.
        if pd.isna(text):
            return ''
        text = str(text)

    text = re.sub(r'\W+', ' ', text.lower())
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)


In [61]:
# Render the DataFrame as the cell output (pandas elides the middle rows of a long frame).
df

Unnamed: 0,Title,Product #,Released,Lowest,Average,Highest,URL,processed_text,topic
0,Cinderella,16900,1996,1,19.0,55,https://www.barbiedb.com/wp-content/uploads/20...,cinderella,3
1,2019 Holiday,FXF01,2019,9,26.0,67,https://www.barbiedb.com/wp-content/uploads/20...,2019 holiday,5
2,The X-Files Giftset,19630,1998,11,28.0,57,https://www.barbiedb.com/wp-content/uploads/20...,x file giftset,2
3,Winter in New York Barbie,19429,1998,4,26.0,106,https://www.barbiedb.com/wp-content/uploads/20...,winter new york barbi,1
4,Holiday Visions Winter Fantasy Barbie,B2519,2003,1,24.0,98,https://www.barbiedb.com/wp-content/uploads/20...,holiday vision winter fantasi barbi,3
...,...,...,...,...,...,...,...,...,...
1105,Barbie Cali Girl Pacific Beach Pool Playset,B5205,2003,45,,60,https://www.barbiedb.com/wp-content/uploads/20...,barbi cali girl pacif beach pool playset,2
1106,Barbie Cali Girl Lea Doll,G8666,2005,25,,60,https://www.barbiedb.com/wp-content/uploads/20...,barbi cali girl lea doll,1
1107,Barbie Cali Girl Doll,68971,2004,10,,29,https://www.barbiedb.com/wp-content/uploads/20...,barbi cali girl doll,1
1108,Barbie Cali Girl Christie Doll,C6462,2004,25,,38,https://www.barbiedb.com/wp-content/uploads/20...,barbi cali girl christi doll,1


In [58]:
# Step 1: Preprocessing - stemmed, stop-word-free version of every title
df['processed_text'] = df['Title'].apply(preprocess_text)

# Step 2: Vectorization
# NOTE: the TF-IDF matrix `X` is not consumed by the LDA step below, which
# works on bag-of-words counts; it is kept in case other cells rely on it.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])

# Step 3: Topic Modeling (using Gensim's LDA)
from gensim import corpora, models

NUM_TOPICS = 6
LDA_PASSES = 10
# LDA training is stochastic. Without a fixed seed the topic ids change on every
# run, which silently breaks any later filter on a hard-coded topic number.
LDA_RANDOM_STATE = 42

# Tokenise once and reuse the tokens for both the dictionary and the corpus.
tokenized_titles = df['processed_text'].str.split().tolist()
dictionary = corpora.Dictionary(tokenized_titles)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_titles]
lda_model = models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_STATE,
)

# Step 4: Assigning Labels
# Each document gets the id of its highest-probability topic; `lda_model[bow]`
# yields (topic_id, probability) pairs.
df['topic'] = [
    max(lda_model[bow], key=lambda topic_prob: topic_prob[1])[0]
    for bow in corpus
]






In [63]:
# Keep only the rows whose 'topic' value equals 2.
topic2 = df.loc[df['topic'].eq(2)]

In [64]:
# Preview up to the first 40 rows of the topic-2 subset.
topic2.head(40)

Unnamed: 0,Title,Product #,Released,Lowest,Average,Highest,URL,processed_text,topic
2,The X-Files Giftset,19630,1998,11,28.0,57,https://www.barbiedb.com/wp-content/uploads/20...,x file giftset,2
7,40th Anniversary Barbie,21384,1999,1,48.0,301,https://www.barbiedb.com/wp-content/uploads/20...,40th anniversari barbi,2
14,Barbie Sign Language,25837,1999,17,301.0,73,https://www.barbiedb.com/wp-content/uploads/20...,barbi sign languag,2
18,Evening Sophisticate Barbie,19361,1998,10,8.0,70,https://www.barbiedb.com/wp-content/uploads/20...,even sophist barbi,2
19,Water Lily Barbie,17783,1997,23,8.0,101,https://www.barbiedb.com/wp-content/uploads/20...,water lili barbi,2
25,NASCAR 50th Anniversary Barbie,20442,1998,1,9.0,29,https://www.barbiedb.com/wp-content/uploads/20...,nascar 50th anniversari barbi,2
28,50th Golden Anniversary of Mattel Barbie,14479,1995,15,9.0,200,https://www.barbiedb.com/wp-content/uploads/20...,50th golden anniversari mattel barbi,2
37,Harpist Angel Barbie,18894,1998,9,21.0,100,https://www.barbiedb.com/wp-content/uploads/20...,harpist angel barbi,2
40,Barbie 30th Anniversary Francie,14608,1996,22,42.0,70,https://www.barbiedb.com/wp-content/uploads/20...,barbi 30th anniversari franci,2
58,Kool-Aid Wacky Warehouse Barbie,11763,1994,1,28.0,31,https://www.barbiedb.com/wp-content/uploads/20...,kool aid wacki warehous barbi,2


<span style="color: #E0218A; font-size: 30px;"> Although it seems like the ML model did a very general categorization, it makes certain patterns in the 'Barbie' titles easier to see. There is also overlap between categories.</span>

In [170]:
# Create a new, initially empty column for the Barbie categories.
# The column is created with object dtype (still filled with NaN) because it
# will hold string labels: writing strings into a float64 NaN column via `.loc`
# is an incompatible-dtype assignment that recent pandas versions deprecate.
df['category'] = pd.Series(np.nan, index=df.index, dtype=object)

In [566]:
# Keyword lists used to categorise Barbies, based on the most common words in the titles.
#
# Conventions (as used by the assignment loop further down):
#   * the FIRST element of every list is the label written to df['category'];
#     it is also tested as a keyword like every other element;
#   * keywords are matched as case-insensitive substrings of the title.
#
# Every entry must be a non-empty string: an empty string is a substring of
# every title and would therefore match ALL rows. The stray '' that used to end
# `special_occasion` has been removed for that reason.
holiday = ['holiday','birthday',  'holidays', 'christmas', 'valentine', 'winter', 'new year', 'Ornament', 'Halloween']
ken = ['ken']
careers = ['careers', 'career', 'doctor','teacher', 'nurse', 'hostess','trainer', 'Photographer', 'dentist', 'babysitter', 'engineer', 'Entrepreneur', 'sitter','Barbie I Can Be','president','gymnast','model','Travel Agent', 'Stewardess','Video Girl','Music Star','Officer','Military Officer','Army Officer','Political','Public Service','Veterinarian','Astronaut','Transportation','Arts','Designer','Film director','Movie','actress','Hairdresser','Pet','TV Chef','Rockstar','Chef','Business','Sport','Football','Figure','Pediatrician','Harpist','Singer','Soccer',]
fashion_beauty = ['fashion_beauty','fashion','fashions',  'hair','elegance','Armani','pajama','Stylin','styling','salon','beauty','Corduroy','Ralph Lauren', 'chic','Wardrobe', 'dress', 'style', 'outfit']
vacation = ['vacation','beach', 'getaway','summer', 'Cruise']
horoscopes = ['horoscopes', 'aries', 'taurus', 'gemini', 'cancer', 'leo', 'virgo', 'libra', 'scorpio', 'sagittarius', 'capricorn', 'aquarius', 'pisces']
nationalities = [ 'nationalities', 'mexican', 'indian', 'chinese', 'japanese', 'brazilian', 'french', 'italian', 'australian', 'egyptian',
    'south african', 'russian', 'canadian', 'german', 'spanish', 'british', 'argentinian', 'chilean', 'colombian',
    'peruvian', 'venezuelan', 'greek', 'turkish', 'emirati', 'korean', 'malaysian', 'filipino', 'singaporean',
    'thai', 'vietnamese', 'new zealander', 'mexico', 'india', 'china', 'japan', 'brazil', 'france', 'italy', 'australia', 'egypt', 'south africa',
    'russia', 'canada', 'germany', 'spain', 'united kingdom', 'argentina', 'chile', 'colombia', 'peru', 'venezuela',
    'greece', 'turkey', 'united arab emirates', 'korea', 'malaysia', 'philippines', 'singapore', 'thailand',
    'vietnam', 'new zealand','Czechoslovakian','polish', 'Scottish', 'Native American', 'African American','Joie de Vivre Barbie']
special_occasion = ['special_occasion', 'ball', 'anniversary','Celebration', 'gala','Renaissance','occasion', 'party','wedding', 'Evening','silver','platinum', 'bride','Pepsi']
animals = ['pets','animals', 'dog','horse', 'pony','pup', 'kittens','pet']
family_friends = ['family_friends', 'sister', 'sisters','friend', 'friends']
victorian = ['victorian']
celebrities = ['celebrities','Marilyn Monroe', 'Audrey Hepburn', 'Elizabeth Taylor', 'Grace Kelly', 'Beyoncé', 'Zendaya', 'Ava DuVernay', 'Ella Fitzgerald', 'Ibtihaj Muhammad', 'Yara Shahidi', 'Billie Jean King', 'Amelia Earhart', 'Florence Nightingale', 'Susan B. Anthony', 'Katherine Johnson', 'Ella Toone', 'Leyla Piedayesh']
california_girl = ['california_girl','cali','california']
movie_show_chars = ['movie_show_chars', 'disney', 'mermaid','cinderella','x-files','hunger games', 'wizard','nutcracker','twilight', 'princess', 'wonder woman', 'Hansel & Gretel', 'three bears', 'supergirl','fairy','Snow White','Juliet','Arabian Nights','Merliah','Pegasus','Three Musketeers', 'Three Musketeer', 'Alexa','The Secret Door', 'Curious George','Rebelde TV Barbie Doll','Daria Celebutante','Barbie Batgirl','Midge Wedding Day','Venetian Opulence','Benefit Performance Barbie']
cars = ['vehicles','Vehicle','cars', 'convertible', 'jeep','Converible','Speedboat','Microbus', 'Cadillac']
hobby_activity = ['hobby_activity','Keyboard','bike','Camping','Workin’ Out']
furniture = ['furniture','vanity', 'refrigerator', 'clock', 'appliance', 'closet', 'Kitchen','Bath','Telephone']
dreamhouse = ['houses','dreamhouse', 'house']
city_girl = ['city_girl','Shanghai', 'Parisienne', 'milan','new york']
cheerleader = ['cheerleader','Cheerleader']
famous = ['famous','music_artist', 'Diana Ross', 'Mia Hamm']
dining = ['dining', 'picnic', 'dinner','cafe', 'Yogurt', 'Ice Cream','Café']
royals =['royals','royal','Empress']
sets = ['sets','set']




In [567]:
# All keyword lists, in the order they are applied to the titles.
categories = [
    holiday,
    ken,
    careers,
    fashion_beauty,
    vacation,
    horoscopes,
    nationalities,
    special_occasion,
    animals,
    family_friends,
    victorian,
    celebrities,
    california_girl,
    movie_show_chars,
    cars,
    hobby_activity,
    furniture,
    dreamhouse,
    city_girl,
    cheerleader,
    famous,
    dining,
    royals,
    sets,
]

In [568]:
# Label every title with the category whose keyword list matches it.
# Matching is a case-insensitive substring test; the first element of each list
# is the label. Lists are applied in order, so when a title matches several
# lists the LAST matching one wins.

# Lower-case the titles once instead of once per keyword; missing titles become
# '' so they simply match nothing instead of raising on `.lower()`.
titles_lower = df['Title'].fillna('').astype(str).str.lower()

for category in categories:
    label = category[0]
    # Skip blank keywords: '' is a substring of every string and would
    # otherwise assign this label to every row.
    keywords = [keyword.lower() for keyword in category if keyword.strip()]
    # `is_match` instead of `filter`, so the builtin is not shadowed for the
    # rest of the notebook session.
    is_match = titles_lower.apply(
        lambda title: any(keyword in title for keyword in keywords)
    ).astype(bool)
    df.loc[is_match, 'category'] = label


In [569]:
# Number of rows that have been given a (non-null) category.
df['category'].notna().sum()

931

In [570]:
# Barbies that still need a category: rows where 'category' is null.
missing = df.loc[df['category'].isnull()]

In [571]:
# Show up to the last 60 rows that are still uncategorized.
missing.tail(60)

Unnamed: 0.1,Unnamed: 0,Title,Product #,Released,Lowest,Average,Highest,URL,category
482,482,Bedtime Baby Barbie Doll And Krissy Doll,56616,2002,41,,70,https://www.barbiedb.com/wp-content/uploads/20...,
484,484,Inuit Legend Barbie,G8892,2005,150,,500,https://www.barbiedb.com/wp-content/uploads/20...,
498,498,Barbie Secret Messages Computer Center,67275,2000,70,,95,https://www.barbiedb.com/wp-content/uploads/20...,
499,499,Barbie My Scene Snow Glam Chelsea Doll,L9340,2008,22,,45,https://www.barbiedb.com/wp-content/uploads/20...,
501,501,Barbie Mariposa Doll,Y6372,2013,25,,40,https://www.barbiedb.com/wp-content/uploads/20...,
512,512,Live Action Barbie Christie on Stage,1175,1971,160,,310,https://www.barbiedb.com/wp-content/uploads/20...,
513,513,Barbie Walking Jamie,1132,1970,450,,650,https://www.barbiedb.com/wp-content/uploads/20...,
517,517,Barbie My Scene Goes Hollywood Nolee Doll,G6132,2005,40,,55,https://www.barbiedb.com/wp-content/uploads/20...,
518,518,Barbie Midge’s Ensemble,1012,1964,450,,525,https://www.barbiedb.com/wp-content/uploads/20...,
524,524,Barbie Doll Accessories,923,1961,200,,250,https://www.barbiedb.com/wp-content/uploads/20...,
