In [2]:
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup as bs
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [130]:
!pip install tensorflow --quiet

In [131]:
!pip install wordcloud --quiet

In [132]:
!pip install matplotlib --upgrade --quiet

### Read in dataset
Data is collected in the data_generation notebook.

In [3]:
# Load the collection reviews gathered by the data_generation notebook.
data = pd.read_csv('complete.csv')
# The file carries leftover saved-index columns ('Unnamed: 0', 'Unnamed: 0.1', ...).
# Drop whichever of them are present instead of a hard-coded list, so the cell
# does not raise KeyError if the CSV is regenerated with more or fewer of them.
stale_index_columns = [col for col in data.columns if col.startswith('Unnamed: ')]
data = data.drop(columns=stale_index_columns)
print(data.shape)
# Create the 'seasonyear' column: the year plus 0.0 for spring or 0.1 for fall,
# so that a year's spring season sorts before its fall season.
data['seasonyear'] = data['year'] + data['season'].map({'spring': 0.0, 'fall': 0.1})
# Keep the original row position as an explicit identifier column.
data['id'] = data.index
data.head()

(14551, 5)


Unnamed: 0,season,year,seasonyear,designer,text,id
0,spring,1990,1990.0,azzedine-alaia,An enfilade of stretchy knits in jewel tones w...,0
1,spring,1990,1990.0,romeo-gigli,Romeo Gigli is having a moment. The designer’s...,1
2,fall,1990,1990.1,azzedine-alaia,Alaïa worked with pinstripes and other power m...,2
3,spring,1991,1991.0,azzedine-alaia,"<a href=""https://www.vogue.com/article/bella-h...",3
4,spring,1991,1991.0,comme-des-garcons,"“Comme des Garçons,” Rei Kawakubo told <em>Vog...",4


In [4]:
# Discard reviews that have no text. dropna(subset=...) inspects only the 'text'
# column, so missing values in other columns are left untouched (the previous
# fillna('drop') sentinel overwrote NaNs in every column, could coerce numeric
# columns to object dtype, and would also have removed a review whose text was
# literally 'drop').
data = data.dropna(subset=['text'])
data.sort_values('text')

Unnamed: 0,season,year,seasonyear,designer,text,id
5976,spring,2014,2014.0,a-f-vandevorst,"""1998 | 2013,"" A.F. Vandevorst's program notes...",5976
2401,spring,2008,2008.0,zac-posen,"""<em>Days of Heaven</em>, the Shakers, and the...",2401
6133,spring,2014,2014.0,jasmin-shokrian,"""<em>Je pars habiter à Los Angeles,</em>"" read...",6133
1287,spring,2005,2005.0,emilio-pucci,"""<em>Je suis realiste; c'est moi. Je ne suis p...",1287
10475,spring,2019,2019.0,azzedine-alaia,"""<em>The past is clear; the future is obscure....",10475
...,...,...,...,...,...,...
8405,fall,2016,2016.1,no-6,“Your mom could wear it . . . you could wear i...,8405
9571,spring,2018,2018.0,celine,“You’re all I need to get by.” Method Man feat...,9571
7829,spring,2016,2016.0,gareth-pugh,"“You’re so money.” Recall, if you will, that u...",7829
13985,spring,2023,2023.0,marni,“‘Why am I here?’ All the time I’ve been think...,13985


### Text processing

In this step, the text is tokenized, then filtered to exclude HTML tags, designer names, names of individuals, and stopwords.


In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.parsing import preprocess_string
from unidecode import unidecode

# Split every hyphen-separated designer slug into its parts and collect all the
# parts in one flat list (e.g. 'azzedine-alaia' -> 'azzedine', 'alaia').
# Parts shared by several designers appear more than once.
designer_names = [
    part
    for slug in data['designer'].unique()
    for part in slug.split('-')
]


In [6]:
# Tokenize each review, then reshape to one row per (review id, token).
data['raw_tokens'] = data['text'].map(word_tokenize)
raw_tokens = data[['id', 'raw_tokens']].explode('raw_tokens')
raw_tokens.head()

Unnamed: 0,id,raw_tokens
0,0,An
0,0,enfilade
0,0,of
0,0,stretchy
0,0,knits


In [7]:
# creating list of people (proper nouns) mentioned in all collection reviews
from nltk.tag import StanfordNERTagger
import time
from nltk import pos_tag, word_tokenize

# Paths to the Stanford NER jar and the classifier model it loads.
jar = 'stanford/stanford-ner-4.2.0.jar'
model = 'stanford/classifiers/english.all.3class.distsim.crf.ser.gz'

st = StanfordNERTagger(model, jar)

start = time.time()
# Tag every distinct token once and keep the ones labelled PERSON.
text = raw_tokens['raw_tokens'].unique().tolist()
people = [token for token, label in st.tag(text) if label == 'PERSON']
end = time.time()
# Elapsed tagging time, in minutes.
print((end - start) / 60)

0.5767670194307963


In [8]:
# The per-review token lists were only needed to build `raw_tokens`; remove them.
data = data.drop('raw_tokens', axis=1)

In [None]:
# Cache of the set-based lookups used by `preprocess`. It is reset whenever this
# cell is re-run, so re-run the cell after rebuilding `people` or `designer_names`.
_PREPROCESS_FILTERS = {}


def _preprocess_filters():
    """Return the lookup sets used by ``preprocess``, building them on first use.

    Building the sets once avoids re-reading the stopword list and scanning the
    `people` / `designer_names` lists for every single token.
    """
    if not _PREPROCESS_FILTERS:
        # All three sets are built before the cache is touched, so a failure
        # (e.g. a missing stopword corpus) never leaves it half-populated.
        _PREPROCESS_FILTERS.update(
            people=frozenset(people),
            stopwords=frozenset(stopwords.words('english')),
            designer_names=frozenset(designer_names),
        )
    return _PREPROCESS_FILTERS


def preprocess(sentence):
    """Clean one review and return its processed tokens joined by single spaces.

    Steps, in order: strip HTML markup, remove ``http...`` URLs, tokenize, drop
    tokens found in `people`, drop punctuation tokens, drop English stopwords
    (case-sensitive match), transliterate to ASCII with ``unidecode``, drop
    tokens whose lowercase form is a designer-name part, and finally run
    gensim's ``preprocess_string`` on the re-joined text.

    Parameters
    ----------
    sentence : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The preprocessed tokens joined by spaces.
    """
    filters = _preprocess_filters()

    # clean text
    clean_text = bs(sentence, "html.parser").get_text()
    clean_text = re.sub(r"http\S+", "", clean_text)
    tokens = word_tokenize(clean_text)

    # filter out all names of people, punctuation, stopwords
    # (string.punctuation is a str, so that test stays a substring check)
    tokens = [
        word for word in tokens
        if word not in filters['people']
        and word not in string.punctuation
        and word not in filters['stopwords']
    ]
    tokens = [unidecode(word) for word in tokens]
    # designer-name parts are compared case-insensitively, after transliteration
    tokens = [word for word in tokens if word.lower() not in filters['designer_names']]
    preprocessed_tokens = preprocess_string(' '.join(tokens))
    return ' '.join(preprocessed_tokens)

# Preprocess every review and time the whole pass.
start = time.time()
data['preprocessed_sentences'] = data['text'].map(preprocess)
end = time.time()
print(end - start, 'seconds')

In [None]:
# Split the elapsed time into whole minutes plus the leftover seconds. The
# previous message printed the full fractional minute count as "minutes", so
# the seconds were counted twice (e.g. "3.5 minutes and 30.0 seconds").
elapsed_minutes, elapsed_seconds = divmod(end - start, 60)
print('Preprocessing took', int(elapsed_minutes), 'minutes and', round(elapsed_seconds, 2), 'seconds.')

### Creation of designer information table
- designer name
- total # of collections
- first season
- consistency
- prevalence

In [None]:
# One row per designer with the number of collections (reviews) in the data,
# ordered from most to fewest collections.
designer_freq = (
    data['designer']
    .value_counts()
    .rename_axis('designer')
    .reset_index(name='collections')
)
designer_freq.head()

### Creation of calculated metrics
- *consistency*: The total number of collections made by a designer divided by the total number of seasons since the designer’s initial season (inclusive of the first season).
- *prevalence*: An adapted version of the consistency metric that penalizes designers who have few collections. This was put in place for two reasons - 1) to penalize designers with high consistency values as a result of having only been around for a short period of time (ex: a designer whose first collection was in the most recent season has a consistency value of 1.0) and 2) to further penalize designers who have low consistency and few collections. I created the prevalence formula using a **penalty term**, $α$.
    - α comes in several forms, listed here in order from least to most severe:
        1. $α = \frac{1}{collections^2}$
        2. $α = \frac{1}{collections}$
        3. $α = \frac{1}{\sqrt{collections}}$
        4. $α = \frac{1}{\sqrt[3]{collections}}$
    - This helps to ensure that designers who have only been in the most recent season (consistency = 1) are not weighted equally with designers who have high consistency values after having been around for many years

In my final analysis, I used the third form of $α$, $α = \frac{1}{\sqrt{collections}}$, as it penalized the brand new designers with high consistency values without limiting high prevalence values to exclusively the oldest, most established designers. 

The metric is then calculated as $prevalence = consistency - \frac{1}{\sqrt{collections}}$.

In [None]:
# Earliest season in which each designer appears.
first_seasons = data.groupby('designer')['seasonyear'].min().to_dict()
designer_freq['first_season'] = designer_freq['designer'].map(first_seasons)
total_seasons = len(data['seasonyear'].unique())
sample = designer_freq.copy()

# Every distinct season in the data, sorted once so the "seasons since" count
# can be computed for all designers at once instead of re-filtering per designer.
all_seasons = np.sort(data['seasonyear'].unique())
first_season = designer_freq['first_season'].astype(float)
collections = designer_freq['collections'].astype(float)

# Number of seasons in the data at or after the designer's first season
# (inclusive): everything from the first season's sorted position to the end.
seasons_since = len(all_seasons) - np.searchsorted(
    all_seasons, first_season.to_numpy(), side='left'
)
# NOTE(review): hard-coded overrides kept from the original loop. They agree
# with the count above only while 2023.0 and 2023.1 are the last two seasons in
# the data; confirm whether they are still wanted if later seasons are added.
seasons_since = np.where(first_season == 2023.1, 1, seasons_since)
seasons_since = np.where(first_season == 2023.0, 2, seasons_since)

# consistency = collections / seasons since the first season;
# each prev_k subtracts one candidate penalty term from it.
consistency = collections / seasons_since
designer_freq['consistency'] = consistency
designer_freq['prev_1'] = consistency - (1 / collections) ** 2
designer_freq['prev_2'] = consistency - 1 / collections
designer_freq['prev_3'] = consistency - 1 / np.sqrt(collections)
designer_freq['prev_4'] = consistency - 1 / collections ** (1 / 3)

designer_freq.head()

In [None]:
import seaborn as sns
# Plot each candidate metric against the number of collections (one panel per
# metric), coloured by the designer's first season.
metric_columns = ['consistency', 'prev_1', 'prev_2', 'prev_3', 'prev_4']
pp = sns.pairplot(
    designer_freq,
    x_vars=['collections'],
    y_vars=metric_columns,
    hue='first_season',
    palette='viridis',
    height=4,
)
pp.fig.suptitle("Prevalence penalty terms")
plt.tight_layout(pad=3.0)
plt.show()

In [None]:
import numpy as np
# The final prevalence metric is the third penalty form,
# prevalence = consistency - 1/sqrt(collections), i.e. the 'prev_3' column.
# (This previously selected 'prev_2', which contradicts the formula stated in
# the "Creation of calculated metrics" section.)
designer_freq['prevalence'] = designer_freq['prev_3']
designers = designer_freq[['designer', 'collections', 'first_season', 'consistency', 'prevalence']].copy()

# Upper bounds of the prevalence classes: the 20th/40th/60th/80th/90th
# percentiles, plus +inf so that every finite value falls into some class.
percentile_values = [np.percentile(designer_freq['prevalence'], x) for x in [20, 40, 60, 80, 90]]
class_boundaries = list(percentile_values) + [np.inf]
print(class_boundaries)

def assign_class(value, boundaries=None):
    """Return the index of the first boundary that `value` does not exceed.

    Parameters
    ----------
    value : float
        The value to classify.
    boundaries : sequence of float, optional
        Upper bounds of the classes, checked in order. Defaults to the
        notebook-level `class_boundaries`.

    Returns
    -------
    int or None
        Position of the first boundary with ``value <= boundary``, or None when
        no boundary satisfies the comparison (for example when `value` is NaN).
    """
    if boundaries is None:
        boundaries = class_boundaries
    for i, boundary in enumerate(boundaries):
        if value <= boundary:
            return i
    # No boundary matched; make the previously implicit fall-through explicit.
    return None

# Label every designer with the class that its prevalence value falls into
designers['class'] = designers['prevalence'].map(assign_class)


In [None]:
# Summary statistics of first season and collection count within each class.
(
    designers
    .groupby('class')[['first_season', 'collections']]
    .describe()
)

In [None]:
# Designers ordered from highest to lowest consistency.
designers.sort_values(by='consistency', ascending=False)

In [None]:
# Rows 120-139 (by position) of the designers ranked by descending prevalence.
designers.sort_values(by='prevalence', ascending=False).iloc[120:140]

In [None]:
# Build designer -> metric lookups, then copy each metric onto the
# per-collection rows of `data` by designer name.
consistency_dict, prev_dict, class_dict = (
    dict(zip(designers['designer'], designers[column]))
    for column in ('consistency', 'prevalence', 'class')
)
for column, lookup in (
    ('consistency', consistency_dict),
    ('prevalence', prev_dict),
    ('class', class_dict),
):
    data[column] = data['designer'].map(lookup)

In [None]:
# Number of designers assigned to each class.
designers.loc[:, 'class'].value_counts()

In [127]:
# Persist both tables for later use. The index is written as well, so each
# file gains an unnamed leading column when it is read back.
for frame, path in ((designers, 'designers.csv'), (data, 'collections.csv')):
    frame.to_csv(path)