In [None]:
# I am turning the color-name columns into dummy variables so that I can build a classification model.
# The model will classify whether each Vogue cover from 1950 to the present was published under
# Anna Wintour, the current editor in chief at Vogue.
# Anna Wintour's first Vogue issue was November 1988.

### Imports

In [2]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import cv2
from collections import Counter
from skimage.color import rgb2lab, deltaE_cie76
import os
import time
from time import sleep
from tqdm import tqdm
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

%matplotlib inline

In [4]:
# placing colors_dict here just for reference 

# Named colors mapped to three-integer tuples (presumably (R, G, B) - confirm against
# the code that produced the xkcd* columns). Entries are grouped by hue family; the
# insertion order is unchanged.
colors_dict = {
    # greens
    'LIGHT GREEN': (150, 249, 123),
    'GREEN': (21, 176, 26),
    'DARK GREEN': (3, 53, 0),
    # blues
    'LIGHT BLUE': (149, 208, 252),
    'BLUE': (34, 66, 199),
    'DARK BLUE': (0, 3, 91),
    # yellows and orange
    'LIGHT YELLOW': (255, 254, 122),
    'YELLOW': (255, 255, 20),
    'DARK YELLOW': (213, 182, 10),
    'ORANGE': (249, 115, 6),
    # reds
    'LIGHT RED': (255, 71, 76),
    'RED': (229, 0, 0),
    'DARK RED': (132, 0, 0),
    # purples
    'LIGHT PURPLE': (191, 119, 246),
    'PURPLE': (126, 30, 156),
    'DARK PURPLE': (53, 6, 62),
    # pinks
    'LIGHT PINK': (255, 209, 223),
    'PINK': (255, 129, 192),
    'DARK PINK': (203, 65, 107),
}

### Upload DataFrame

In [4]:
# Load the cover dataset from the CSV one directory above the working directory.
# `df` is kept unmodified below; derived frames are built from it.
df = pd.read_csv('../images_1950_final.csv')

In [5]:
df.head()

Unnamed: 0,date,month,year,name,r0,g0,b0,f0,r1,g1,...,xkcd6,xkcd7,distance0,distance1,distance2,distance3,distance4,distance5,distance6,distance7
0,1950-01-01,1,1950,19500101.jpg,131,131,120,56603,211,211,...,LIGHT PINK,DARK PINK,98.534258,49.234135,43.289722,85.305334,91.618775,90.60905,64.412732,53.99074
1,1950-02-01,2,1950,19500201.jpg,164,153,126,48448,225,219,...,LIGHT PINK,DARK PINK,97.061836,35.44009,92.276758,47.212287,48.435524,86.469648,29.529646,79.536155
2,1950-02-15,2,1950,19500215.jpg,102,83,52,47713,156,141,...,LIGHT GREEN,LIGHT GREEN,91.815031,89.409172,54.598535,75.696763,86.844689,89.565618,86.815897,86.463865
3,1950-03-01,3,1950,19500301.jpg,13,12,18,46334,197,184,...,LIGHT PINK,DARK PURPLE,45.880279,83.09633,63.584589,87.555697,85.229103,43.428102,50.497525,71.554175
4,1950-03-15,3,1950,19500315.jpg,127,113,108,31224,170,151,...,LIGHT PINK,DARK PURPLE,89.894382,97.190535,73.885046,99.854895,40.509258,86.884981,56.753854,79.536155


In [6]:
# NOTE(review): this repeats the preview shown by the previous cell; it can be removed.
df.head()

Unnamed: 0,date,month,year,name,r0,g0,b0,f0,r1,g1,...,xkcd6,xkcd7,distance0,distance1,distance2,distance3,distance4,distance5,distance6,distance7
0,1950-01-01,1,1950,19500101.jpg,131,131,120,56603,211,211,...,LIGHT PINK,DARK PINK,98.534258,49.234135,43.289722,85.305334,91.618775,90.60905,64.412732,53.99074
1,1950-02-01,2,1950,19500201.jpg,164,153,126,48448,225,219,...,LIGHT PINK,DARK PINK,97.061836,35.44009,92.276758,47.212287,48.435524,86.469648,29.529646,79.536155
2,1950-02-15,2,1950,19500215.jpg,102,83,52,47713,156,141,...,LIGHT GREEN,LIGHT GREEN,91.815031,89.409172,54.598535,75.696763,86.844689,89.565618,86.815897,86.463865
3,1950-03-01,3,1950,19500301.jpg,13,12,18,46334,197,184,...,LIGHT PINK,DARK PURPLE,45.880279,83.09633,63.584589,87.555697,85.229103,43.428102,50.497525,71.554175
4,1950-03-15,3,1950,19500315.jpg,127,113,108,31224,170,151,...,LIGHT PINK,DARK PURPLE,89.894382,97.190535,73.885046,99.854895,40.509258,86.884981,56.753854,79.536155


### Resizing and Updating DataFrame

In [7]:
# Drop the r/g/b value columns, since they are now represented in the color columns.
# Starting at position 4, the columns come in groups of four; the first three of each
# group (positions 4-6, 8-10, ..., 32-34) are dropped and the fourth is kept.
rgb_columns = []
for group_start in range(4, 33, 4):
    rgb_columns.extend(df.columns[group_start:group_start + 3])
df_small = df.drop(columns=rgb_columns)

In [8]:
df_small.head()

Unnamed: 0,date,month,year,name,f0,f1,f2,f3,f4,f5,...,xkcd6,xkcd7,distance0,distance1,distance2,distance3,distance4,distance5,distance6,distance7
0,1950-01-01,1,1950,19500101.jpg,56603,39176,14503,7215,7099,3956,...,LIGHT PINK,DARK PINK,98.534258,49.234135,43.289722,85.305334,91.618775,90.60905,64.412732,53.99074
1,1950-02-01,2,1950,19500201.jpg,48448,46259,19303,4585,4492,4218,...,LIGHT PINK,DARK PINK,97.061836,35.44009,92.276758,47.212287,48.435524,86.469648,29.529646,79.536155
2,1950-02-15,2,1950,19500215.jpg,47713,37210,11183,9965,9687,8399,...,LIGHT GREEN,LIGHT GREEN,91.815031,89.409172,54.598535,75.696763,86.844689,89.565618,86.815897,86.463865
3,1950-03-01,3,1950,19500301.jpg,46334,33020,27333,11025,5143,3673,...,LIGHT PINK,DARK PURPLE,45.880279,83.09633,63.584589,87.555697,85.229103,43.428102,50.497525,71.554175
4,1950-03-15,3,1950,19500315.jpg,31224,29446,18703,17859,15900,8525,...,LIGHT PINK,DARK PURPLE,89.894382,97.190535,73.885046,99.854895,40.509258,86.884981,56.753854,79.536155


In [9]:
# Positions 20-27 hold the xkcd color-name columns (xkcd0 ... xkcd7); these are the
# columns that get turned into dummy variables further down.
df_small.columns[20:28]

Index(['xkcd0', 'xkcd1', 'xkcd2', 'xkcd3', 'xkcd4', 'xkcd5', 'xkcd6', 'xkcd7'], dtype='object')

In [10]:
# Keep the eight columns found at positions 12-19 of df_small in their own frame,
# selecting them by name from the original df.
color_column_names = df_small.columns[12:20]
df_colors = df[color_column_names]

In [11]:
df_colors.head()

Unnamed: 0,color0,color1,color2,color3,color4,color5,color6,color7
0,[131 131 120],[211 211 201],[45 45 45],[188 187 175],[100 83 78],[166 164 150],[200 199 191],[164 100 94]
1,[164 153 126],[225 219 207],[132 122 92],[217 210 195],[46 50 81],[195 183 161],[233 227 215],[144 71 54]
2,[102 83 52],[156 141 104],[62 56 42],[183 132 78],[141 108 64],[202 154 97],[204 188 153],[188 173 139]
3,[13 12 18],[197 184 169],[210 196 180],[167 142 128],[131 109 95],[34 31 32],[218 204 189],[85 70 62]
4,[127 113 108],[170 151 138],[198 190 180],[140 136 138],[37 38 43],[188 165 150],[209 200 191],[86 77 76]


In [12]:
# Remove the color columns that were copied into df_colors, by name rather than position.
# A positional drop of columns[12:20] removes a different set of columns every time the
# cell is executed (a second run would delete the xkcd columns); dropping by name with
# errors='ignore' turns a re-run into a no-op.
df_small = df_small.drop(columns=df_colors.columns, errors='ignore')

In [13]:
df_small.shape

(1019, 28)

In [14]:
# One-hot encode the eight xkcd color-name columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas version:
# older releases default to uint8, while pandas 2.0+ defaults to bool.
xkcd_columns = ['xkcd0', 'xkcd1', 'xkcd2', 'xkcd3', 'xkcd4', 'xkcd5', 'xkcd6', 'xkcd7']
df_small = pd.get_dummies(data=df_small, columns=xkcd_columns, dtype='uint8')

In [15]:
df_small.shape

(1019, 170)

In [16]:
df_small.head()

Unnamed: 0,date,month,year,name,f0,f1,f2,f3,f4,f5,...,xkcd7_LIGHT GREEN,xkcd7_LIGHT PINK,xkcd7_LIGHT PURPLE,xkcd7_LIGHT RED,xkcd7_LIGHT YELLOW,xkcd7_ORANGE,xkcd7_PINK,xkcd7_PURPLE,xkcd7_RED,xkcd7_YELLOW
0,1950-01-01,1,1950,19500101.jpg,56603,39176,14503,7215,7099,3956,...,0,0,0,0,0,0,0,0,0,0
1,1950-02-01,2,1950,19500201.jpg,48448,46259,19303,4585,4492,4218,...,0,0,0,0,0,0,0,0,0,0
2,1950-02-15,2,1950,19500215.jpg,47713,37210,11183,9965,9687,8399,...,1,0,0,0,0,0,0,0,0,0
3,1950-03-01,3,1950,19500301.jpg,46334,33020,27333,11025,5143,3673,...,0,0,0,0,0,0,0,0,0,0
4,1950-03-15,3,1950,19500315.jpg,31224,29446,18703,17859,15900,8525,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Anna Wintour's first Vogue issue was November 1988 - locate that issue's row in the dataframe
is_first_wintour_issue = df_small['date'] == '1988-11-01'
df_small[is_first_wintour_issue]

Unnamed: 0,date,month,year,name,f0,f1,f2,f3,f4,f5,...,xkcd7_LIGHT GREEN,xkcd7_LIGHT PINK,xkcd7_LIGHT PURPLE,xkcd7_LIGHT RED,xkcd7_LIGHT YELLOW,xkcd7_ORANGE,xkcd7_PINK,xkcd7_PURPLE,xkcd7_RED,xkcd7_YELLOW
650,1988-11-01,11,1988,19881101.jpg,46181,24451,14528,13634,11341,9384,...,0,0,0,0,0,0,1,0,0,0


In [18]:
# Label each cover: 1 when its date is on or after 1988-11-01, otherwise 0.
# The comparison runs on the 'date' values as loaded from the CSV; YYYY-MM-DD strings
# sort chronologically, so a plain >= comparison is enough.
df_small['anna'] = (df_small['date'] >= '1988-11-01').astype(int)

In [19]:
df_small.tail()

Unnamed: 0,date,month,year,name,f0,f1,f2,f3,f4,f5,...,xkcd7_LIGHT PINK,xkcd7_LIGHT PURPLE,xkcd7_LIGHT RED,xkcd7_LIGHT YELLOW,xkcd7_ORANGE,xkcd7_PINK,xkcd7_PURPLE,xkcd7_RED,xkcd7_YELLOW,anna
1014,2019-03-01,3,2019,20190301.jpg,21820,21014,20015,19990,17962,14957,...,0,0,0,0,0,0,0,0,0,1
1015,2019-04-01,4,2019,20190401.jpg,28661,21332,19714,17191,15244,8503,...,1,0,0,0,0,0,0,0,0,1
1016,2019-05-01,5,2019,20190501.jpg,25246,23911,18817,18701,15907,12096,...,0,0,0,0,0,0,0,0,0,1
1017,2019-06-01,6,2019,20190601.jpg,25348,22883,18987,17939,16886,15400,...,1,0,0,0,0,0,0,0,0,1
1018,2019-07-01,7,2019,20190701.jpg,30315,19394,17941,15180,14962,12245,...,0,0,0,0,0,1,0,0,0,1


In [20]:
# Save the encoded frame (dummy columns plus the 'anna' label) to the working directory.
# NOTE(review): to_csv writes the index as an extra unnamed column by default; pass
# index=False if downstream readers should not see it - confirm before changing.
df_small.to_csv('df_small.csv')

In [21]:
df_small.columns[20:-1]

Index(['xkcd0_BLUE', 'xkcd0_DARK GREEN', 'xkcd0_DARK PINK',
       'xkcd0_DARK PURPLE', 'xkcd0_DARK RED', 'xkcd0_DARK YELLOW',
       'xkcd0_GREEN', 'xkcd0_LIGHT BLUE', 'xkcd0_LIGHT GREEN',
       'xkcd0_LIGHT PINK',
       ...
       'xkcd7_LIGHT GREEN', 'xkcd7_LIGHT PINK', 'xkcd7_LIGHT PURPLE',
       'xkcd7_LIGHT RED', 'xkcd7_LIGHT YELLOW', 'xkcd7_ORANGE', 'xkcd7_PINK',
       'xkcd7_PURPLE', 'xkcd7_RED', 'xkcd7_YELLOW'],
      dtype='object', length=150)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [23]:
# Features: every one-hot indicator created by get_dummies (their names start with 'xkcd').
# Selecting by name instead of the positional slice [20:-1] keeps the 'anna' label out of
# X even if columns are later appended, removed, or reordered.
feature_columns = [col for col in df_small.columns if col.startswith('xkcd')]
X = df_small[feature_columns]
# Target: 1 for covers dated on or after 1988-11-01, 0 otherwise.
y = df_small['anna']

In [24]:
# Class balance of the target. The majority class (0) is about 64% of the covers, so a
# model that always predicts 0 would already reach roughly 0.64 accuracy.
y.value_counts(normalize = True)

0    0.63788
1    0.36212
Name: anna, dtype: float64

In [25]:
X.head()

Unnamed: 0,xkcd0_BLUE,xkcd0_DARK GREEN,xkcd0_DARK PINK,xkcd0_DARK PURPLE,xkcd0_DARK RED,xkcd0_DARK YELLOW,xkcd0_GREEN,xkcd0_LIGHT BLUE,xkcd0_LIGHT GREEN,xkcd0_LIGHT PINK,...,xkcd7_LIGHT GREEN,xkcd7_LIGHT PINK,xkcd7_LIGHT PURPLE,xkcd7_LIGHT RED,xkcd7_LIGHT YELLOW,xkcd7_ORANGE,xkcd7_PINK,xkcd7_PURPLE,xkcd7_RED,xkcd7_YELLOW
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Hold out 30% of the covers for testing, stratified on the label so both splits keep
# the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

### Modeling

In [27]:
# Random forest with scikit-learn's default hyperparameters.
# random_state is fixed (same seed as the train/test split) so the reported scores are
# reproducible; without it every run grows a different forest.
rf = RandomForestClassifier(random_state=42)

In [28]:
# Candidate hyperparameter values (one per setting), keyed by RandomForestClassifier
# argument name.
# NOTE(review): no visible cell passes this grid to GridSearchCV - confirm whether it
# is still needed.
params = dict(
    min_samples_split=[12],
    n_estimators=[1000],
    min_samples_leaf=[5],
    max_depth=[10],
    criterion=['entropy'],
)

In [29]:
# Fit the forest on the training split. fit() returns the estimator itself, so `model`
# is the same object as `rf`.
rf.fit(X_train, y_train)
model = rf



In [30]:
model.score(X_train, y_train)

0.9817671809256662

In [31]:
model.score(X_test, y_test)

0.6568627450980392

In [32]:
# random forest is overfit, trying a lower variance model (logistic)

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
# Logistic regression with scikit-learn's default settings.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
model_log = logreg  # fit() returns the estimator, so this is the fitted model



In [35]:
model_log.score(X_train, y_train)

0.8092566619915849

In [36]:
model_log.score(X_test, y_test)

0.7254901960784313

In [None]:
# The gap between train and test accuracy shows this model still has fairly high variance, but I am happier with it - the result suggests there is some significance to color

In [43]:
# Multinomial naive Bayes with default settings; it is fitted on the same 0/1 color
# indicator features as the other models.
nb = MultinomialNB()

In [44]:
# Fit naive Bayes on the training split; fit() returns the estimator, so `model_nb`
# refers to the same fitted object as `nb`.
nb.fit(X_train, y_train)
model_nb = nb

In [45]:
model_nb.score(X_train, y_train)

0.8008415147265077

In [46]:
model_nb.score(X_test, y_test)

0.7254901960784313

In [47]:
from sklearn.metrics import confusion_matrix

In [48]:
# Predict with the naive Bayes model fitted in the cells directly above.
# `model` is the random forest from earlier in the notebook, but the confusion matrix
# displayed below (172 + 50 correct out of 306 = 0.7255) matches the naive Bayes /
# logistic test accuracy rather than the forest's 0.6569, so the model is named
# explicitly here to make a fresh top-to-bottom run reproduce that output.
predictions = model_nb.predict(X_test)

In [49]:
conf_mat = confusion_matrix(y_test, predictions)

In [50]:
# Widen string columns in rendered frames to 99 characters.
# The fully qualified option name is used because abbreviated names are resolved by
# pattern matching and can stop working if pandas adds a similarly named option.
pd.set_option('display.max_colwidth', 99)

In [51]:
conf_df = pd.DataFrame(conf_mat)

In [52]:
conf_df

Unnamed: 0,0,1
0,172,23
1,61,50


In [53]:
# Unpack the 2x2 confusion matrix row by row: the first row holds the counts for
# actual class 0 (tn, fp) and the second row the counts for actual class 1 (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, predictions)

In [54]:
# Print each confusion-matrix count next to its label, in the same order as before.
for template, count in (
    ("True Negatives: %s", tn),
    ("False Positives: %s", fp),
    ("False Negatives: %s", fn),
    ("True Positives: %s", tp),
):
    print(template % count)

True Negatives: 172
False Positives: 23
False Negatives: 61
True Positives: 50


### Value Counts and Frequencies

In [None]:
# Rebuild the pre-encoding frame for the exploratory cells in this section.
# The previous self-assignment did nothing, yet the cells below expect the 36-column
# frame that still contains the raw xkcd0 ... xkcd7 color names (the one-hot encoded
# df_small no longer has them). The frame is recreated from `df` by dropping the r/g/b
# columns at positions 4-6, 8-10, ..., 32-34.
# NOTE(review): this replaces the encoded df_small (dummies + 'anna') from here on -
# confirm that no later cell still needs those columns.
rgb_columns = []
for group_start in range(4, 33, 4):
    rgb_columns.extend(df.columns[group_start:group_start + 3])
df_small = df.drop(columns=rgb_columns)

In [33]:
df_small.shape

(1019, 36)

In [34]:
df_small.describe()

Unnamed: 0,month,year,f0,f1,f2,f3,f4,f5,f6,f7,distance0,distance1,distance2,distance3,distance4,distance5,distance6,distance7
count,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0
mean,6.42002,1980.087341,33806.122669,24481.238469,19303.754661,15996.18842,13300.758587,10958.98528,8513.395486,5784.993131,71.914367,69.24529,70.91515,70.007004,69.047785,69.339314,69.65967,68.675116
std,3.471216,20.476812,9273.086686,4733.963707,3298.507064,3289.987583,3151.934304,3001.22477,2785.350021,2410.128444,20.790895,21.457331,21.366581,20.917644,20.716252,20.316543,20.96101,21.178914
min,1.0,1950.0,18441.0,12535.0,6718.0,4160.0,2175.0,1795.0,1635.0,822.0,13.379088,17.606817,11.045361,8.774964,11.357817,13.56466,20.223748,14.142136
25%,3.0,1962.0,26991.5,21313.0,17390.5,14198.5,11392.5,8965.0,6526.5,4104.5,53.902674,50.631016,53.296341,52.320987,51.429549,52.043251,50.378426,49.056084
50%,6.0,1977.0,31715.0,23752.0,19330.0,16201.0,13639.0,11175.0,8442.0,5471.0,77.051931,73.600272,75.026662,73.742796,72.22188,72.759879,73.498299,72.249567
75%,9.0,1998.0,38535.5,26690.5,21261.5,18141.0,15378.0,13026.5,10383.5,7099.0,88.670739,86.634288,88.141931,87.672686,85.880731,86.052307,86.700633,86.151028
max,12.0,2019.0,71192.0,52755.0,47651.0,47581.0,42332.0,39748.0,36863.0,33527.0,109.498858,113.512114,113.885908,124.173266,115.178123,114.057003,115.264912,117.7837


In [35]:
# Count from the source frame `df`, which always holds the raw color names; df_small
# loses its 'xkcd0' column once it is one-hot encoded, so indexing it there fails on a
# fresh top-to-bottom run.
df['xkcd0'].value_counts()

DARK PINK       296
LIGHT PINK      210
DARK PURPLE     172
PINK             91
LIGHT GREEN      59
PURPLE           49
LIGHT BLUE       38
DARK RED         16
DARK GREEN       16
BLUE             15
DARK YELLOW      11
RED              10
LIGHT RED         9
LIGHT YELLOW      9
LIGHT PURPLE      8
GREEN             6
ORANGE            2
YELLOW            2
Name: xkcd0, dtype: int64

In [36]:
# Count from the source frame `df`, which always holds the raw color names; df_small
# loses its 'xkcd1' column once it is one-hot encoded, so indexing it there fails on a
# fresh top-to-bottom run.
df['xkcd1'].value_counts()

DARK PINK       293
DARK PURPLE     233
LIGHT PINK      188
LIGHT GREEN      68
PINK             67
PURPLE           40
LIGHT BLUE       29
DARK RED         23
BLUE             18
DARK GREEN       12
DARK YELLOW      10
RED               9
LIGHT RED         8
GREEN             6
LIGHT YELLOW      5
LIGHT PURPLE      5
ORANGE            2
YELLOW            2
DARK BLUE         1
Name: xkcd1, dtype: int64

In [37]:
# Count from the source frame `df`, which always holds the raw color names; df_small
# loses its 'xkcd2' column once it is one-hot encoded, so indexing it there fails on a
# fresh top-to-bottom run.
df['xkcd2'].value_counts()

DARK PINK       283
DARK PURPLE     214
LIGHT PINK      193
PINK             76
LIGHT GREEN      73
PURPLE           50
LIGHT BLUE       27
BLUE             20
DARK GREEN       19
DARK RED         18
DARK YELLOW       9
RED               9
LIGHT RED         7
LIGHT YELLOW      6
GREEN             6
LIGHT PURPLE      5
YELLOW            2
ORANGE            1
DARK BLUE         1
Name: xkcd2, dtype: int64