In [None]:
'''
Code copied from the source below, as recommended by eBay officials, because the current version of scikit-learn does not include a pair confusion matrix.
https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/cluster/_supervised.py
'''
import numpy as np
from sklearn.metrics.cluster._supervised import contingency_matrix, check_clusterings
def pair_confusion_matrix(labels_true, labels_pred):
    """Pair confusion matrix arising from two clusterings.

    The pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix
    between two clusterings by considering all pairs of samples and counting
    pairs that are assigned into the same or into different clusters under
    the true and predicted clusterings.

    Considering a pair of samples that is clustered together a positive pair,
    then as in binary classification the count of true negatives is
    :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is
    :math:`C_{11}` and false positives is :math:`C_{01}`.

    Parameters
    ----------
    labels_true : array-like of shape (n_samples,), dtype=integral
        Ground truth class labels to be used as a reference.
    labels_pred : array-like of shape (n_samples,), dtype=integral
        Cluster labels to evaluate.

    Returns
    -------
    C : ndarray of shape (2, 2), dtype=np.int64
        The pair confusion matrix. Every unordered pair of samples is counted
        twice (once per ordering), as the examples below show.

    Examples
    --------
    Perfectly matching labelings have all non-zero entries on the
    diagonal regardless of actual label values:

      >>> pair_confusion_matrix([0, 0, 1, 1], [1, 1, 0, 0])
      array([[8, 0],
             [0, 4]]...

    Labelings that assign all classes members to the same clusters
    are complete but may be not always pure, hence penalized, and
    have some off-diagonal non-zero entries:

      >>> pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1])
      array([[8, 2],
             [0, 2]]...

    Note that the matrix is not symmetric.

    References
    ----------
    .. L. Hubert and P. Arabie, Comparing Partitions, Journal of
      Classification 1985
      https://link.springer.com/article/10.1007%2FBF01908075
    """
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    n_samples = np.int64(labels_true.shape[0])

    # Computation using the contingency data.
    # contingency_matrix is called without a dtype argument here, so the
    # counts are cast to int64 explicitly: the squared counts and the dot
    # products below could otherwise overflow a narrower integer type.
    contingency = contingency_matrix(
        labels_true, labels_pred, sparse=True
    ).astype(np.int64)
    n_c = np.ravel(contingency.sum(axis=1))
    n_k = np.ravel(contingency.sum(axis=0))
    sum_squares = (contingency.data ** 2).sum()
    C = np.empty((2, 2), dtype=np.int64)
    C[1, 1] = sum_squares - n_samples
    C[0, 1] = contingency.dot(n_k).sum() - sum_squares
    C[1, 0] = contingency.transpose().dot(n_c).sum() - sum_squares
    C[0, 0] = n_samples ** 2 - C[0, 1] - C[1, 0] - sum_squares
    return C

In [1]:
import pandas as pd
# Per-collaborator locations of the challenge files; only the SA_* files are read below.
akshit_df = './mlchallenge_set_2021.tsv'
akshit_valid = './mlchallenge_set_validation.tsv'
sam_df = 'C:/Users/sjmal/OneDrive/Desktop/ML/2021/mlchallenge_set_2021_edited.txt'
sam_valid = 'C:/Users/sjmal/OneDrive/Desktop/ML/2021/mlchallenge_set_validation.tsv'

# Both tables are read with header=None, so column names are assigned by hand further down.
SA_df = pd.read_table('/Users/shivankagrawal/Documents/ebay/mlchallenge_set_2021.tsv', header=None)
SA_valid = pd.read_table('/Users/shivankagrawal/Documents/ebay/mlchallenge_set_validation.tsv', header=None)
df, valid = SA_df, SA_valid
#df = pd.read_table(akshit_df)
#valid = pd.read_table(akshit_valid,sep='\t')
# Keep the SA_* aliases pointing at whatever df/valid currently are.
SA_df, SA_valid = df, valid

# df/SA_df and valid/SA_valid are the same objects, so these changes are visible under both names.
df.columns = ['category', 'primary_image_url', 'All Links', 'Tags', 'index']
valid.columns = ['ID', 'Group']
valid.set_index('ID', inplace=True)

In [None]:
"""
Import test output file as dataframe and set the index.
"""
output = './akshit.tsv'
# Read the header-less output file, name its two columns, then index the result by item ID.
raw_predictions = pd.read_table(output, header=None)
raw_predictions.columns = ['ID', 'Group']
predictions = raw_predictions.set_index('ID')

In [None]:
# Print the predictions frame loaded from the output file.
print(predictions)

In [None]:
"""
Compute confusion matrix for validation data. Use valid dataframe to extract values from a tsv output file.
"""
# For every ID in the validation table, look up the predicted group and the true group,
# keeping both lists in the validation table's row order.
preds = [predictions.loc[index]['Group'] for index in valid.index]
actuals = [row['Group'] for _, row in valid.iterrows()]
# print(preds[100:200])
# print(actuals[100:200])
def evaluate(actuals, preds):
    """Print pair-counting precision, recall and F1 of a predicted clustering.

    Parameters
    ----------
    actuals : sequence
        Ground-truth group label of each item.
    preds : sequence
        Predicted group label of each item, in the same order as ``actuals``.

    Returns
    -------
    None
        The pair confusion matrix and the three scores are printed.

    Notes
    -----
    A score whose denominator is zero (for example precision when no pair is
    predicted to share a group) is reported as 0.0 instead of NaN.
    """
    C = pair_confusion_matrix(actuals, preds)
    true_pos = int(C[1][1])
    false_pos = int(C[0][1])
    false_neg = int(C[1][0])

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = true_pos / predicted_pos if predicted_pos else 0.0  # TP / (TP + FP)
    recall = true_pos / actual_pos if actual_pos else 0.0  # TP / (TP + FN)
    score_sum = precision + recall
    f1 = 2 * precision * recall / score_sum if score_sum else 0.0

    print(f'Confusion Matrix:\n{C}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'f1 score: {f1}')

evaluate(actuals, preds)

In [None]:
# Print the first rows of the listing table and of the validation labels.
print(df.head())
print(valid.head())

In [2]:
# Split the ';'-separated image links into lists (for now only the first 100 rows).
# The column is rebuilt and assigned once rather than written cell by cell through
# chained indexing (df['All Links'][i] = ...), which pandas warns about. Cells that
# are not strings (missing values, or lists from an earlier run) are left unchanged,
# so the cell can be re-run safely.
N_LINK_ROWS = 100
link_column = df['All Links']
split_links = [
    link.split(';') if isinstance(link, str) else link
    for link in link_column.iloc[:N_LINK_ROWS]
]
df['All Links'] = pd.Series(
    split_links + link_column.iloc[N_LINK_ROWS:].tolist(),
    index=df.index,
    dtype=object,
)
print(df['All Links'][0:100])

0     [https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYY...
1     [https://i.ebayimg.com/00/s/MTA1OFgxMTM0/z/KPI...
2     [https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/flI...
3     [http://i.ebayimg.com/00/s/ODAwWDEwNjc=/z/XHcA...
4     [https://i.ebayimg.com/00/s/MTA2N1gxNjAw/z/scs...
                            ...                        
95    [https://i.ebayimg.com/00/s/MTU5OVgxNTgx/z/vMA...
96    [https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/GoA...
97    [https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/Hm0...
98    [https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/VhA...
99    [https://i.ebayimg.com/00/s/MTYwMFgxNjAw/z/n4o...
Name: All Links, Length: 100, dtype: object


In [3]:
"""
Frequency counting of the tags in a particular category.
"""
import re
from collections import Counter
# for category in range(1, 6):
df = SA_df
df = df.loc[df['category'] == 5]
freq=Counter()
# Rows that are not visited by the loop below keep this [''] placeholder.
attribute=[['']]*len(df)
print('allocated array')
trialrange=len(df)
# Only every 5th row of the category is parsed. The loop variable is named row_pos
# because `x` is used elsewhere in this notebook for the feature matrix.
for row_pos in range(0, trialrange, 5):
    raw_tags = df.iloc[row_pos,3].lower()
    raw_tags = raw_tags[1:-1] # remove starting and ending parentheses
    # ignore commas that are inside parentheses
    pairs = [a.split(':') for a in re.split(r',(?![^(]*\))', raw_tags)]
    freq.update(pair[0] for pair in pairs)
    tempdict={}
    for pair in pairs:
        if len(pair) < 2:
            # tag without a value: counted in freq, but nothing to store
            continue
        try:
            tempdict[pair[0]]=float(pair[1])
        except ValueError:
            # non-numeric values are kept as strings
            tempdict[pair[0]]=pair[1]
    attribute[row_pos]=tempdict
df = SA_df
print("am out of loop")

allocated array
am out of loop


In [None]:
# Keep tags seen more than 20 times, ordered from most to least frequent.
frequent_tags = {tag: n_seen for tag, n_seen in freq.items() if n_seen > 20}
ranked_tags = sorted(frequent_tags.items(), key=lambda item: item[1], reverse=True)
d = dict(ranked_tags)
# The ten most frequent tag names.
array = [tag for tag, _ in ranked_tags[:10]]
print(d)
print(array)
import matplotlib.pyplot as plt
#import matplotlib.pyplot as plt
plt.bar(d.keys(), d.values())
plt.show()

In [None]:
# Point df back at the table held in SA_df (earlier cells rebind df to subsets).
df = SA_df

In [12]:
import re
# Rows whose 'index' appears in the validation labels, restricted to category 1.
has_label = df['index'].isin(valid.index)
validation_rows = df.loc[has_label & (df['category'] == 1)]
# Per-category subsets of the listing table.
c1_rows = df[df['category'].eq(1)]
c3_rows = df[df['category'].eq(3)]
c4_rows = df[df['category'].eq(4)]
# Three alternative tag lists; each assignment overwrites the previous one, so only
# the last list is in effect for the functions below.
keep_tags = ['brand', 'size type', "bottoms size (women's)", 'material', 'inseam', 'color', 'rise', 'style', 'silhouette', 'country/region of manufacture']
keep_tags = ['brand', 'material', 'type', 'pattern', 'model', 'country/region of origin', 'mpn', 'country/region of manufacture', 'handle material', 'number of settings','color']
keep_tags =  ['brand', 'color', 'material', 'type', 'mpn', 'pattern', 'bed size', 'size', 'country/region of manufacture', 'style']
# df = category5
def register_attributes(attribute, all_attributes):
    """Parse one raw tag string and record which kept tag names it contains.

    Parameters
    ----------
    attribute : str
        Raw tag string. Its first and last characters are dropped, the rest is
        lower-cased and split on commas that are not inside parentheses.
    all_attributes : set
        Updated in place with every tag name that is listed in the
        module-level ``keep_tags``.

    Returns
    -------
    list of list of str
        One entry per tag: the tag text split on ':' with each piece stripped.
    """
    body = attribute[1:-1].lower()
    parsed = []
    for chunk in re.split(r',(?![^(]*\))', body):
        fields = [piece.strip() for piece in chunk.split(':')]
        if fields[0] in keep_tags:
            all_attributes.add(fields[0])
        parsed.append(fields)
    return parsed

def map_attributes(attribute, num_attributes, index_to_attr):
    """Turn one raw tag string into a fixed-length row of attribute values.

    Parameters
    ----------
    attribute : str
        Raw tag string. Its first and last characters are dropped, the rest is
        lower-cased and split on commas that are not inside parentheses.
    num_attributes : int
        Length of the returned row.
    index_to_attr : dict
        Maps a tag name to its position in the returned row. Despite the
        parameter name, the lookup is name -> position. This argument is now
        used for the lookup; previously the function ignored it and read the
        global ``attr_to_index``.

    Returns
    -------
    list
        ``num_attributes`` entries. Position ``index_to_attr[name]`` holds the
        value of tag ``name`` when that tag is listed in the module-level
        ``keep_tags`` and has a value; every other position is ``None``.
    """
    attribute = attribute[1:-1].lower()
    all_attributes_for_row = [None] * num_attributes
    for chunk in re.split(r',(?![^(]*\))', attribute):
        fields = [s.strip() for s in chunk.split(':')]
        if len(fields) < 2 or fields[0] not in keep_tags:
            continue
        position = index_to_attr.get(fields[0])
        if position is None:
            # kept tag that was never registered in the mapping: nothing to fill
            continue
        all_attributes_for_row[position] = fields[1]
    return all_attributes_for_row
m = 2000

# Pass 1: collect the kept tag names that occur in the category-4 rows.
all_attributes = set()
for tags in c4_rows['Tags']:
    register_attributes(tags, all_attributes)

# Freeze the names into a list so that each one gets a fixed column position.
all_attributes = list(all_attributes)
attr_to_index = {name: position for position, name in enumerate(all_attributes)}
#print(attr_to_index)
#print(f'numAttributes: {len(all_attributes)}')

# Pass 2: one fixed-length row of attribute values per category-4 item.
all_maps = [
    map_attributes(tags, len(all_attributes), attr_to_index)
    for tags in c4_rows['Tags']
]

# Count how often each attribute value (including None) occurs across all rows.
from collections import Counter
c = Counter(value for row_values in all_maps for value in row_values)
possible_vals = set(c)
print(len(possible_vals))
print(c)

-gry-fl': 1, '1mfsdbfg-wht-tn': 1, 'island monkeys': 1, 'l08': 1, '68 x 93”': 1, 'coconut (white and beige)': 1, 'tommy bahama by sferra': 1, '100% cotton fishbone': 1, 'tropical mosaic': 1, '641022746035': 1, '11000fre-whb-kg': 1, 'shades of pinks': 1, 'morgan jones~~springs mills inc': 1, '50% dacron polyester & 50% cotton': 1, 't250 stripe duvet': 1, 'cover': 1, 'luxurious satin polyester': 1, 'white w/ white embroidery': 1, '11000lfe-whl-kg': 1, '1a38664': 1, 'ms15-002-171-07': 1, 'green blue & pink = silver': 1, 'sander collection': 1, 'oressquw': 1, 'white eyelet lace trim': 1, 'ribbons': 1, 'ch6783pukg': 1, '2642001': 1, '10ctnair-sil-tn': 1, 'full sheet set with pillow cases': 1, 'bright green': 1, 'grand dot': 1, '1a30181': 1, 'lattice insert trim': 1, 'red (damask stripe)': 1, 'solid with double stripe': 1, 'the gordonsbury company': 1, 'blue / pink': 1, 'striped & roses': 1, 'grey stone': 1, 'waterproof mattress pad  twin': 1, '110006sb-bbl-kg': 1, 'fba_vil-6pc-slt-q': 1, 'w

In [13]:
#dataframe with attribute values
# One row per entry of all_maps, one column per name in all_attributes.
#print(type(all_maps))
categories = pd.DataFrame(all_maps)
categories.columns=all_attributes
print(categories)

       bed size  size                 color                   brand  \
0          None  None                  None             home source   
1          crib  None                  gray       pottery barn kids   
2          None  None                  None       organics and more   
3         queen  None                  blue        hotel collection   
4      any size  None        birch grey fan  better homes & gardens   
...         ...   ...                   ...                     ...   
42861      None  full              burgundy      italian collection   
42862      None  None            royal blue               dreamhome   
42863      None  None            gray/white          betsey johnson   
42864      None  None                 green          tribeca living   
42865      None  None  all colors available            devi bedding   

                   mpn         material   style country/region of manufacture  \
0       does not apply             None    None                   

In [14]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode every attribute column; astype(str) turns missing values into the
# literal string 'None' so they are encoded as a category of their own.
oh = OneHotEncoder()
x = oh.fit_transform(categories.astype(str))
print(x)
print(type(x))

  (0, 241)	1.0
  (0, 836)	1.0
  (0, 3080)	1.0
  (0, 6459)	1.0
  (0, 13611)	1.0
  (0, 16845)	1.0
  (0, 17620)	1.0
  (0, 18041)	1.0
  (0, 19482)	1.0
  (0, 20999)	1.0
  (1, 138)	1.0
  (1, 836)	1.0
  (1, 2307)	1.0
  (1, 7779)	1.0
  (1, 14719)	1.0
  (1, 16845)	1.0
  (1, 17620)	1.0
  (1, 18041)	1.0
  (1, 20513)	1.0
  (1, 20814)	1.0
  (2, 241)	1.0
  (2, 836)	1.0
  (2, 3080)	1.0
  (2, 7573)	1.0
  (2, 14719)	1.0
  :	:
  (42863, 16907)	1.0
  (42863, 17620)	1.0
  (42863, 18041)	1.0
  (42863, 18681)	1.0
  (42863, 21140)	1.0
  (42864, 241)	1.0
  (42864, 836)	1.0
  (42864, 2398)	1.0
  (42864, 8701)	1.0
  (42864, 11509)	1.0
  (42864, 16845)	1.0
  (42864, 17620)	1.0
  (42864, 18041)	1.0
  (42864, 19482)	1.0
  (42864, 20999)	1.0
  (42865, 241)	1.0
  (42865, 836)	1.0
  (42865, 1114)	1.0
  (42865, 5816)	1.0
  (42865, 14719)	1.0
  (42865, 16730)	1.0
  (42865, 17595)	1.0
  (42865, 18026)	1.0
  (42865, 20013)	1.0
  (42865, 20999)	1.0
<class 'scipy.sparse.csr.csr_matrix'>


In [17]:
# Position of the 'color' attribute among the encoded columns, and the list of
# category values the encoder learned for it.
colorind=all_attributes.index('color')
temp=list(oh.categories_[colorind])#[1339]
simple_colors = {'black':0,'white':0,'red':0,'blue':0,'green':0,'yellow':0,'brown':0,'purple':0,'pink':0,'gray':0,'grey':0}

# Replace each placeholder 0 with the colour's position inside the colour categories.
# Only the "value not in list" error is handled; anything else should surface.
# NOTE(review): a colour that is not found is printed but keeps the value 0, which is
# itself a valid position - confirm that downstream code can tell the two apart.
for key in simple_colors.keys():
    try:
        simple_colors[key]=temp.index(key)
    except ValueError:
        print(key)
print(simple_colors)
#need to identify the correct column for colors from categories data frame (to properly map)
#need to go through the CNN csv and fill in info in one hot encoding

{'black': 260, 'white': 3197, 'red': 2549, 'blue': 342, 'green': 1339, 'yellow': 3677, 'brown': 571, 'purple': 2474, 'pink': 2288, 'gray': 1248, 'grey': 1410}


In [None]:
# OneHotEncoder lays its output columns out feature by feature, so the columns of the
# 'color' attribute start after the categories of all preceding attributes.
color_offset = sum(len(cats) for cats in oh.categories_[:colorind])

for filenum in range(1,69):
    cnn=pd.read_table('predictions'+str(filenum)+'.csv',header=None)
    cnn.columns=['index','colorarr']
    si = len(cnn)
    for i in range(si):
        # NOTE(review): `ind` is used directly as a row number of x - confirm that the
        # 'index' column holds row positions rather than item IDs.
        ind=cnn.iloc[i].loc['index']
        # NOTE(review): assumes each 'colorarr' cell is an iterable of (color, score)
        # pairs; a value read from a text file is a string and would need parsing first.
        for y in cnn.iloc[i].loc['colorarr']:
            # write the score y[1] into the one-hot column of colour y[0]
            x[ind, color_offset + simple_colors[y[0]]] = y[1]
    

In [None]:
# Print the first rows of the category-4 subset.
print(c4_rows.head())

In [None]:
from sklearn.cluster import KMeans
# Fit k-means with 200 clusters on the one-hot matrix: one initialisation, at most
# 15 iterations, fixed random_state so that re-runs give the same result.
kmeans = KMeans(n_clusters=200, random_state=0,verbose=True, n_init=1, max_iter = 15).fit(x)

In [None]:
"""
Analysis of K-Means output
"""
# Tally cluster sizes and, for items that have a validation label, collect the
# predicted cluster next to the true group.
count = {}
preds = []
actuals = []
for i, label in enumerate(kmeans.labels_):
    count[label] = count.get(label, 0) + 1
#     print(f'row {i}\n: {df.iloc[i]}')
    ID = c4_rows.iloc[i]['index']
    if ID not in valid.index:
        continue
    preds.append(label)
    actual = valid.loc[ID]['Group']
    actuals.append(actual)
#     print(f'ID: {ID}, pred: {label}, actual: {actual}')
print(count)
print(kmeans.inertia_)
print(len(preds))

evaluate(actuals, preds)

In [None]:
from sklearn.cluster import AgglomerativeClustering

def cluster(data):
    """Fit agglomerative clustering to ``data`` and return the fitted estimator.

    The tree is cut by distance (``distance_threshold=0.1``) instead of by a
    fixed number of clusters, and merge distances are computed. The fitted
    estimator is printed before it is returned.
    """
    model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=0.1,
        compute_full_tree=True,
        compute_distances=True,
    )
    clustering = model.fit(data)
    print(clustering)
#     print("LABELS")
#     print(clustering.labels_)
#     print(len(clustering.labels_))
#     print(len(set(clustering.labels_)))
#     print("DISTAnCES")
#     print(clustering.distances_)
#     print("num connected components")
#     print(clustering.n_connected_components_)
    # make this better
    # make this work on the entire dataset
    # fix nonetypes
    # don't punish missing attributes, but punish conflicts. how do we encode this?

    return clustering

In [None]:
# Convert the sparse one-hot matrix to a dense array. Guarded so that re-running the
# cell is harmless: a dense array has no toarray() and is left as it is.
if hasattr(x, 'toarray'):
    x = x.toarray()

In [None]:
# x *= 10
# Display the feature matrix as the cell output.
x

In [None]:
# Refine every k-means group with agglomerative clustering and merge the results into
# one labelling. Sub-cluster labels are shifted by the number of items processed so
# far, so labels coming from different k-means groups never collide.
full_clustering = [0] * len(x)
count_so_far = 0
kmeans_labels = np.asarray(kmeans.labels_)
for label in count:
    indices = np.flatnonzero(kmeans_labels == label)
    data = x[indices]
    print(f'now clustering group {label} which has {len(data)} items')
    if len(data) < 2:
        # A single item cannot be clustered hierarchically; it becomes its own cluster.
        sub_labels = [0] * len(data)
    else:
        c = cluster(data)
        sub_labels = c.labels_
    for i, sub_label in enumerate(sub_labels):
        full_clustering[indices[i]] = sub_label + count_so_far
#     print(c.labels_)
    count_so_far += len(data)

In [None]:
# Lift pandas' display limits so that frames and long strings print in full.
# None means "no limit" for each option; the old -1 spelling for max_colwidth is
# deprecated and rejected by newer pandas releases.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Collect the row positions that share a final cluster label.
groups = {}
for i, label in enumerate(full_clustering):
    groups.setdefault(label, []).append(i)
# Only clusters with more than one member are interesting here.
groups = {label: members for label, members in groups.items() if len(members) > 1}
print(groups)
# NOTE(review): positions are looked up in validation_rows - confirm that this is the
# same row set the clustering was computed on.
for label, members in groups.items():
    print(f'GROUP: {label}')
    for item in members:
        print(validation_rows['Tags'].iloc[[item]].to_string())
    print('-----------------')

In [None]:
# Number of clusters that contain more than one item.
print(len(groups))

In [None]:
"""
Evaluate clustering after it has gone through k-means and agglomerative clustering.
"""
print(full_clustering)
# Build predictions and ground truth side by side so that both lists always have the
# same length: an item without a validation label is left out of both.
p = []
actuals = []
for i, predicted in enumerate(full_clustering):
    ID = validation_rows.iloc[i]['index']
    if ID in valid.index:
        p.append(predicted)
        actuals.append(valid.loc[ID]['Group'])
a = actuals # valid['Group'].tolist()
print(len(set(p)))
print(len(p))
evaluate(a, p)

In [None]:
"""
Functions used to display the tags in a nicer manner below.
"""
def get_mapping(attribute):
    """Parse a raw tag string into a ``{tag name: list of values}`` dict.

    Parentheses are removed, the text is split on runs of ':' and every piece
    is split on ','. The last comma-separated item of a piece is taken as the
    name of the next tag, and the items before it as the values of the current
    tag; the final piece consists of values only.
    """
    without_parens = re.sub(r'[()]', '', attribute)
    pieces = [chunk.split(',') for chunk in re.split(r':+', without_parens)]
    final_pair = len(pieces) - 2
    mapping = {}
    for pos, (current, following) in enumerate(zip(pieces, pieces[1:])):
        # every piece except the last ends with the name of the next tag
        values = following[:] if pos == final_pair else following[:-1]
        mapping[current[-1]] = values
    return mapping

def mapping_to_string(mapping):
    """Render a tag mapping as ``key:[sorted values]`` entries joined by ';'.

    Keys are emitted in sorted order, so equal mappings give equal strings.
    """
    parts = []
    for key in sorted(mapping):
        parts.append(f'{key}:{sorted(mapping[key])}')
    return ';'.join(parts)

In [None]:
"""
Show the items where the two clusterings, p (predicted) and a (actual), differ.
"""
# Compare every unordered pair once. Both lists are indexed with i and j, so the
# range is limited to their common length.
n_compared = min(len(a), len(p))
for i in range(n_compared):
    for j in range(i + 1, n_compared):
        # Only pairs that belong together but were split by the prediction are shown.
        # Pairs that were wrongly merged are skipped, as they were before.
        if not (a[i] == a[j] and p[i] != p[j]):
            continue
        print("These two items should be in the same group:")
        for item in (i, j):
            print(f"tags: {mapping_to_string(get_mapping(validation_rows['Tags'].iloc[[item]].to_string(index=False)))}")
            print(f"urls: {validation_rows['primary_image_url'].iloc[[item]].to_string()}")
        print('-------------------')

In [None]:
# Keep a second name for the current labelling; this is a reference to the same
# list object, not a copy.
old_clustering = full_clustering
print(old_clustering)

In [None]:
%matplotlib qt
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
import numpy as np
from PIL import Image
import requests
from io import BytesIO

def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded to ``scipy.cluster.hierarchy.dendrogram``. When ``labels``
        is not given, leaf labels are taken from ``df['Tags']``.
    """
    # Number of samples under each merge node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Linkage matrix in the layout scipy expects: children, distance, sample count.
    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    #response = requests.get(url)
    #img = Image.open(BytesIO(response.content))
    if 'labels' not in kwargs:
        # Labels come from the model that was passed in, not from a global.
        # NOTE(review): this indexes df['Tags'] by each sample's cluster label, as the
        # previous code did - confirm that per-sample row positions are not intended.
        kwargs['labels'] = [df['Tags'][i] for i in model.labels_]
    dendrogram(linkage_matrix, **kwargs)
# NOTE(review): the cells shown here only assign `clustering` inside cluster(), never at
# notebook level - confirm which fitted model is meant to be plotted.
plot_dendrogram(clustering)
plt.show()

In [None]:
import re
from collections import Counter
freq=Counter()
# Rows beyond trialrange keep this [''] placeholder.
attribute=[['']]*len(df)
# Parse at most the first 5000 rows, and never more rows than the table has.
trialrange=min(5000, len(df))
# The loop variable is named row_pos because `x` is used elsewhere in this notebook
# for the feature matrix.
for row_pos in range(trialrange):
    raw_tags = re.sub(r'[()]','', df.iloc[row_pos,3].lower())
    pairs = [a.split(':') for a in re.split(r',', raw_tags)]
    freq.update(pair[0] for pair in pairs)
    tempdict={}
    for pair in pairs:
        if len(pair) < 2:
            # tag without a value: counted in freq, but nothing to store
            continue
        try:
            tempdict[pair[0]]=float(pair[1])
        except ValueError:
            # non-numeric values are kept as strings
            tempdict[pair[0]]=pair[1]
    attribute[row_pos]=tempdict

# Attach the parsed tag dicts to the table as a new 'seg' column.
df['seg']=attribute
#print(df['seg'])

In [None]:
Brands=[]
Images=[]
Colors = []
color_images = []
print(trialrange)
segs = df['seg']
urls = df['primary_image_url']
for i in range(min(trialrange, len(df))):
    seg = segs.iloc[i]
    if not isinstance(seg, dict):
        # row that was never parsed and still holds the [''] placeholder
        continue
    # The two checks are independent: a row without a 'brand' tag can still
    # contribute its colour (a missing key used to abort the whole row).
    brand = seg.get('brand')
    if brand == 'nike' or brand == 'adidas':
        Brands.append(brand)
        Images.append(urls.iloc[i])
    color = seg.get('color')
    if color == 'black' or color == 'white':
        Colors.append(color)
        color_images.append(urls.iloc[i])

In [None]:
# Sizes of the collected brand and colour samples.
print(len(Brands))
print(len(Colors))
print(len(color_images))

In [None]:
n = 400
from PIL import Image, ImageFile
import requests
from io import BytesIO
import numpy as np
ImageFile.LOAD_TRUNCATED_IMAGES = True

'''
url = df['primary_image_url'][4]
response = requests.get(url)
#img = Image.open(BytesIO(response.content))
img = Image.open(requests.get(url, stream=True).raw)
img.show()
result = Image.new(img.mode, (1000, 550), (64,64,64))
result.paste(img, (0, 0))
result.show()
print(np.asarray(img).shape)
print(np.asarray(result).shape)
'''
REQUEST_TIMEOUT = 30  # seconds; a stalled download should not hang the notebook

image_array = []
images = []
max_height = 0
max_width = 0

# Pass 1: download the first n images and track the largest width and height.
for i, url in enumerate(color_images[0:n]):
    if i%200 == 0:
        print(i)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error: silently skipping an image would shift the
    # images relative to the Colors list that is built in the same order.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    width, height = img.size  # PIL reports (width, height) without decoding pixels
    max_width = max(max_width, width)
    max_height = max(max_height, height)
    images.append(img)

# Pass 2: paste every image onto a grey canvas of the common maximum size, so that
# all arrays share one shape.
for i, img in enumerate(images):
    if i%200 == 0:
        print(i)
    img = img.convert('RGB')
    margin = Image.new(img.mode, (max_width, max_height), (64,64,64))
    margin.paste(img, (0, 0))
    image_array.append(np.asarray(margin))

In [None]:

#import sys
#print(sys.version)
#%pip install tensorflow

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.layers import Dropout

model = models.Sequential()
# The image arrays come from np.asarray(<PIL image>), which yields (height, width, 3),
# so the input shape must list the height first.
model.add(layers.Conv2D(1, (3, 3), activation='relu', input_shape=(max_height, max_width, 3)))
model.add(layers.MaxPooling2D((4, 4)))
model.add(layers.Flatten())
model.add(Dropout(rate=.8))
# NOTE(review): the network ends in Dropout with no Dense output layer, while the loss
# below treats the output as per-class logits - confirm that this is intended.
#model.add(layers.Dense(4, activation='relu'))
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Map every distinct colour name to an integer id, in order of first appearance.
brand_dict = {}
labels = []
for b in Colors:
    labels.append(brand_dict.setdefault(b, len(brand_dict)))
num = len(brand_dict)

# The first 80% of the first m samples are used for training, the rest for testing.
m = 400
n = round(m*.8)
train_images, test_images = np.asarray(image_array[:n]), np.asarray(image_array[n:m])
train_labels, test_labels = np.asarray(labels[:n]), np.asarray(labels[n:m])
print(type(test_images[0]))

In [None]:
# Train for a single epoch in batches of 10, evaluating on the held-out slice.
history = model.fit(train_images, train_labels, epochs=1, batch_size = 10,
                    validation_data=(test_images, test_labels))