In [23]:
# dependencies
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
import cv2
import os 
import requests
from math import floor

In [112]:
# Load the video game data set and collect the background image URL of every row
df = pd.read_csv("data.csv")
URLS = df['background_image'].tolist()
len(URLS)

100000

In [113]:
# Function does the following workflow:
# --------------------------------------
# download the image from its URL
# save the image to a local file
# read the image with cv2
# run k-means on the image pixels to get the cluster centroids
# delete the local file with os.remove()
# --------------------------------------

def centroids_from_image_url(url, num_clusters, timeout=30, random_state=None):
    """Download an image and return the k-means centroids of its pixel colors.

    The image is saved to a local file, read back with OpenCV, converted from
    BGR to RGB, resized to 60x60 pixels and clustered with KMeans. The local
    file is removed afterwards, even when one of the steps fails.

    Parameters
    ----------
    url : str
        Address of the image to download.
    num_clusters : int
        Number of clusters (dominant colors) to fit.
    timeout : float or None, optional
        Seconds to wait for the server before giving up (default 30).
        Pass None to wait indefinitely.
    random_state : int or None, optional
        Forwarded to KMeans. The default None keeps the random
        initialisation; pass an int for reproducible centroids.

    Returns
    -------
    numpy.ndarray
        ``km.cluster_centers_``: one row per cluster, three columns (R, G, B).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the downloaded content cannot be decoded as an image.
    """
    tmp_path = "image_from_internet.jpg"

    # read in our image from online; fail early on 4xx/5xx instead of
    # writing an error page to disk and crashing later inside OpenCV
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    r.raise_for_status()

    try:
        # download to local computer; the context manager closes the handle
        # so the content is flushed before cv2 reads it
        with open(tmp_path, 'wb') as image_file:
            image_file.write(r.content)

        # read in image for processing
        image = cv2.imread(tmp_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError("Could not decode an image from {!r}".format(url))
        im_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # SCALE IMAGE
        mat = cv2.resize(im_rgb, dsize=(60, 60))

        # reshape image to one row per pixel to pass into K Means
        img = mat.reshape((mat.shape[0] * mat.shape[1], 3))

        # apply kmeans to reshaped image
        km = KMeans(n_clusters=num_clusters, random_state=random_state)
        km.fit(img)

        # get the clusters
        return km.cluster_centers_
    finally:
        # always clean up the local file, whether or not processing succeeded
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [114]:
# compute centroids column: 5 cluster centroids for every image URL, in URL order
centroids_col = list(map(lambda im_url: centroids_from_image_url(im_url, 5), URLS))

In [108]:
# split the centroids into five lists (one per centroid row) to put into the data frame
color_1, color_2, color_3, color_4, color_5 = (
    [centroids[rank] for centroids in centroids_col] for rank in range(5)
)
color_1[17]

array([249.87259723, 248.93786321, 248.74608851])

In [109]:
# need to bin by color value 
# red bin
# if red value is largest
# yellow/orange/brown bin
# red and green are both larger than blue
# green bin
# if green value is largest
# blue bin
# if blue value is largest
# black bin
# if all values are less than 60 (on the 0-255 scale)
# purple bin
# red and blue are both larger than green
# white bin
# if all values are greater than 240 (on the 0-255 scale)
# grey bin
# if all values are the same 
def _classify_rgb(rgb, grey_cut_off, black_cut_off, white_cut_off):
    """Return the color-bin name for one (R, G, B) triple on the 0-255 scale."""
    # floor each channel once instead of on every comparison
    r, g, b = floor(rgb[0]), floor(rgb[1]), floor(rgb[2])

    # generalize very common colors first, being black, white, grey
    if r < black_cut_off and g < black_cut_off and b < black_cut_off:
        return "Black"
    if r > white_cut_off and g > white_cut_off and b > white_cut_off:
        return "White"
    # Grey only when ALL three channels are within grey_cut_off of each other.
    # Testing the pairs with `or` would bin saturated colors such as
    # (255, 0, 2) as grey just because two of their channels are close.
    if max(r, g, b) - min(r, g, b) <= grey_cut_off:
        return "Grey"

    # generalize primary colors; ties are resolved so every triple gets a bin
    if g > r and g >= b:
        # green is (jointly with blue) the largest channel
        return "Green/Blue Green/Aqua"
    if r > b and g > b:
        # red and green are both larger than blue (red >= green here)
        return "Orange/Yellow/Brown"
    if r > g and r > b:
        return "Red/Pink"
    if r > g and b > g:
        # red and blue are both larger than green (blue >= red here)
        return "Purple"
    # only remaining possibility: blue is strictly the largest channel
    return "Blue"


def rgb2color(color_list, grey_cut_off=3, black_cut_off=60, white_cut_off=240):
    """Map a sequence of RGB triples to coarse color-bin names.

    Each channel is floored before comparison. Bins are tested in this order:
    Black, White, Grey, then the hue bins.

    Parameters
    ----------
    color_list : sequence of indexable (R, G, B) triples
        Channel values on the 0-255 scale. The sequence is NOT modified,
        so the function can safely be called again on the same input.
    grey_cut_off : int, optional
        Largest spread between the biggest and smallest channel that still
        counts as "Grey" (default 3).
    black_cut_off : int, optional
        A color is "Black" when all channels are below this (default 60).
    white_cut_off : int, optional
        A color is "White" when all channels are above this (default 240).

    Returns
    -------
    list of str
        One bin name per input triple, in input order. Possible values:
        "Black", "White", "Grey", "Green/Blue Green/Aqua",
        "Orange/Yellow/Brown", "Red/Pink", "Purple", "Blue".
    """
    return [
        _classify_rgb(rgb, grey_cut_off, black_cut_off, white_cut_off)
        for rgb in color_list
    ]

In [110]:
# label each of the five dominant-color lists with its color-bin names
(color_1_string,
 color_2_string,
 color_3_string,
 color_4_string,
 color_5_string) = map(rgb2color, (color_1, color_2, color_3, color_4, color_5))


In [111]:

# one column per dominant color, then list the distinct bin names that occur
top_5_colors_df = pd.DataFrame(
    {
        'Color_1': color_1_string,
        'Color_2': color_2_string,
        'Color_3': color_3_string,
        'Color_4': color_4_string,
        'Color_5': color_5_string,
    }
)
d = top_5_colors_df.to_numpy()
np.unique(d)

array(['Black', 'Blue', 'Green/Blue Green/Aqua', 'Grey',
       'Orange/Yellow/Brown', 'Purple', 'Red/Pink', 'White'], dtype=object)