# Import Library

In [None]:
import pandas as pd
import numpy as np
import re
#pd.set_option('display.max_rows', None)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score, davies_bouldin_score,v_measure_score
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as sm
import math

import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')

# Product Clustering

In [None]:
# K Means Clustering Function
class Kmeans:
    """K-means clustering with centroids initialised from random data points.

    Parameters
    -----------
        k: int , number of clusters

        seed: int, when not None it is passed to ``np.random.seed`` (NumPy's
            global generator) at construction time; nothing is seeded if None

        max_iter: int, maximum number of iterations to run algorithm, default: 200

    Attributes
    -----------
        centroids: array, k, number_features

        cluster_labels: label for each data point

    """

    def __init__(self, k, seed=None, max_iter=200):
        self.k = k
        self.seed = seed
        if self.seed is not None:
            # NOTE: this seeds the process-wide NumPy RNG, not a private one.
            np.random.seed(self.seed)
        self.max_iter = max_iter

    def initialise_centroids(self, data):
        """Randomly Initialise Centroids

        Parameters
        ----------
        data: array or matrix, number_rows, number_features

        Returns
        --------
        centroids: array of k centroids chosen as random data points

        Raises
        ------
        ValueError: if k is larger than the number of rows in data
        """
        number_rows = data.shape[0]
        if self.k > number_rows:
            # Fewer rows than clusters would leave some centroids undefined.
            raise ValueError(
                "k={} is larger than the number of rows ({})".format(self.k, number_rows)
            )

        # A random permutation guarantees k *distinct* row indices.
        initial_centroids = np.random.permutation(number_rows)[:self.k]
        self.centroids = data[initial_centroids]

        return self.centroids

    def assign_clusters(self, data):
        """Compute distance of data from clusters and assign data point
            to closest cluster.

        Parameters
        ----------
        data: array or matrix, number_rows, number_features

        Returns
        --------
        cluster_labels: index which minimises the distance of data to each
        cluster

        """
        # A 1-D input is treated as number_rows samples with a single feature.
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        dist_to_centroid = pairwise_distances(data, self.centroids, metric='euclidean')
        self.cluster_labels = np.argmin(dist_to_centroid, axis=1)

        return self.cluster_labels

    def update_centroids(self, data):
        """Computes average of all data points in cluster and
            assigns new centroids as average of data points

        A cluster that currently has no member keeps its previous centroid;
        averaging an empty selection would otherwise yield NaN.

        Parameters
        -----------
        data: array or matrix, number_rows, number_features

        Returns
        -----------
        centroids: array, k, number_features
        """
        new_centroids = []
        for cluster_index in range(self.k):
            members = data[self.cluster_labels == cluster_index]
            if members.shape[0] == 0:
                new_centroids.append(self.centroids[cluster_index])
            else:
                new_centroids.append(members.mean(axis=0))
        self.centroids = np.array(new_centroids)

        return self.centroids

    def predict(self, data):
        """Predict which cluster data point belongs to

        Parameters
        ----------
        data: array or matrix, number_rows, number_features

        Returns
        --------
        cluster_labels: index which minimises the distance of data to each
        cluster
        """
        return self.assign_clusters(data)

    def fit_kmeans(self, data):
        """
        This function contains the main loop to fit the algorithm
        Implements initialise centroids and update_centroids
        for at most max_iter iterations, stopping early once the
        cluster assignment no longer changes.
        -----------------------

        Returns
        -------
        instance of kmeans class

        """
        self.centroids = self.initialise_centroids(data)

        previous_labels = None
        # Main kmeans loop
        for _ in range(self.max_iter):
            self.cluster_labels = self.assign_clusters(data)
            # Unchanged labels mean the centroids computed from them are
            # already in place, so any further iteration would be a no-op.
            if previous_labels is not None and np.array_equal(self.cluster_labels, previous_labels):
                break
            self.centroids = self.update_centroids(data)
            previous_labels = self.cluster_labels
        return self

In [None]:
# define functions    
def preprocess_text(data, column):
    """Lower-case `column` and drop stopwords, modifying `data` in place.

    Each value is lower-cased, split on whitespace, filtered against the
    module-level `stopwords` collection and re-joined with single spaces.

    Parameters
    ----------
    data: DataFrame holding the text column
    column: name of the text column to clean

    Returns
    -------
    the same DataFrame object, with `column` overwritten
    """

    def drop_stopwords(text):
        kept = [token for token in text.split() if token not in stopwords]
        return ' '.join(kept)

    lowered = data[column].str.lower()
    data[column] = lowered.apply(drop_stopwords)
    return data

def get_letters(data,column):
    """Keep only ASCII letters from `column`; every other character becomes a space.

    The result is stored in a new `letters` column of `data` (in place) and
    that column is returned.

    Parameters
    ----------
    data: DataFrame holding the text column
    column: name of the source text column

    Returns
    -------
    the `letters` Series
    """
    # regex=True is explicit because pandas 2.0 changed the default to a
    # literal match, which would leave the text untouched.
    data["letters"] = data[column].str.replace(r'[^a-zA-Z]', ' ', regex=True)
    return data["letters"]

def get_letters_and_numbers(data,column):
    """Keep only ASCII letters and digits from `column`; every other character becomes a space.

    The result is stored in a new `letters_and_numbers` column of `data`
    (in place) and that column is returned.

    Parameters
    ----------
    data: DataFrame holding the text column
    column: name of the source text column

    Returns
    -------
    the `letters_and_numbers` Series
    """
    # regex=True is explicit because pandas 2.0 changed the default to a
    # literal match, which would leave the text untouched.
    data["letters_and_numbers"] = data[column].str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)
    return data["letters_and_numbers"]

def get_brand_name(data, column):
    """Add a `brand` column holding the first brand token found in `column`.

    `column` is first cleaned in place by `preprocess_text` (lower-cased,
    stopwords removed). All matches of the module-level `list_brand` pattern
    are then joined with spaces and the first space-separated token is kept.
    Rows without a match get an empty string.

    Parameters
    ----------
    data: DataFrame holding the product-name column
    column: name of the product-name column

    Returns
    -------
    the same DataFrame object, with `column` cleaned and `brand` added
    """
    # preprocess_text already lower-cases and strips stopwords, so the text
    # does not need to be cleaned a second time before matching.
    preprocess_text(data, column)

    # NOTE(review): list_brand is defined outside this section - presumably a
    # regex alternation of known brand names; confirm against its definition.
    matched_brands = data[column].str.findall(list_brand).str.join(" ")

    # Keep only the first token. `n` must be passed by keyword (pandas >= 2.0),
    # and `.str[0]` yields a Series instead of a multi-column frame.
    data["brand"] = matched_brands.str.split(' ', n=1).str[0]
    return data

def convert_to_tf_idf_array(data):
    """Convert documents to a dense, row-normalised TF-IDF DataFrame.

    Parameters
    ----------
    data: iterable of text documents (passed to TfidfVectorizer.fit_transform)

    Returns
    -------
    DataFrame with one row per document and one column per vocabulary term
    (at most 20000 terms); column labels are the terms themselves
    """
    # token_pattern keeps single-character tokens, which the default drops.
    tf_idf_vectorizor = TfidfVectorizer(max_features = 20000, token_pattern=r'\b\w+\b')
    tf_idf = tf_idf_vectorizor.fit_transform(data)
    tf_idf_norm = normalize(tf_idf)
    tf_idf_array = tf_idf_norm.toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tf_idf_vectorizor, "get_feature_names_out"):
        feature_names = tf_idf_vectorizor.get_feature_names_out()
    else:
        feature_names = tf_idf_vectorizor.get_feature_names()

    tf_idf_array_df = pd.DataFrame(tf_idf_array, columns=feature_names)
    return tf_idf_array_df

def get_top_3_words(tf_idf_array_df):
    """Return, per row, the three column names with the largest absolute score.

    Three zero-valued placeholder columns (`temp1`..`temp3`) are inserted at
    the front of `tf_idf_array_df` (in place, only if not already present).
    Because ties keep the first occurrence, a row with fewer than three
    non-zero scores selects placeholders rather than arbitrary zero-scored
    words; the placeholder names are then replaced by a space.

    Parameters
    ----------
    tf_idf_array_df: DataFrame of scores, one column per word

    Returns
    -------
    DataFrame with a single `top_words` column of space-joined words
    """
    for placeholder in ('temp1', 'temp2', 'temp3'):
        # Skipping existing placeholders makes a repeated call safe.
        if placeholder not in tf_idf_array_df.columns:
            tf_idf_array_df.insert(0, placeholder, 0)

    top_words = tf_idf_array_df.apply(lambda s: s.abs().nlargest(3).index.tolist(), axis=1)
    top_words = top_words.to_frame(name="top_words")
    # regex=True is explicit: pandas 2.0 made literal matching the default,
    # which would leave the placeholder names in the output.
    top_words["top_words"] = top_words["top_words"].str.join(" ").str.replace(r'temp1|temp2|temp3', ' ', regex=True)

    return top_words

def get_top_5_words(tf_idf_array_df):
    """Return, per row, the five column names with the largest absolute score.

    Five zero-valued placeholder columns (`temp1`..`temp5`) must sit at the
    front of `tf_idf_array_df`; any that are missing are inserted in place.
    Because ties keep the first occurrence, a row with fewer than five
    non-zero scores selects placeholders rather than arbitrary zero-scored
    words; the placeholder names are then replaced by a space.

    Parameters
    ----------
    tf_idf_array_df: DataFrame of scores, one column per word

    Returns
    -------
    DataFrame with a single `top_words_5` column of space-joined words
    """
    # Inserting every missing placeholder (not just temp4/temp5) removes the
    # hidden requirement that get_top_3_words ran first on the same frame.
    for placeholder in ('temp1', 'temp2', 'temp3', 'temp4', 'temp5'):
        if placeholder not in tf_idf_array_df.columns:
            tf_idf_array_df.insert(0, placeholder, 0)

    top_words_5 = tf_idf_array_df.apply(lambda s: s.abs().nlargest(5).index.tolist(), axis=1)
    top_words_5 = top_words_5.to_frame(name="top_words_5")
    # regex=True is explicit: pandas 2.0 made literal matching the default,
    # which would leave the placeholder names in the output.
    top_words_5["top_words_5"] = top_words_5["top_words_5"].str.join(" ").str.replace(r'temp1|temp2|temp3|temp4|temp5', ' ', regex=True)

    return top_words_5

def get_unique_top_words(data,column):
    """Build a label from the distinct leading words of `column`.

    Each value is split on single spaces; the first three tokens of every row
    are pooled, de-duplicated, sorted and joined with underscores. Rows with
    fewer than three tokens contribute empty strings for the missing ones
    (an empty token therefore shows up as a leading underscore).

    Parameters
    ----------
    data: DataFrame holding the words column
    column: name of a column of space-separated words

    Returns
    -------
    str, the sorted unique tokens joined by "_"
    """
    # `n` must be passed by keyword: positional use is rejected by pandas >= 2.0.
    tokens = data[column].str.split(' ', n=3, expand=True)

    # reindex guarantees columns 0..2 exist even when every row is short;
    # fillna turns the padding into '' so the tokens stay sortable.
    first_three = tokens.reindex(columns=[0, 1, 2]).fillna('')

    unique_tokens = sorted(set(first_three.values.ravel().tolist()))
    return "_".join(unique_tokens)

def price_clustering(data, column, k, random_state=None):
    """Cluster rows by price with scikit-learn KMeans.

    The price is min-max scaled and paired with a constant zero feature so
    the model receives a 2-D array. `data` itself is not modified.

    Parameters
    ----------
    data: DataFrame holding the price column
    column: name of the numeric price column
    k: int, requested number of clusters; reduced to the number of rows when
        there are fewer rows than clusters
    random_state: optional seed forwarded to KMeans for reproducible labels

    Returns
    -------
    DataFrame with a single `cluster_price` column (default integer index),
    one row per row of `data`
    """
    prices = np.asarray(data[column], dtype=float)
    # Constant zero first feature, built locally instead of adding a "temp"
    # column to the caller's frame.
    data_price_array = np.column_stack([np.zeros(len(prices)), prices])

    X_scaled = MinMaxScaler().fit_transform(data_price_array)

    # KMeans raises when asked for more clusters than there are samples.
    n_clusters = min(int(k), len(X_scaled))
    k_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    k_model.fit(X_scaled)

    predicted_values = k_model.predict(X_scaled)

    return pd.DataFrame(predicted_values, columns=["cluster_price"])

def price_clustering_label(data, price_column_name, cluster_column_name):
    """Attach each cluster's price range to every row of that cluster.

    Parameters
    ----------
    data: DataFrame holding the price and cluster columns
    price_column_name: name of the price column
    cluster_column_name: name of the cluster-id column

    Returns
    -------
    a new DataFrame: `data` merged with per-cluster `min`, `max` and
    `price_cluster_label` ("<min> to <max>") columns
    """
    # One aggregation pass yields both bounds, keyed by cluster id.
    price_range = (
        data.groupby([cluster_column_name])[price_column_name]
        .agg(["min", "max"])
        .reset_index()
    )

    lower_text = price_range["min"].astype("str")
    upper_text = price_range["max"].astype("str")
    price_range["price_cluster_label"] = lower_text + " to " + upper_text

    return data.merge(price_range, on=cluster_column_name)

def product_clustering(data, column_product, column_price, k, random_state=None):
    """Cluster rows by product text (TF-IDF) and price with scikit-learn KMeans.

    Features are a constant zero column, the price and the row-normalised
    TF-IDF vector of `column_product`, all min-max scaled together. `data`
    itself is not modified.

    Side effect: the unscaled feature matrix is written to "data_array.csv"
    in the current working directory and overwritten on every call.

    Parameters
    ----------
    data: DataFrame holding the text and price columns
    column_product: name of the text column to vectorise
    column_price: name of the numeric price column
    k: int, requested number of clusters; reduced to the number of rows when
        there are fewer rows than clusters
    random_state: optional seed forwarded to KMeans for reproducible labels

    Returns
    -------
    DataFrame with a single `cluster_product` column (default integer index),
    one row per row of `data`
    """
    prices = np.asarray(data[column_price], dtype=float)
    # Constant zero first feature, built locally instead of adding a "temp"
    # column to the caller's frame.
    data_price_array = np.column_stack([np.zeros(len(prices)), prices])

    # token_pattern keeps single-character tokens, which the default drops.
    tf_idf_vectorizor = TfidfVectorizer(max_features = 20000, token_pattern=r'\b\w+\b')
    tf_idf = tf_idf_vectorizor.fit_transform(data[column_product])
    tf_idf_array = normalize(tf_idf).toarray()

    data_array = np.concatenate((data_price_array, tf_idf_array), axis=1)
    # Debug dump kept from the original implementation.
    pd.DataFrame(data_array).to_csv("data_array.csv")

    X_scaled = MinMaxScaler().fit_transform(data_array)

    # KMeans raises when asked for more clusters than there are samples.
    n_clusters = min(int(k), len(X_scaled))
    k_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    k_model.fit(X_scaled)

    predicted_values = k_model.predict(X_scaled)

    return pd.DataFrame(predicted_values, columns=["cluster_product"])

# Function to find distance
# https://www.geeksforgeeks.org/perpendicular-distance-
# between-a-point-and-a-line-in-2-d/
def calc_distance(x1, y1, a, b, c):
    """Perpendicular distance from point (x1, y1) to the line a*x + b*y + c = 0.

    Parameters
    ----------
    x1, y1: coordinates of the point
    a, b, c: coefficients of the line equation

    Returns
    -------
    the non-negative distance
    """
    line_value_at_point = abs(a * x1 + b * y1 + c)
    normal_length = math.sqrt(a * a + b * b)
    return line_value_at_point / normal_length

In [None]:
def get_optimum_k(data, column_product, column_price, random_state=None):
    """Pick the number of clusters with the elbow method.

    KMeans is fitted for every k from 1 to int(len(data) / 2) - 1 on the same
    features `product_clustering` uses (constant zero column, price, TF-IDF
    of `column_product`, all min-max scaled). The chosen k is the one whose
    (k, inertia) point lies farthest from the straight line through the
    first and last points of the inertia curve. `data` is not modified.

    Parameters
    ----------
    data: DataFrame holding the text and price columns
    column_product: name of the text column to vectorise
    column_price: name of the numeric price column
    random_state: optional seed forwarded to KMeans for reproducible results

    Returns
    -------
    int, the selected number of clusters; 1 when there are fewer than two
    candidate values of k
    """
    max_range = int(len(data) / 2)
    # With a single candidate (max_range == 2) the chord collapses to a point
    # and the distance formula divides by zero, so answer 1 directly.
    if max_range <= 2:
        return 1

    prices = np.asarray(data[column_price], dtype=float)
    # Constant zero first feature, built locally instead of adding a "temp"
    # column to the caller's frame.
    data_price_array = np.column_stack([np.zeros(len(prices)), prices])

    # token_pattern keeps single-character tokens, which the default drops.
    tf_idf_vectorizor = TfidfVectorizer(max_features = 20000, token_pattern=r'\b\w+\b')
    tf_idf = tf_idf_vectorizor.fit_transform(data[column_product])
    tf_idf_array = normalize(tf_idf).toarray()

    data_array = np.concatenate((data_price_array, tf_idf_array), axis=1)
    X_scaled = MinMaxScaler().fit_transform(data_array)

    candidate_ks = list(range(1, max_range))
    dist_points_from_cluster_center = []
    for no_of_clusters in candidate_ks:
        k_model = KMeans(n_clusters=no_of_clusters, random_state=random_state)
        k_model.fit(X_scaled)
        dist_points_from_cluster_center.append(k_model.inertia_)

    # Line through the first and last (k, inertia) points, written as
    # (y1 - y2)x + (x2 - x1)y + (x1*y2 - x2*y1) = 0
    first_k, last_k = candidate_ks[0], candidate_ks[-1]
    first_inertia = dist_points_from_cluster_center[0]
    last_inertia = dist_points_from_cluster_center[-1]
    a = first_inertia - last_inertia
    b = last_k - first_k
    c = first_k * last_inertia - last_k * first_inertia

    distance_of_points_from_line = [
        calc_distance(candidate_k, inertia, a, b, c)
        for candidate_k, inertia in zip(candidate_ks, dist_points_from_cluster_center)
    ]

    # argmax returns the first maximum, i.e. the smallest k on ties.
    return int(candidate_ks[int(np.argmax(distance_of_points_from_line))])

In [None]:
# list category, subcategory, and subsubcategory
# NOTE(review): product_curation, stopwords and list_brand are not defined in
# this part of the notebook - presumably loaded in another cell; confirm.
# Only the Handphone > Power Bank > Power Bank products are processed here.
product = product_curation[(product_curation.products_marketplace_category_name=="Handphone") & (product_curation.products_marketplace_subcategory_name=="Power Bank") & (product_curation.products_marketplace_subsubcategory_name=="Power Bank")]
# product = product_curation[(product_curation.products_marketplace_category_name=="Handphone")]
category_columns = ['products_marketplace_category_name','products_marketplace_subcategory_name','products_marketplace_subsubcategory_name']
category = product.groupby(category_columns).size().reset_index().rename(columns={0:'count'})
#category = category.sort_values("count",ascending=False)
result = pd.DataFrame()

# Only the largest brand/price groups of each category are clustered.
max_brand_price_groups = 20

# Columns kept in the final output, in output order.
output_columns = ["products_marketplace_category_name","products_marketplace_subcategory_name","products_marketplace_subsubcategory_name","brand","price_cluster_label","transaction_facts_seller_id","letters_and_numbers","products_marketplace_price","top_words","top_words_5","cluster_product","cluster_label"]

for x in range(len(category)):
    category_name = category["products_marketplace_category_name"].iloc[x]
    subcategory_name = category["products_marketplace_subcategory_name"].iloc[x]
    subsubcategory_name = category["products_marketplace_subsubcategory_name"].iloc[x]

    category_label = category_name + "_" + subcategory_name + "_" + subsubcategory_name
    data_filter = product[(product.products_marketplace_category_name==category_name) & (product.products_marketplace_subcategory_name==subcategory_name) & (product.products_marketplace_subsubcategory_name==subsubcategory_name)].reset_index()
    print(str(x) + "/" + str(len(category)) + " - " + category_label + " with " + str(len(data_filter)) + " products")

    # get brand name
    data_filter = get_brand_name(data_filter, "products_marketplace_product_name")

    # preprocessing data
    data_filter = preprocess_text(data_filter, "products_marketplace_product_name")

    # get letters and numbers
    data = get_letters_and_numbers(data_filter, "products_marketplace_product_name")

    # convert product name to tf idf array
    tf_idf_array = convert_to_tf_idf_array(data)

    # Get top 3 words with highest tf idf score for each product
    top_words = get_top_3_words(tf_idf_array)

    # Get top 5 words with highest tf idf score for each product
    top_words_5 = get_top_5_words(tf_idf_array)

    # price clustering (fixed number of price bands)
    cluster_price = price_clustering(data_filter, "products_marketplace_price", 10)

    # merge the result
    result_df = pd.concat([data_filter,cluster_price,top_words,top_words_5], axis=1)
    result_df.to_csv("result_df_first.csv")

    # price clustering label
    result_df = price_clustering_label(result_df, "products_marketplace_price", "cluster_price")

    result_df.to_csv("result_df.csv")

    ## loop per brand and price, largest groups first
    brand_price = result_df.groupby(['cluster_price','price_cluster_label','brand']).size().reset_index().rename(columns={0:'count'})
    brand_price = brand_price.sort_values("count",ascending=False)

    cluster_result_df = pd.DataFrame()

    # Never index past the number of groups that actually exist.
    for y in range(min(max_brand_price_groups, len(brand_price))):
        brand = brand_price["brand"].iloc[y]
        price_cluster = brand_price["cluster_price"].iloc[y]
        price_cluster_label = brand_price["price_cluster_label"].iloc[y]

        print(str(y) + "/" + str(len(brand_price)) + " - create cluster for brand "+ brand + " and price " + str(price_cluster_label) + " with " + str(brand_price["count"].iloc[y]) + " products" )
        data_filter2 = result_df[(result_df.cluster_price==price_cluster)&(result_df.brand==brand)].reset_index()
        cluster_label = category_label + "_" + brand + "_" + price_cluster_label

        # Groups with fewer than 3 products are not clustered. Skipping them
        # avoids re-appending the previous group's cluster_result (or hitting
        # an undefined name when the very first group is too small).
        if len(data_filter2) < 3:
            continue

        # get cluster by price and product name top 5 words
        k_cluster = get_optimum_k(data_filter2, "top_words_5", "products_marketplace_price")
        cluster_product = product_clustering(data_filter2, "top_words_5", "products_marketplace_price", k_cluster)
        cluster_result = pd.concat([data_filter2,cluster_product], axis=1)

        # get top words from each cluster
        label_rows = []
        for cluster_id in cluster_product["cluster_product"].unique():
            cluster_filter = cluster_result[cluster_result.cluster_product==cluster_id]
            label_rows.append({
                'cluster_product': cluster_id,
                "cluster_label_top_words": get_unique_top_words(cluster_filter,"top_words"),
            })
        cluster_label_top_words = pd.DataFrame(label_rows)

        cluster_result = pd.merge(cluster_result,cluster_label_top_words,on="cluster_product")
        cluster_result["cluster_label"] = cluster_label + "_" + cluster_result["cluster_label_top_words"]

        # merge and save to csv
        cluster_result = cluster_result[output_columns]

        if len(result) == 0:
            result = cluster_result
        else:
            # DataFrame.append was removed in pandas 2.0; concat is equivalent.
            result = pd.concat([result, cluster_result])
        # Rewritten after every group so partial results survive a failure.
        result.to_csv("cluster_result_final.csv",index=False)


0/1 - Handphone_Power Bank_Power Bank with 862 products
0/120 - create cluster for brand  and price 3000.0 to 72000.0 with 126 products
