In [27]:
import sys
import csv
import math
import random
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import sklearn.model_selection as model_select
import sklearn.tree as tree
import sklearn.metrics as metrics
import sklearn.cluster as cluster

# Part 2: Cluster Analysis

# Return a pandas dataframe containing the data set that needs to be extracted from the data_file.
# data_file will be populated with the string 'wholesale_customers.csv'.
def read_csv_2(data_file):
    DATA_DIR  = './data/'
    DATA_FILE = data_file
    try:
        rawdata = pd.read_csv(DATA_DIR + DATA_FILE,encoding = 'unicode_escape')
    except IOError as iox:
        print('there was an I/O error trying to open the data file: ' + str( iox ))
        sys.exit()
    data = rawdata.drop(columns=['Channel'])
    data = data.drop(columns = ["Region"])
    return data

# Return a pandas dataframe with summary statistics of the data.
# Namely, 'mean', 'std' (standard deviation), 'min', and 'max' for each attribute.
# These strings index the new dataframe columns. 
# Each row should correspond to an attribute in the original data and be indexed with the attribute name.
def summary_statistics(df):
    all_columns = list(df.head(0))
    length = len(all_columns)
    row_names = all_columns[0:length]
    
    min_ = df.min()
    max_ = df.max()
    mean = df.mean()
    mean = round(mean)
    std  = df.std()
    std = round(std)
    
    df = pd.DataFrame(index = row_names)
    df.insert(0,"mean",mean)
    df.insert(1,"std",std)
    df.insert(2,"min",min_)
    df.insert(3,"max",max_)
    return df

# Given a dataframe df with numeric values, return a dataframe (new copy)
# where each attribute value is subtracted by the mean and then divided by the
# standard deviation for that attribute.
def standardize(df):
    df1 =  df.copy(deep=True)
    all_columns = list(df.head(0))
    length = len(all_columns)
    names = all_columns[0:length]
    
    mean = df.mean()
    std  = df.std()
    
    for i in range(length):
        df1[names[i]] = df[names[i]].map(lambda x: (x-mean[i])/std[i])
        
    return df1

# Given a dataframe df and a number of clusters k, return a pandas series y
# specifying an assignment of instances to clusters, using kmeans.
# y should contain values in the set {0,1,...,k-1}.
def kmeans(df, k):
    X = df.values
  
    km = cluster.KMeans(n_clusters= k)
    km.fit(X)
    labels = km.labels_
    labels = pd.Series(labels.tolist())
    return labels
   
# Given a dataframe df and a number of clusters k, return a pandas series y
# specifying an assignment of instances to clusters, using agglomerative hierarchical clustering.
# y should contain values from the set {0,1,...,k-1}.
def agglomerative(df, k):
    X = df.values
    M = len(X)
    ac = cluster.AgglomerativeClustering( n_clusters=k, linkage='average', affinity='euclidean' )
    ac.fit( X )
    labels = ac.labels_
    labels = pd.Series(labels.tolist())
    return labels

# Given a data set X and an assignment to clusters y
# return the Solhouette score of the clustering.
def clustering_score(X,y):
    #-compute silhouette score
    SC = metrics.silhouette_score(X, y, metric='euclidean' )
    return SC
    
# Perform the cluster evaluation described in the coursework description.
# Given the dataframe df with the data to be clustered,
# return a pandas dataframe with an entry for each clustering algorithm execution.
# Each entry should contain the: 
# 'Algorithm' name: either 'Kmeans' or 'Agglomerative', 
# 'data' type: either 'Original' or 'Standardized',
# 'k': the number of clusters produced,
# 'Silhouette Score': for evaluating the resulting set of clusters.
def cluster_evaluation(df):
    data =[]
    attr =["Algorithm","data","k","Silhouette Score"]
    #for Kmeans with the Original dataset
    for k in [3,5,10] :
        for i in range(10):
            X = df
            y = kmeans(X,k)
            score = clustering_score(X,y)
            entry = {"Algorithm":'Kmeans',"data":"Original","k":k,"Silhouette Score":score}
            data.append(entry)
            
    #for Agglomerative with the Original dataset       
    for k in [3,5,10] :
        X = df
        y = agglomerative(X,k)
        score = clustering_score(X,y)
        entry = {"Algorithm":'Agglomerative',"data":"Original","k":k,"Silhouette Score":score}
        data.append(entry)
        
    #for Kmeans with the Standardized dataset
    for k in [3,5,10] :
        for i in range(10):  
            X = standardize(df)
            y = kmeans(X,k)
            score = clustering_score(X,y)
            entry = {"Algorithm":'Kmeans',"data":"Standardized","k":k,"Silhouette Score":score}
            data.append(entry)
            
    #for Agglomerative with the Standardized dataset       
    for k in [3,5,10] :
        X = standardize(df)
        y = agglomerative(X,k)
        score = clustering_score(X,y)
        entry = {"Algorithm":'Agglomerative',"data":"Standardized","k":k,"Silhouette Score":score}
        data.append(entry)
      
    df = pd.DataFrame(data)
    return df

# Given the performance evaluation dataframe produced by the cluster_evaluation function,
# return the best computed Silhouette score.
def best_clustering_score(rdf):
    best = rdf['Silhouette Score'].max()
    return best
            
# Run some clustering algorithm of your choice with k=3 and generate a scatter plot for each pair of attributes.
# Data points in different clusters should appear with different colors.
def scatter_plots(df):
    
            

In [36]:
df= read_csv_2("wholesale_customers.csv")

# d = summary_statistics(df)
# d
# df1 = standardize(df)

# y = kmeans(df,3)
# # y = agglomerative(df,10)
# X = df.values
# clustering_score(X,y)

eva = cluster_evaluation(df)
eva[ (eva["Algorithm"] == "Agglomerative")]

Unnamed: 0,Algorithm,data,k,Silhouette Score
30,Agglomerative,Original,3,0.745993
31,Agglomerative,Original,5,0.71479
32,Agglomerative,Original,10,0.516688
63,Agglomerative,Standardized,3,0.76758
64,Agglomerative,Standardized,5,0.736779
65,Agglomerative,Standardized,10,0.614384


In [141]:
X = df.values
X

IN = km.inertia_

#-compute silhouette score
SC = metrics.silhouette_score(X, km.labels_, metric='euclidean' )

#-compute calinski-harabasz score
CH = metrics.calinski_harabasz_score(X,km.labels_)
print(' inertia={}  silhouette={}  calinski-harabasz={}'.format(  IN, SC, CH ))

NameError: name 'km' is not defined

In [223]:
entry = {}
entry['age'] = 10
entry['name'] = 'ysm'
# data = {["age":10,'name':"ysm"],["age":10,'name':"ns"]}
data=[]
data.append(entry)
entry = {}
entry['age'] = 25
entry['name'] = 'sdgsd'
# data = {["age":10,'name':"ysm"],["age":10,'name':"ns"]}

data.append(entry)
df = pd.DataFrame(data)
df

Unnamed: 0,age,name
0,10,ysm
1,25,sdgsd


In [229]:
data =[1,2,3]
data1 =[4,52,53]
data = data + data1
data
entry = {"x":data}
entry

{'x': [1, 2, 3, 4, 52, 53]}