In [1]:
import time
import re, string
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as pat
import pandas as pd
from collections import Counter
from random import randint

import scipy as sp
import scipy.cluster
import scipy.cluster.hierarchy as hierarchy
import scipy.spatial.distance

from sklearn.cluster import KMeans
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd

#import matplotlib as mpl
import seaborn as sns

import nltk
#nltk.download()
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
%matplotlib inline

# Color names looked up by cluster label when coloring scatter points in the
# plotting cells. There are 20 entries, so labels beyond 20 clusters would
# fall outside the list.
matplotlib_colors = ['b', 'g', 'r', 'c', 'm', 'grey', 'y', 'orange', 'pink', 'mediumspringgreen', 'darkcyan', 'darkviolet', 'darkslateblue', 'darkslategrey', 'skyblue', 'lime','indianred','maroon','darkolivegreen','dodgerblue']

# Generating Training and Test Cases

In [2]:
# Load the Massachusetts records from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
MA_Data = pd.read_pickle('Massachusetts.pkl')

In [3]:
# Shuffle every row and keep each row's pre-shuffle label in an
# 'original_index' column; the split cells use that column to tell rows apart.
# A fixed random_state makes the shuffle repeatable across kernel restarts.
MA_Data_Shuffle = (
    MA_Data
    .sample(frac=1, random_state=42)
    .reset_index()
    .rename(columns={'index': 'original_index'})
)
MA_Data_Shuffle

Unnamed: 0,original_index,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date
0,61,MA,CENTRAL MA LAW ENFORCEMENT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2006-09-07
1,203,MA,FRAMINGHAM POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,1998-12-21
2,1005,MA,SOMERVILLE POLICE DEPT,1005-00-856-6885,"RIFLE,5.56 MILLIMETER",1,Each,120.00,D,1.0,2011-09-26
3,630,MA,MILTON POLICE DEPT,5855-01-228-0937,NIGHT VISION GOGGLE,1,Each,3578.00,F,1.0,2012-01-23
4,229,MA,GLOUCESTER POLICE DEPT,5855-01-228-0939,NIGHT VISION GOGGLE,1,Each,6000.00,F,1.0,2012-01-18
5,953,MA,SOMERSET POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2005-08-18
6,737,MA,NORTON POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2012-04-17
7,558,MA,MILTON POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2011-12-29
8,1198,MA,WAREHAM POLICE DEPT,5360-00-992-6652,"SPRING,HELICAL,COMPRESSION",10,Each,0.07,B,3.0,2014-08-21
9,429,MA,MANSFIELD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,1998-12-01


In [4]:
# Draw two thirds of the shuffled rows, without replacement, as the training set.
# random_state pins the draw so the same rows are selected on every run.
Master_training = MA_Data_Shuffle.sample(
    n=round(len(MA_Data_Shuffle) * 2 / 3),
    replace=False,
    random_state=42,
)
Master_training

Unnamed: 0,original_index,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date
902,1219,MA,WATERTOWN POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,1999-12-27
44,683,MA,NEW BEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2012-01-17
1189,807,MA,REHOBOTH POLICE,2355-01-553-4634,MINE RESISTANT VEHICLE,1,Each,658000.00,C,1.0,2014-01-30
37,743,MA,NORTON POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2011-12-30
27,467,MA,MEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2006-02-21
193,1029,MA,SOUTHBRIDGE POLICE DEPT,1005-00-726-5655,"PISTOL,CALIBER .45,AUTOMATIC",1,Each,58.71,D,1.0,2011-11-30
1057,659,MA,MONSON POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,1999-04-07
824,1050,MA,STURBRIDGE POLICE DEPT,1005-01-128-9936,"RIFLE,5.56 MILLIMETER",1,Each,749.00,D,1.0,2014-09-22
472,669,MA,NEW BEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2012-01-17
22,1161,MA,WAREHAM POLICE DEPT,5360-00-979-3931,"SPRING,HELICAL,COMPRESSION",10,Each,0.27,B,3.0,2013-10-24


In [5]:
# The test pool is every shuffled row whose original_index was not drawn for
# training. One vectorised membership test replaces re-filtering the whole
# frame once per training row.
Master_test = MA_Data_Shuffle[
    ~MA_Data_Shuffle['original_index'].isin(Master_training['original_index'])
].copy()

Master_test

Unnamed: 0,original_index,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date
7,558,MA,MILTON POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2011-12-29
11,695,MA,NEW BEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2012-01-17
12,837,MA,REVERE POLICE DEPT,1240-01-540-3763,TELESCOPE SUBASSEMBLY,11,Each,110.06,D,1.0,2011-10-07
14,1097,MA,TOPSFIELD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2002-09-09
15,57,MA,CENTRAL MA LAW ENFORCEMENT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2006-09-07
16,629,MA,MILTON POLICE DEPT,1240-01-411-1265,"SIGHT,REFLEX",2,Each,315.00,D,1.0,2010-08-19
18,1286,MA,WESTPORT POLICE DEPARTMENT,5855-01-447-8992,"ILLUMINATOR,INFRARED",1,Each,1038.00,D,1.0,2011-07-15
24,707,MA,NEW BEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2012-01-17
28,985,MA,SOMERSET POLICE DEPT,1005-00-999-1509,"PIN,FIRING PIN RETAINER",20,Each,0.64,B,3.0,2011-11-09
32,271,MA,GROTON POLICE DEPT,2320-01-371-9577,"TRUCK,UTILITY",1,Each,47023.00,Q,6.0,2012-06-15


In [6]:
# First held-out set: 1/12 of the full data set, drawn without replacement
# from the test pool. random_state fixes the draw so it is repeatable.
testset_1 = Master_test.sample(
    n=round(len(MA_Data_Shuffle) * 1 / 12),
    replace=False,
    random_state=42,
)

In [7]:
# Start the remainder set from the test pool minus the rows already taken by
# testset_1, using a single membership mask instead of one filter per row.
testset_4 = Master_test[
    ~Master_test['original_index'].isin(testset_1['original_index'])
].copy()

In [8]:
# Second held-out set: 1/12 of the full data set, drawn without replacement
# from what is left in testset_4. random_state fixes the draw so it is repeatable.
testset_2 = testset_4.sample(
    n=round(len(MA_Data_Shuffle) * 1 / 12),
    replace=False,
    random_state=42,
)

In [9]:
# Remove the rows just assigned to testset_2 from the remainder set with one
# membership mask instead of one filter per row.
testset_4 = testset_4[
    ~testset_4['original_index'].isin(testset_2['original_index'])
]

In [10]:
# Third held-out set: 1/12 of the full data set, drawn without replacement
# from what is left in testset_4. random_state fixes the draw so it is repeatable.
testset_3 = testset_4.sample(
    n=round(len(MA_Data_Shuffle) * 1 / 12),
    replace=False,
    random_state=42,
)

In [11]:
# Remove the rows just assigned to testset_3; whatever remains in testset_4
# is the fourth held-out set. One membership mask replaces the per-row filter.
testset_4 = testset_4[
    ~testset_4['original_index'].isin(testset_3['original_index'])
]

In [12]:
# Display the first held-out set.
testset_1

Unnamed: 0,original_index,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date
1283,1173,MA,WAREHAM POLICE DEPT,5305-01-484-7075,"SCREW,CAP,HEXAGON HEAD",10,Each,2.90,B,3.0,2013-02-19
971,487,MA,MEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2006-02-16
575,962,MA,SOMERSET POLICE DEPT,5855-01-432-0524,"VIEWER,NIGHT VISION",1,Each,3607.00,F,1.0,2012-01-05
290,282,MA,HADLEY POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,1995-02-02
607,486,MA,MEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2006-02-21
1011,381,MA,LAWRENCE POLICE DEPT,1005-00-856-6885,"RIFLE,5.56 MILLIMETER",1,Each,120.00,D,1.0,2011-07-21
894,118,MA,DARTMOUTH POLICE DEPT,2320-01-380-8604,"TRUCK,UTILITY",1,Each,81226.00,C,1.0,2011-06-10
326,455,MA,MAYNARD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2013-02-27
1141,107,MA,CUMMINGTON POLICE DEPT,7021-DS-LAP-TOP2,LAPTOP COMPUTER,4,Each,1125.00,Q,,2012-12-17
764,59,MA,CENTRAL MA LAW ENFORCEMENT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2006-09-07


In [13]:
# Display the second held-out set.
testset_2

Unnamed: 0,original_index,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date
249,468,MA,MEDFORD POLICE DEPT,1005-00-179-0300,"RIFLE,7.62 MILLIMETER",1,Each,1278.00,D,1.0,1996-06-04
534,870,MA,REVERE POLICE DEPT,1005-00-726-5655,"PISTOL,CALIBER .45,AUTOMATIC",1,Each,58.71,D,1.0,2011-12-23
619,269,MA,GROTON POLICE DEPT,1005-00-856-6885,"RIFLE,5.56 MILLIMETER",1,Each,120.00,D,1.0,2011-10-19
1138,77,MA,CHELSEA POLICE DEPT,1005-01-128-9936,"RIFLE,5.56 MILLIMETER",1,Each,749.00,D,1.0,2014-09-22
739,224,MA,GLOUCESTER POLICE DEPT,1240-01-411-1265,"SIGHT,REFLEX",20,Each,315.00,D,1.0,2012-03-14
649,338,MA,IPSWICH POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2007-06-21
683,1157,MA,WAREHAM POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2005-12-27
1059,239,MA,GLOUCESTER POLICE DEPT,8415-01-535-0064,"COVERALLS,COMBAT VEHICLE CREWMEMBERS",2,Each,256.98,E,7.0,2012-01-06
94,217,MA,FRANKLIN POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2002-03-19
374,4,MA,ACTON POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2000-09-06


In [14]:
# Display the third held-out set.
testset_3

Unnamed: 0,original_index,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date
625,528,MA,METHUEN POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,1999-12-27
1144,1250,MA,WEST SPRINGFIELD POLICE DEPT,1005-00-179-0300,"RIFLE,7.62 MILLIMETER",1,Each,1278.00,D,1.0,1997-01-21
1245,240,MA,GLOUCESTER POLICE DEPT,8415-01-535-0064,"COVERALLS,COMBAT VEHICLE CREWMEMBERS",10,Each,256.98,E,7.0,2012-01-18
256,385,MA,LAWRENCE POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2012-04-26
15,57,MA,CENTRAL MA LAW ENFORCEMENT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,2006-09-07
1163,353,MA,IPSWICH POLICE DEPT,1240-01-411-1265,"SIGHT,REFLEX",1,Each,315.00,D,1.0,2012-01-04
182,254,MA,GRAFTON POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2002-11-13
227,872,MA,REVERE POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2007-06-26
720,993,MA,SOMERVILLE POLICE DEPT,1005-00-856-6885,"RIFLE,5.56 MILLIMETER",1,Each,120.00,D,1.0,2011-09-26
140,73,MA,CHELSEA POLICE DEPT,1005-01-128-9936,"RIFLE,5.56 MILLIMETER",1,Each,749.00,D,1.0,2014-09-22


In [15]:
# Display the remaining test-pool rows not assigned to testset_1, testset_2 or testset_3.
testset_4

Unnamed: 0,original_index,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date
14,1097,MA,TOPSFIELD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2002-09-09
24,707,MA,NEW BEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2012-01-17
35,573,MA,MILTON POLICE DEPT,5855-01-468-4169,"ILLUMINATOR,INFRARED",1,Each,776.79,D,1.0,2012-12-20
60,340,MA,IPSWICH POLICE DEPT,5985-00-106-6130,ANTENNA,4,Each,861.80,D,1.0,2014-03-11
99,947,MA,SHIRLEY POLICE DEPARTMENT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,1999-12-28
125,306,MA,HARWICH POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0,1996-06-26
134,70,MA,CHELSEA POLICE DEPT,1005-01-128-9936,"RIFLE,5.56 MILLIMETER",1,Each,749.00,D,1.0,2014-09-22
138,1016,MA,SOUTHBRIDGE POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0,2011-11-30
153,843,MA,REVERE POLICE DEPT,1005-00-726-5655,"PISTOL,CALIBER .45,AUTOMATIC",1,Each,58.71,D,1.0,2011-12-23
159,830,MA,REVERE POLICE DEPT,1670-01-338-9875,"HOLSTER,HARNESS PAR",4,Each,41.44,B,3.0,2010-08-25


# Clustering Code

In [16]:
def rescale_list(mylist, new_max, new_min, current_max, current_min):
    """Linearly map each value from [current_min, current_max] onto [new_min, new_max].

    Parameters
    ----------
    mylist : iterable of numbers
        Values to rescale; it is not modified.
    new_max, new_min : number
        Bounds of the target range.
    current_max, current_min : number
        Bounds of the source range.

    Returns
    -------
    list
        One rescaled value per element of mylist, in the same order.
    """
    return [
        ((new_max - new_min) * (value - current_min)) / (current_max - current_min) + new_min
        for value in mylist
    ]

In [17]:
def evaluate_clusters(X,max_clusters):
    """Plot the k-means inertia for every cluster count from 1 to max_clusters.

    For each k, a KMeans model (k-means++ initialisation, 20 restarts) is
    fitted to X and its inertia_ is recorded. The recorded values are drawn
    as a line on a new 20x10 figure.

    Parameters
    ----------
    X : data accepted by KMeans.fit
    max_clusters : int
        Largest number of clusters to try (inclusive).
    """
    # Slot k holds the inertia for k clusters; slot 0 is a placeholder that
    # is never plotted.
    inertia_by_k = np.zeros(max_clusters + 1)
    inertia_by_k[0] = 0
    candidate_ks = range(1, max_clusters + 1)
    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=20)
        model.fit(X)
        inertia_by_k[n_clusters] = model.inertia_

    plt.figure(figsize=(20, 10))
    plt.plot(candidate_ks, inertia_by_k[1:])
    plt.xlabel('Number of clusters', {'fontsize': 20})
    plt.ylabel('Error', {'fontsize': 20})
    plt.xticks(size=20)
    plt.yticks(size=20)

In [18]:
def sc_evaluate_clusters(X,max_clusters):
    """Plot the silhouette score of k-means clusterings of X.

    For every k from 2 to max_clusters (inclusive), a KMeans model
    (k-means++ initialisation, 20 restarts) is fitted to X and the Euclidean
    silhouette score of its labels is recorded. The scores are drawn as a
    line on a new 20x10 figure.

    Parameters
    ----------
    X : data accepted by KMeans.fit and silhouette_score
    max_clusters : int
        Largest number of clusters to try (inclusive).
    """
    silhouette_by_k = np.zeros(max_clusters + 1)
    # Slots 0 and 1 are placeholders; only k >= 2 is scored and plotted.
    silhouette_by_k[0] = silhouette_by_k[1] = 0
    candidate_ks = range(2, max_clusters + 1)
    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=20)
        labels = model.fit_predict(X)
        silhouette_by_k[n_clusters] = metrics.silhouette_score(X, labels, metric='euclidean')

    plt.figure(figsize=(20, 10))
    plt.plot(candidate_ks, silhouette_by_k[2:])
    plt.xlabel('Number of clusters', {'fontsize': 20})
    plt.ylabel('Silhouette Score', {'fontsize': 20})
    plt.xticks(size=20)
    plt.yticks(size=20)

In [19]:
def sc_evaluate_clusters_heir(Z,data,max_clusters):
    """Plot the silhouette score of flat clusterings cut from a linkage.

    For every k from 2 to max_clusters (inclusive), the linkage Z is cut
    with fcluster(criterion='maxclust') and the Euclidean silhouette score
    of the resulting labels on data is recorded. The scores are drawn as a
    line on a new 20x10 figure.

    Parameters
    ----------
    Z : linkage accepted by scipy.cluster.hierarchy.fcluster
    data : observations passed to silhouette_score alongside the labels
    max_clusters : int
        Largest value of k to try (inclusive).
    """
    # Slot k holds the score for k clusters; slots 0 and 1 stay 0 and are
    # never plotted.
    silhouette_by_k = np.zeros(max_clusters + 1)
    candidate_ks = range(2, max_clusters + 1)
    for n_clusters in candidate_ks:
        flat_labels = hierarchy.fcluster(Z, n_clusters, criterion='maxclust')
        silhouette_by_k[n_clusters] = metrics.silhouette_score(data, flat_labels, metric='euclidean')

    plt.figure(figsize=(20, 10))
    plt.xticks(size=20)
    plt.yticks(size=20)
    plt.xlabel('Number of clusters', {'fontsize': 20})
    plt.ylabel('Silhouette Score', {'fontsize': 20})
    plt.plot(candidate_ks, silhouette_by_k[2:])

In [20]:
def sc_evaluate_clusters_gmm(X,max_clusters):
    """Plot the silhouette score of Gaussian-mixture clusterings of X.

    For every component count k from 2 to max_clusters (inclusive), a
    full-covariance GaussianMixture is fitted to X, each sample is assigned
    a component with predict(), and the Euclidean silhouette score of that
    labelling is recorded. The scores are drawn as a line on a new 20x10
    figure.

    Parameters
    ----------
    X : data accepted by GaussianMixture.fit and silhouette_score
    max_clusters : int
        Largest number of mixture components to try (inclusive).

    Returns
    -------
    None
    """
    # s[k] holds the score for k components. np.zeros already leaves slots 0
    # and 1 at 0, and they are never plotted.
    s = np.zeros(max_clusters+1)
    for k in range(2,max_clusters+1):
        gmm = mixture.GaussianMixture(n_components=k, covariance_type='full')
        gmm.fit(X)
        s[k] = metrics.silhouette_score(X,gmm.predict(X),metric='euclidean')

    plt.figure(figsize=(20,10))
    plt.plot(range(2,len(s)),s[2:])
    plt.xlabel('Number of clusters', {'fontsize':20})
    plt.xticks(size=20)
    plt.ylabel('Silhouette Score', {'fontsize':20})
    plt.yticks(size=20)

In [45]:
def list_to_string(review_list):
    """Join each inner sequence of review_list into one space-separated string.

    Every item of an inner sequence is converted with str() before joining.

    Parameters
    ----------
    review_list : iterable of iterables
        Outer collection whose elements are themselves iterated item by item.

    Returns
    -------
    list of str
        One joined string per element of review_list, in the same order.
    """
    return [" ".join(map(str, tokens)) for tokens in review_list]

# Kmeans

In [41]:
# Start from the training rows and drop the columns not used as features.
Feature_vector = Master_training.drop(columns=['Ship Date', 'State', 'original_index'])
#del Feature_vector['DEMIL Code']
#del Feature_vector['UI']
#del Feature_vector['NSN']

# Print the Python type of the first value in each candidate feature column.
# .iloc[0] is positional: the training frame keeps the shuffled frame's row
# labels, so a label lookup of 0 only works when that row was sampled.
for column in ['Station Name (LEA)', 'Item Name', 'Quantity', 'UI',
               'Acquisition Value', 'DEMIL Code', 'DEMIL IC']:
    print(type(Feature_vector[column].iloc[0]))

# NOTE(review): several of these columns hold strings. The clustering cell
# that consumes Feature_vector recorded "could not convert string to float",
# so a numeric encoding step is still needed before clustering.
Feature_vector

<class 'str'>
<class 'str'>
<class 'numpy.int64'>
<class 'str'>
<class 'numpy.float64'>
<class 'str'>
<class 'numpy.float64'>


Unnamed: 0,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC
902,WATERTOWN POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0
44,NEW BEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0
1189,REHOBOTH POLICE,2355-01-553-4634,MINE RESISTANT VEHICLE,1,Each,658000.00,C,1.0
37,NORTON POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0
27,MEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0
193,SOUTHBRIDGE POLICE DEPT,1005-00-726-5655,"PISTOL,CALIBER .45,AUTOMATIC",1,Each,58.71,D,1.0
1057,MONSON POLICE DEPT,1005-00-589-1271,"RIFLE,7.62 MILLIMETER",1,Each,138.00,D,1.0
824,STURBRIDGE POLICE DEPT,1005-01-128-9936,"RIFLE,5.56 MILLIMETER",1,Each,749.00,D,1.0
472,NEW BEDFORD POLICE DEPT,1005-00-073-9421,"RIFLE,5.56 MILLIMETER",1,Each,499.00,D,1.0
22,WAREHAM POLICE DEPT,5360-00-979-3931,"SPRING,HELICAL,COMPRESSION",10,Each,0.27,B,3.0


In [46]:
# Ad-hoc list of words used to try out list_to_string in the next cell.
yup = ['nice', 'open', 'bar', 'with','great', 'view', 'of', 'the']
yup

['nice', 'open', 'bar', 'with', 'great', 'view', 'of', 'the']

In [47]:
# NOTE(review): list_to_string joins the items of each element, and each
# element here is a str, so the definition in this file would yield
# 'n i c e' for 'nice'. The recorded output shows the words unchanged, so it
# was presumably produced by a different definition -- re-run to confirm.
list_to_string(yup)

['nice', 'open', 'bar', 'with', 'great', 'view', 'of', 'the']

In [40]:
# Plots the k-means error (inertia) curve for 1 to 25 clusters.
# NOTE(review): the recorded run raised ValueError because Feature_vector
# still contains string columns; they need a numeric encoding first.
evaluate_clusters(Feature_vector, 25)

ValueError: could not convert string to float: 'RIFLE,7.62 MILLIMETER'

In [None]:
# Plots the k-means silhouette score for 2 to 25 clusters.
sc_evaluate_clusters(Feature_vector,25) 

In [None]:
# Fit the final k-means model (5 clusters, 100 restarts), keeping the
# per-row labels and the cluster centroids for the plot below.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=100)
kmeans_labels = kmeans.fit_predict(Feature_vector)
kmeans_centroids = kmeans.cluster_centers_

#stem_data['kmean_cluster'] = kmeans_labels

In [None]:
# Generates Cluster Graph
# The figure size has to be set before the first pyplot call creates the
# figure; set afterwards it only affects figures created later.
plt.rcParams["figure.figsize"] = (20,20)
# NOTE(review): stem_data is not defined in the visible cells, and the title
# text appears to be carried over from a different analysis -- confirm both
# before running.
plt.title('Las Vegas Yelp Restuarants 10 Reviews: colors are clusters assigned by k-means.', {'fontsize':28})
plt.xticks(size=20)
plt.yticks(size=20)
# One color per point, looked up by its k-means label.
cols = [matplotlib_colors[p] for p in kmeans_labels]
plt.scatter(stem_data['rescale_longitude'], stem_data['rescale_latitude'], s=50, c=cols)
# Centroids (first two coordinates, scaled by 10) drawn as large black markers.
plt.scatter(kmeans_centroids[:,0]*10, kmeans_centroids[:,1]*10, s = 250, c='k')
plt.show()

# Hierarchy

In [None]:
# Builds the Ward linkage (Euclidean metric) over Feature_vector, then plots
# the silhouette score of its flat clusterings for 2 to 25 clusters.
Z = hierarchy.linkage(Feature_vector, method='ward', metric='euclidean')
sc_evaluate_clusters_heir(Z,Feature_vector,25)

In [None]:
# Creates Dendrogram
# Draws the linkage Z with truncate_mode='level' and p=4, on a 20x15 figure.
plt.figure(figsize=(20,15))
plt.xticks(size=20)
plt.yticks(size=20)
R = hierarchy.dendrogram(Z, p=4, truncate_mode='level', show_leaf_counts=True)

In [None]:
# Creates hierarchy fit and labeling
# Cuts the linkage Z into at most 3 flat clusters (criterion='maxclust').
# The later cells subtract 1 from these labels before using them as indices.
hier_labels = hierarchy.fcluster(Z, 3, criterion='maxclust')
#stem_data['hier_cluster'] = hier_labels

In [None]:
# Creates hierarchy centroids
# NOTE(review): stem_data is not defined in the visible cells, and the line
# that would assign its 'hier_cluster' column is commented out in the
# previous cell -- confirm where this frame comes from before running.
hier_groups = stem_data.groupby('hier_cluster')
# One row per cluster label; the width of 102 is hard-coded.
# NOTE(review): presumably the length of each stored feature vector -- verify.
hier_centroid = np.zeros(shape=(max(hier_labels),102))

# Cluster numbers are shifted down by 1 to index rows of hier_centroid.
for hier_cluster_num, hier_groups_data in hier_groups:
    hier_centroid[hier_cluster_num-1] = hier_groups_data['Feature_vector'].mean()

In [None]:
# Generates Cluster Graph
# The figure size has to be set before the first pyplot call creates the
# figure; set afterwards it only affects figures created later.
plt.rcParams["figure.figsize"] = (20,20)
# NOTE(review): stem_data is not defined in the visible cells, and the title
# text appears to be carried over from a different analysis -- confirm both
# before running.
plt.title('Las Vegas Yelp Restuarants 10 Reviews: colors are clusters assigned by hierarchy.', {'fontsize':28})
plt.xticks(size=20)
plt.yticks(size=20)
# One color per point; labels are shifted down by 1 to index the color list.
cols = [matplotlib_colors[p-1] for p in hier_labels]
plt.scatter(stem_data['rescale_longitude'], stem_data['rescale_latitude'], s=50, c=cols)
# Centroids (first two coordinates, scaled by 10) drawn as large black markers.
plt.scatter(hier_centroid[:,0]*10, hier_centroid[:,1]*10, s = 250, c='k')
plt.show()

# GMM

In [None]:
# Plots the Gaussian-mixture silhouette score for 2 to 25 components.
sc_evaluate_clusters_gmm(Feature_vector,25) 

In [None]:
# Fit a 9-component, full-covariance Gaussian mixture to Feature_vector, then
# read off the component means and the per-row component labels.
gmm = mixture.GaussianMixture(covariance_type='full', n_components=9).fit(Feature_vector)

gmm_centroids = gmm.means_
gmm_labels = gmm.predict(Feature_vector)

#stem_data['gmm_cluster'] = gmm_labels

In [None]:
# Generates Cluster Graph
# NOTE(review): stem_data is not defined in the visible cells, and the title
# text appears to be carried over from a different analysis -- confirm both
# before running.
plt.rcParams["figure.figsize"] = (20,20)
plt.title('Las Vegas Yelp Restuarants 10 Reviews: colors are clusters assigned by GMM.', {'fontsize':28})
plt.xticks(size=20)
plt.yticks(size=20)
# One color per point, looked up by its GMM component label.
colors = [matplotlib_colors[p] for p in gmm_labels]
_ = plt.scatter(stem_data['rescale_longitude'], stem_data['rescale_latitude'], color=colors, s=50, alpha=0.8)
# Component means (first two coordinates, scaled by 10) drawn as large black markers.
plt.scatter(gmm_centroids[:,0]*10, gmm_centroids[:,1]*10, s = 250, c='k')