In [74]:
from google.colab import drive 
drive.mount('/content/drive',force_remount=True) 

Mounted at /content/drive


In [None]:
import os
os.chdir("/content/drive/MyDrive/Web Mining")

In [None]:
!pip install kmodes



In [None]:
!pip install uszipcode



In [None]:
!pip install --upgrade kmodes



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from kmodes.kmodes import KModes
from tqdm import tqdm
from uszipcode import SearchEngine
%matplotlib inline



In [None]:
## Load movies data
movies = pd.read_csv('./movie_final_full_dataset.csv', encoding="ISO-8859-1", skipinitialspace=True)
ratings = pd.read_csv('./ml-1m/ratings.dat', sep="::", names=['UserID', 'MovieID', 'Rating', 'Timestamp'], skipinitialspace=True)
users = pd.read_csv('ml-1m/users.dat', sep="::", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],skipinitialspace=True) 

  return func(*args, **kwargs)


In [None]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [None]:
users['Gender'] = users['Gender'].astype('category')
users['Age'] = users['Age'].astype('category')
users['Occupation'] = users['Occupation'].astype('category')

In [None]:
users = users.set_index('UserID')
users

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,02460
5,M,25,20,55455
...,...,...,...,...
6036,F,25,15,32603
6037,F,45,1,76006
6038,F,56,1,14706
6039,F,45,0,01060


## Without Using Zip-code

In [38]:
users_copy = users.drop('Zip-code', axis =1)

In [39]:
users_copy

Unnamed: 0_level_0,Gender,Age,Occupation
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,F,1,10
2,M,56,16
3,M,25,15
4,M,45,7
5,M,25,20
...,...,...,...
6036,F,25,15
6037,F,45,1
6038,F,56,1
6039,F,45,0


In [None]:
# Elbow curve to find optimal K
cost = []
K = range(1,100)
for num_clusters in tqdm(list(K)):
    kmode = KModes(n_clusters=num_clusters, init = "random", n_init = 5, verbose=1)
    kmode.fit_predict(users_copy)
    cost.append(kmode.cost_)
    
plt.plot(K, cost, 'bx-')
plt.xlabel('No. of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

In [40]:
# Building the model with 20 clusters
kmode = KModes(n_clusters=20, init = "random", n_init=10 , verbose=1, random_state=42)
clusters = kmode.fit_predict(users_copy)
clusters

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 2588, cost: 4233.0
Run 1, iteration: 2/100, moves: 256, cost: 4233.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 499, cost: 4178.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 443, cost: 4324.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 1328, cost: 4422.0
Run 4, iteration: 2/100, moves: 91, cost: 4422.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 1337, cost: 4382.0
Run 5, iteration: 2/100, moves: 47, cost: 4382.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 6, iteration: 1/100, moves: 558, cost: 4993.0
Run 6, iteration: 2/100, moves: 9, cost: 4993.0
Init: initializing ce

array([19,  3,  0, ...,  2,  2,  0], dtype=uint16)

In [41]:
users_copy.insert(0, "Cluster", clusters, True)

In [42]:
# Sort users by Cluster number
users_copy = users_copy.sort_values(by = 'Cluster')

In [None]:
users_copy

In [44]:
users_copy['Cluster'].value_counts()

0     1255
1      815
12     534
9      368
2      355
5      322
7      312
8      266
3      251
15     224
13     206
4      205
11     196
19     181
14     172
10     116
6      103
17      66
18      52
16      41
Name: Cluster, dtype: int64

In [45]:
# Function to split groups evenly
def splitList (lst, n):
    it = iter(lst)
    new = [[next(it) for _ in range(n)] for _ in range(len(lst) // n)]

    for i, x in enumerate(it):
        new[i].append(x)

    return new

In [46]:
users_copy.reset_index(level=0, inplace=True)

In [47]:
users_copy

Unnamed: 0,UserID,Cluster,Gender,Age,Occupation
0,6040,0,M,25,6
1,3961,0,M,25,1
2,3966,0,M,25,0
3,3968,0,M,25,2
4,1835,0,M,25,19
...,...,...,...,...,...
6035,3904,19,M,1,10
6036,940,19,M,1,10
6037,2217,19,M,1,10
6038,5063,19,F,1,10


In [57]:
# Splitting clusters into groups and combining list of groups into single list
groupsli = []
for i in users_copy['Cluster'].unique():
  li2=users_copy['UserID'].loc[users_copy['Cluster']==i]
  li3 = splitList(li2,10)
  groupsli.extend(li3)

596


In [70]:
users_copy['Group']=0

In [71]:
# Allocating groups based on index of combined list
for j in groupsli:
  for k in j:
    users_copy['Group'].loc[users_copy['UserID']==k] = groupsli.index(j)+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [72]:
users_copy

Unnamed: 0,UserID,Cluster,Gender,Age,Occupation,Group
0,6040,0,M,25,6,1
1,3961,0,M,25,1,1
2,3966,0,M,25,0,1
3,3968,0,M,25,2,1
4,1835,0,M,25,19,1
...,...,...,...,...,...,...
6035,3904,19,M,1,10,596
6036,940,19,M,1,10,596
6037,2217,19,M,1,10,596
6038,5063,19,F,1,10,596


In [73]:
# Exporting datset
users_copy.to_csv("./Abhay/users_grouped.csv", index=False)

## Using Zip-code 

**We are not using the code from here but keeping it to show that we tried including Zip-code as a feature**

In [None]:
users['Zip-code'] = users['Zip-code'].str[:5]

In [None]:
def get_state(zipC):
  try:
    engine = SearchEngine()
    zipcode = engine.by_zipcode(zipC)
    return zipcode.state
  except:
    return "Others"

In [None]:
users['State'] = users['Zip-code'].apply(get_state)

In [None]:
users = users.drop(['Zip-code'],axis =1)

In [None]:
users['State'] = users['State'].astype('category')

In [None]:
# Elbow curve to find optimal K
cost = []
K = range(1,100)
for num_clusters in tqdm(list(K)):
    kmode = KModes(n_clusters=num_clusters, init = "random", n_init = 5, verbose=1)
    kmode.fit_predict(users)
    cost.append(kmode.cost_)
    
plt.plot(K, cost, 'bx-')
plt.xlabel('No. of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
users_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6040 entries, 1 to 6040
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Gender      6040 non-null   category
 1   Age         6040 non-null   category
 2   Occupation  6040 non-null   category
dtypes: category(3)
memory usage: 195.1 KB


In [None]:
# Building the model with 18 clusters
kmode = KModes(n_clusters=27, init = "random", n_init = 5, verbose=1)
clusters = kmode.fit_predict(users)
clusters

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 1176, cost: 8661.0
Run 1, iteration: 2/100, moves: 76, cost: 8661.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 951, cost: 8978.0
Run 2, iteration: 2/100, moves: 303, cost: 8931.0
Run 2, iteration: 3/100, moves: 24, cost: 8931.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 789, cost: 9222.0
Run 3, iteration: 2/100, moves: 8, cost: 9222.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 639, cost: 9300.0
Run 4, iteration: 2/100, moves: 18, cost: 9300.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 1042, cost: 8640.0
Run 5, iteration: 2/100, moves: 163, cost: 8623.0
Run 5, iteration: 3/100, moves: 7, cost: 8623.0
Best 

array([11, 17, 19, ...,  6, 22,  5], dtype=uint16)

In [None]:
users.insert(0, "Cluster", clusters, True)

In [None]:
users.sort_values(by = 'Cluster')

Unnamed: 0_level_0,Cluster,Gender,Age,Occupation,State
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2464,0,F,18,4,OH
3202,0,F,18,4,VA
3197,0,F,18,14,CA
3196,0,F,18,4,OH
3891,0,F,56,16,CA
...,...,...,...,...,...
3117,26,M,35,0,ID
4985,26,M,35,0,SC
5045,26,M,35,0,OH
2059,26,M,35,0,OH


In [None]:
users['Cluster'].value_counts()

5     763
3     552
0     495
1     431
18    349
8     343
7     327
10    318
2     309
9     285
4     212
21    205
6     202
11    183
17    166
20    162
22    132
12    102
23     82
26     76
24     71
25     70
15     63
16     45
19     43
13     36
14     18
Name: Cluster, dtype: int64

In [None]:
print(users.loc[users['Cluster']==0])

        Cluster Gender Age Occupation State
UserID                                     
18            0      F  18          3    CA
34            0      F  18          0    MA
38            0      F  18          4    MA
41            0      F  18          4    PA
47            0      M  18          4    CA
...         ...    ...  ..        ...   ...
5959          0      F  18          4    NY
5985          0      F  18          4    TX
5992          0      F  18          4    MD
6028          0      M  18          4    CA
6031          0      F  18          0    OH

[495 rows x 5 columns]


In [None]:
print(users_copy.loc[users_copy['Cluster']==2])

        Cluster Gender Age Occupation
UserID                               
21            2      M  18         16
22            2      M  18         15
25            2      M  18          4
38            2      F  18          4
39            2      M  18          4
...         ...    ...  ..        ...
5992          2      F  18          4
5998          2      M  18          4
6008          2      M  18          4
6027          2      M  18          4
6028          2      M  18          4

[1006 rows x 4 columns]


In [None]:

range_n_clusters = range(1,20)

for n_clusters in range_n_clusters:

    # Initialize the clusterer with n_clusters value and a random generator
    # # seed of 10 for reproducibility.
    # clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    # cluster_labels = clusterer.fit_predict(X)

    clusterer = KModes(n_clusters=n_clusters, init = "random", n_init = 5, verbose=1)
    cluster_labels =  clusterer.fit_predict(users_copy)


    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(users_copy, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # # Compute the silhouette scores for each sample
    # sample_silhouette_values = silhouette_samples(X, cluster_labels)

    # y_lower = 10
    # for i in range(n_clusters):
    #     # Aggregate the silhouette scores for samples belonging to
    #     # cluster i, and sort them
    #     ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

    #     ith_cluster_silhouette_values.sort()

    #     size_cluster_i = ith_cluster_silhouette_values.shape[0]
    #     y_upper = y_lower + size_cluster_i

    #     color = cm.nipy_spectral(float(i) / n_clusters)
    #     ax1.fill_betweenx(
    #         np.arange(y_lower, y_upper),
    #         0,
    #         ith_cluster_silhouette_values,
    #         facecolor=color,
    #         edgecolor=color,
    #         alpha=0.7,
    #     )

    #     # Label the silhouette plots with their cluster numbers at the middle
    #     ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    #     # Compute the new y_lower for next plot
    #     y_lower = y_upper + 10  # 10 for the 0 samples

    # ax1.set_title("The silhouette plot for the various clusters.")
    # ax1.set_xlabel("The silhouette coefficient values")
    # ax1.set_ylabel("Cluster label")

    # # The vertical line for average silhouette score of all the values
    # ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    # ax1.set_yticks([])  # Clear the yaxis labels / ticks
    # ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # # 2nd Plot showing the actual clusters formed
    # colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    # ax2.scatter(
    #     X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    # )

    # # Labeling the clusters
    # centers = clusterer.cluster_centers_
    # # Draw white circles at cluster centers
    # ax2.scatter(
    #     centers[:, 0],
    #     centers[:, 1],
    #     marker="o",
    #     c="white",
    #     alpha=1,
    #     s=200,
    #     edgecolor="k",
    # )

    # for i, c in enumerate(centers):
    #     ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    # ax2.set_title("The visualization of the clustered data.")
    # ax2.set_xlabel("Feature space for the 1st feature")
    # ax2.set_ylabel("Feature space for the 2nd feature")

    # plt.suptitle(
    #     "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
    #     % n_clusters,
    #     fontsize=14,
    #     fontweight="bold",
    # )

# plt.show()

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 10934.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 10934.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 10934.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 10934.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 10934.0
Best run was number 1


  estimator=estimator,


ValueError: ignored

In [None]:
# Building the model with 3 clusters
kmode = KModes(n_clusters=3, init = "random", n_init = 5, verbose=1)
clusters = kmode.fit_predict(data)
clusters