In [4]:
import pandas as pd
import numpy as np
import _pickle as pickle

We are going to recluster new profiles

In [5]:
with open('profiles.pkl', 'rb') as f:
    profiles_df = pickle.load(f)

with open('clustered_profiles.pkl', 'rb') as f:
    cluster_df = pickle.load(f)

In [6]:
cluster_df.head()

Unnamed: 0,bios,games,music,movies,jokes,Cluster #
0,Hipster-friendly explorer. Beer trailblazer. I...,4.0,9.0,4.0,4.0,0
1,Lifelong music scholar. Typical zombie evangel...,6.0,5.0,9.0,9.0,0
2,Writer. Friendly twitter scholar. Hardcore rea...,7.0,5.0,2.0,4.0,8
3,Pop culture buff. Reader. Certified gamer. Web...,8.0,9.0,6.0,8.0,7
4,General organizer. Troublemaker. Certified alc...,7.0,7.0,0.0,5.0,0


In [7]:
# Instantiating a new DF row to append later
new_profile = pd.DataFrame(columns=profiles_df.columns)

# Adding random values for new data
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Printing an user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data
new_profile['bios'] = input("Enter a Bio for yourself: ")

# Indexing that new profile data
new_profile.index = [profiles_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.


In [8]:
new_cluster = profiles_df.append(new_profile)

  new_cluster = profiles_df.append(new_profile)


In [9]:
#import minmaxscaler
from sklearn.preprocessing import MinMaxScaler
#import countVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

In [12]:
## Scaling
# Instantiating the Scaler
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values
df = new_cluster[['bios']].join(pd.DataFrame(scaler.fit_transform(new_cluster.drop('bios', axis=1)), columns=new_cluster.columns[1:], index=new_cluster.index))


## Vectorizing
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
new_df = pd.concat([df, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
new_df.drop('bios', axis=1, inplace=True)



In [14]:
from sklearn.decomposition import PCA

In [15]:
## PCA

# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Finding the exact number of features that explain at least 99% of the variance in the dataset
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_99 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_99 = new_df.shape[1] - n_over_99

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_99)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(new_df)

In [22]:
#import tqdm
from tqdm import tqdm
#import AgglomerativeClustering
from sklearn.cluster import AgglomerativeClustering
#import silhouette_score and davies_bouldin_score
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [23]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 20, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
s_scores = []

db_scores = []

# Looping through different iterations for the number of clusters
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    hac = AgglomerativeClustering(n_clusters=i)
    
    hac.fit(df_pca)
    
    cluster_assignments = hac.labels_
    
    # Appending the scores to the empty lists    
    s_scores.append(silhouette_score(df_pca, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(df_pca, cluster_assignments))

100%|██████████| 18/18 [00:52<00:00,  2.94s/it]


In [24]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])
    
    
print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)


The Silhouette Coefficient Score (find max score):
Max Value:
Cluster #    Cluster Score
2       0.050266

Min Value:
Cluster #    Cluster Score
4       0.028735

The Davies-Bouldin Score (find minimum score):
Max Value:
Cluster #    Cluster Score
4       5.233854

Min Value:
Cluster #    Cluster Score
2       3.510583


In [26]:
## Running HAC
# Instantiating HAC
hac = AgglomerativeClustering(n_clusters=12)

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Unscaling the categories then replacing the scaled values
df = df[['bios']].join(pd.DataFrame(scaler.inverse_transform(df.drop('bios', axis=1)), columns=df.columns[1:], index=df.index))

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments


## Finding the Exact Cluster for our New Profile
# Getting the Cluster # for the new profile
profile_cluster = df.loc[new_profile.index]['Cluster #'].values[0]

# Using the Cluster # to narrow down the DF
profile_df = df[df['Cluster #']==profile_cluster].drop('Cluster #', axis=1)

In [None]:
## Vectorizing
# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(profile_df['bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=profile_df.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF
profile_df = profile_df.join(cluster_v, how='left', lsuffix='_left', rsuffix='_right').drop('bios', axis=1)


## Correlation
# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = profile_df.T.corr()

In [38]:
# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]
print(new_profile)

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr[[user_n]].sort_values(by=[user_n],axis=0, ascending=False)[1:11]

# Displaying the Top 10
profiles_df.loc[top_10_sim.index]

                              bios  games  music  movies  jokes
6825  Lifetime learner. Tech lover      8      1       1      2


Unnamed: 0,bios,games,music,movies,jokes
5318,General coffee practitioner. Twitter nerd. Cer...,9,2,3,2
3235,Food aficionado. Pop cultureaholic. Passionate...,8,1,0,3
4929,General troublemaker. Subtly charming pop cult...,9,1,0,3
4854,General troublemaker. Subtly charming pop cult...,9,2,0,4
2414,Extreme pop culture lover. Music expert. Bacon...,8,2,2,4
5505,Bacon expert. Creator. Introvert. Passionate p...,9,3,2,3
1866,Award-winning communicator. Evil web fanatic. ...,9,2,4,4
4149,Reader. Internet fanatic. Food expert. Profess...,9,3,3,4
4516,Extreme gamer. Pop culture specialist. Unapolo...,9,2,4,3
5471,Alcoholaholic. Coffee lover. Lifelong gamer. M...,7,0,1,1
