# Collection Name Recommendation Engine
The goal is to cluster together collection names, and then given a query return similar collection names.

# Collection Name Recommendation Engine
The goal is to cluster together collection names, and then given a query return similar collection names.

In [56]:
# imports 
import math
import os
import pickle

import distance
import numpy as np
import pymongo
from sklearn.cluster import AffinityPropagation

In [57]:
# Data Collection
# The connection string carries credentials, so it is read from the MONGO_URI
# environment variable instead of being stored in the notebook.
# NOTE(review): the password that used to be hardcoded here has been exposed
# and should be rotated.
MONGO_STRING = os.environ["MONGO_URI"]
CLIENT = pymongo.MongoClient(MONGO_STRING)
collection = CLIENT.CollectionDB.collections
# Keep only the three fields that the similarity computation reads.
collection_names = [
    {
        "name": doc["name"],
        "reddit_members": doc["reddit_members"],
        "twitter_followers": doc["twitter_followers"],
    }
    for doc in collection.find({})
]

## Clustering
The goal is to cluster similar collection names together. The plan is to project the names into a feature space and then group names that are close to each other. I want to use unsupervised learning to place names that look similar in the same cluster.

To find similarities between two collection names, we need a way of calculating how close two words are. One measure is the minimum number of single-character edits needed to turn one word into the other, known as the Levenshtein distance.

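We will generate a 2D matrix that holds the Levenshtein distance between every pair of collection names. In the code below, the absolute differences in Reddit members and Twitter followers are added to that distance, and the total is negated so that larger values mean "more similar".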

### Affinity Propagation
Affinity Propagation is a clustering algorithm that takes a matrix of pairwise similarities and returns clusters of similar items, each represented by one of its members (the cluster head).

In [58]:
# Data Preprocessing
# Hold the records in a NumPy array so they can be selected with index arrays.
collection_names = np.array(collection_names)

# Pairwise similarity matrix: the negated sum of the name edit distance and the
# absolute gaps in Reddit members and Twitter followers.
lev_similarity = -1 * np.array([
    [
        distance.levenshtein(col_rec["name"], row_rec["name"])
        + abs(col_rec["reddit_members"] - row_rec["reddit_members"])
        + abs(col_rec["twitter_followers"] - row_rec["twitter_followers"])
        for col_rec in collection_names
    ]
    for row_rec in collection_names
])

In [59]:
# Model Definition
# The similarity matrix is supplied directly, hence affinity="precomputed".
# random_state is pinned so that re-running the notebook yields the same clusters.
affprop = AffinityPropagation(affinity="precomputed", damping=0.6, max_iter=400, random_state=0)
affprop.fit(lev_similarity)

AffinityPropagation(affinity='precomputed', damping=0.6, max_iter=400)

In [60]:
# Print every cluster as "<index> - *<head record>:* <sorted unique member names>".
for position, label in enumerate(np.unique(affprop.labels_)):
    # The record the model selected as this cluster's centre.
    exemplar = collection_names[affprop.cluster_centers_indices_[label]]
    # All records whose label matches this cluster.
    in_cluster = collection_names[affprop.labels_ == label]
    member_names = np.unique([record["name"] for record in in_cluster])
    print("cluster %i - *%s:* %s" % (position, exemplar, ", ".join(member_names)))

cluster 0 - *{'name': 'cryptopunks', 'reddit_members': 760, 'twitter_followers': 31}:* cryptopunks
cluster 1 - *{'name': 'ape-gang', 'reddit_members': 0, 'twitter_followers': 0}:* alan-2020, animalpunk-gang, ape-avatars, ape-dao-legends, ape-dao-remix, ape-gang, ape-nfts, ape-verse, ape-yacht-1, ape8bits, apes-a-crapp-n, bango-man, blink-g, deadpool-panic, degen-gang-22, framergence, hapebeast-gang-4, hapebeast-gang-5, hapegang, happy-2022, i-m-groot, machine-s-learning, ntf-bag, oiledkongz, pegaxypega-1, psyguanas, pxlfangss, rare-cranium-0001, saibagangs, snoop-dogg-gang-1, surreal-man, the-cow-gang, the-wild-apes-gang, token-2476, toucan-gang
cluster 2 - *{'name': 'animetas', 'reddit_members': 0, 'twitter_followers': 0}:* ancientbits, angrymeerkatsnft, animal-s-1, animetas, blackletters, dancing-tetragons, dankpepemarketplace, heavencomputer, ibamsterdam, jadu-jetpack, junkyardmekas, la-espera, maticmike, midnightelves, oh-myfish, onigiri-ya, spacemetao, unique-1s, vinimahand, zombi

In [61]:
# Count the distinct labels the model assigned.
cluster_count = np.unique(affprop.labels_).size
print("Number of clustsers: ", cluster_count)

Number of clustsers:  289


## Evaluation
The problem with this model is that I can't ask it to predict the cluster of a new word directly, because it works from the Levenshtein distances between all pairs of known words. This is good for clustering, but I want to be able to project a new word into a feature space and then predict which cluster it is closest to. For now, what I can do is calculate the Levenshtein distance from the query to each cluster head and select the closest one.

In [62]:
def predict_word(model, word, reddit_members, twitter_followers, collection_names):
    """Return the names in the cluster whose head is closest to the query.

    The distance to each cluster head is the Levenshtein distance between the
    head's name and ``word`` plus the absolute differences in Reddit members
    and Twitter followers. The first head with the smallest distance wins.

    Args:
        model: Fitted clustering model exposing ``labels_`` and
            ``cluster_centers_indices_``.
        word: Query collection name.
        reddit_members: Reddit member count of the query.
        twitter_followers: Twitter follower count of the query.
        collection_names: NumPy array of the record dicts the model was fitted
            on, each with "name", "reddit_members" and "twitter_followers".

    Returns:
        Sorted array of the unique names in the selected cluster.
    """
    labels = np.unique(model.labels_)

    scores = []
    for label in labels:
        head = collection_names[model.cluster_centers_indices_[label]]
        scores.append(
            distance.levenshtein(head["name"], word)
            + abs(head["reddit_members"] - reddit_members)
            + abs(head["twitter_followers"] - twitter_followers)
        )
    scores = np.array(scores)

    # Position of the first smallest score.
    best = min(range(len(scores)), key=lambda position: scores[position])
    chosen = model.labels_ == labels[best]
    return np.unique([entry["name"] for entry in collection_names[np.nonzero(chosen)]])

In [63]:
# Example query: an unseen name with 40 Reddit members and 40 Twitter followers.
predict_word(
    model=affprop,
    word="foo",
    reddit_members=40,
    twitter_followers=40,
    collection_names=collection_names,
)

array(['tinygoblinclub'], dtype='<U14')

# Conclusion
Using affinity propagation I was able to cluster collection names together. Then, given a query word, I calculated the Levenshtein distance to each cluster head, found the most similar cluster, and displayed that cluster to show relevant collection names.

In [64]:
class PredictionModel:
    """Cluster collection records and recommend names similar to a query.

    Records are compared with a distance made of the Levenshtein distance
    between their names plus the absolute differences in Reddit members and
    Twitter followers. ``train`` clusters the records with affinity
    propagation; ``predict`` returns names from the cluster whose head is
    closest to a query.
    """

    def __init__(self, collection_names):
        """Store the records to cluster.

        Args:
            collection_names: Sequence of dicts with the keys "name",
                "reddit_members" and "twitter_followers".
        """
        self.collection_names = np.array(collection_names)
        self.lev_similarity = []
        self.model = None

    @staticmethod
    def _distance(first, second):
        """Return the combined name/popularity distance between two records."""
        return (
            distance.levenshtein(first["name"], second["name"])
            + abs(first["reddit_members"] - second["reddit_members"])
            + abs(first["twitter_followers"] - second["twitter_followers"])
        )

    def train(self, damping=0.6, max_iter=400, random_state=0):
        """Build the similarity matrix and fit the affinity propagation model.

        Args:
            damping: Damping factor passed to ``AffinityPropagation``.
            max_iter: Maximum number of iterations passed to the model.
            random_state: Seed passed to the model so that repeated training
                runs on the same data give the same clusters.
        """
        print("Computing Similarity Matrix")
        # Similarity is the negated distance: larger values mean more alike.
        self.lev_similarity = -1 * np.array([
            [self._distance(c1, c2) for c1 in self.collection_names]
            for c2 in self.collection_names
        ])
        print("Starting Model")
        self.model = AffinityPropagation(
            affinity="precomputed",
            damping=damping,
            max_iter=max_iter,
            verbose=True,
            random_state=random_state,
        )
        self.model.fit(self.lev_similarity)

    def predict(self, word, reddit_members=0, twitter_followers=0, limit=6):
        """Return names from the cluster whose head is closest to the query.

        Must be called after ``train``.

        Args:
            word: Query collection name.
            reddit_members: Reddit member count of the query.
            twitter_followers: Twitter follower count of the query.
            limit: Maximum number of names to return; ``None`` returns the
                whole cluster.

        Returns:
            Sorted array of unique names from the selected cluster, truncated
            to ``limit`` entries.
        """
        query = {
            "name": word,
            "reddit_members": reddit_members,
            "twitter_followers": twitter_followers,
        }
        cluster_ids = np.unique(self.model.labels_)
        head_distances = [
            self._distance(
                self.collection_names[self.model.cluster_centers_indices_[cluster_id]],
                query,
            )
            for cluster_id in cluster_ids
        ]
        # argmin returns the first smallest entry, so ties go to the lowest cluster id.
        nearest = cluster_ids[int(np.argmin(head_distances))]
        members = self.collection_names[self.model.labels_ == nearest]
        return np.unique([col["name"] for col in members])[:limit]

    def __str__(self):
        """Describe every cluster, one per line; safe to call before training."""
        if self.model is None:
            # Nothing to list yet; avoid raising from str() on an untrained model.
            return "PredictionModel(untrained)"
        lines = []
        for i, cluster_id in enumerate(np.unique(self.model.labels_)):
            cluster_head = self.collection_names[self.model.cluster_centers_indices_[cluster_id]]
            members = self.collection_names[self.model.labels_ == cluster_id]
            cluster = np.unique([col["name"] for col in members])
            lines.append("cluster %i - *%s:* %s" % (i, cluster_head, ", ".join(cluster)))
        return "".join(line + "\n" for line in lines)

In [65]:
# Wrap the loaded records in the reusable model class (not trained yet).
model = PredictionModel(collection_names=collection_names)

In [68]:
# Train with the class defaults, spelled out here for the reader.
model.train(damping=0.6, max_iter=400)

Computing Similarity Matrix
Starting Model
Converged after 109 iterations.


In [69]:
# Persist the trained model next to the API code.
with open('../api/collection_model', 'wb') as model_file:
    pickle.dump(model, model_file)

In [70]:
# Round-trip check: reload the pickled model and query it.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open('../api/collection_model', 'rb') as model_file:
    y = pickle.load(model_file)

y.predict(word="ape-age")

array(['alpha-eagles', 'animalpunk-gang', 'ape-avatars',
       'ape-dao-legends', 'ape-dao-remix', 'ape-gang'], dtype='<U18')