# Collection Name Recommendation Engine
The goal is to cluster collection names and then, given a query, return similar collection names.

In [2]:
# imports 
import math
import os
import pickle

import distance
import numpy as np
import pymongo
from sklearn.cluster import AffinityPropagation

In [3]:
# Data Collection
# The connection string carries a username and password, so it is read from the
# MONGO_STRING environment variable instead of being stored in the notebook.
# NOTE(review): a connection string containing a password was previously
# committed in this cell; that credential should be rotated.
MONGO_STRING = os.environ["MONGO_STRING"]
CLIENT = pymongo.MongoClient(MONGO_STRING)
collection = CLIENT.CollectionDB.collections
# Only the "name" field is used, so ask the server to return just that field.
collection_names = [doc["name"] for doc in collection.find({}, {"name": 1})]

## Clustering
The goal is to cluster similar collection names together. The plan is to project the names into a feature space and then cluster the similar ones together. I want to use unsupervised learning to place names that look similar together.

To find similarities between two collection names, we need a way to measure how close two words are. One such measure is the minimum number of single-character edits needed to turn one word into the other. This is known as the Levenshtein distance.

We will generate a 2D matrix that holds the Levenshtein distance between every pair of collection names.

### Affinity Propagation
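Affinity Propagation is a clustering algorithm that takes a matrix of pairwise similarities between words and returns clusters of words that are similar to each other. Because it expects similarities rather than distances, the Levenshtein distances are negated below.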

In [31]:
# Data Preprocessing
collection_names = np.array(collection_names)
# Pairwise edit distances, negated so that a larger value means "more similar".
lev_similarity = -np.array([
    [distance.levenshtein(other, name) for other in collection_names]
    for name in collection_names
])

In [43]:
# Model Definition
# The similarity matrix is passed in directly, hence affinity="precomputed".
affprop = AffinityPropagation(
    affinity="precomputed",
    damping=0.6,
    max_iter=400,
)
affprop.fit(lev_similarity)

AffinityPropagation(affinity='precomputed', convergence_iter=15, copy=True,
                    damping=0.6, max_iter=400, preference=None, verbose=False)

In [50]:
# Print every cluster as "<position> - *<head>:* <comma-separated unique members>".
for position, label in enumerate(np.unique(affprop.labels_)):
    exemplar = collection_names[affprop.cluster_centers_indices_[label]]
    members = np.unique(collection_names[affprop.labels_ == label])
    print("cluster %i - *%s:* %s" % (position, exemplar, ", ".join(members)))

cluster 0 - *ape-gang:* animalpunk-gang, ape-gang, ape-nfts, ape-yacht-1, apesofneon, bango-man, caredino, casarugface, fanggangnft, framergence, grillzgang, hapegang, happy-2022, hok-main, latte-lounge, ntf-bag, rare-cranium-0001, saibagangs, saitamainu, the-cow-gang, the-wild-apes-gang, toucan-gang
cluster 1 - *deadheads:* bearxlabs, bofadeeznuts, bottleheads, creaturetoadz, dead-human, deadfellaz, deadheads, deadline-design, deadpool-panic, demonspawn, deneural, drunkendreams, dsfgdasgfdfsg, eth-heads, hashmasks, kuddle-koalas, lizard-lads, lysergiclabsacidheadz, midnightelves, octohedz, toddthetoad, trixels-heads, veefriends, wearetheoutkast, weedheadz, womenandweapons
cluster 2 - *sipherianflash:* 3d-african-masks, simplefi-badass-lisbeth, simulationplayers, sipherianflash, sipheriansurge, siphrian-flash-nft, siplahrian-flash, slpherian-fiash-neko, swingerpineapples
cluster 3 - *golden-token-crypto-new-yorkers:* golden-token-crypto-new-yorkers, golden-token-incomplete-control
clus

In [48]:
# Count the distinct cluster labels produced by the fit.
cluster_total = np.unique(affprop.labels_).size
print("Number of clustsers: ", cluster_total)

Number of clustsers:  230


## Evaluation
The problem with this model is that I can't get it to predict a result for a new word, since it works from the Levenshtein distance between all pairs of known words. This is good for clustering, but I want to be able to project a word into a feature space and then predict which cluster it is closest to. As of right now, what I can do is calculate the Levenshtein distance from the word to each cluster head and then select the cluster whose head is closest.

In [58]:
def predict_word(model, word, collection_names):
    """Return the unique names in the cluster whose head is closest to ``word``.

    Args:
        model: fitted clustering model exposing ``labels_`` and
            ``cluster_centers_indices_``.
        word: query string to compare against each cluster head.
        collection_names: NumPy array of names, indexed the same way as
            ``model.labels_``.

    Returns:
        Sorted NumPy array of the distinct names in the nearest cluster. On a
        tie, the first cluster with the minimum distance is chosen.
    """
    cluster_ids = np.unique(model.labels_)
    # Edit distance from the query to the head of every cluster.
    edits_to_heads = [
        distance.levenshtein(collection_names[model.cluster_centers_indices_[cid]], word)
        for cid in cluster_ids
    ]
    nearest = min(range(len(edits_to_heads)), key=edits_to_heads.__getitem__)
    in_nearest_cluster = model.labels_ == cluster_ids[nearest]
    return np.unique(collection_names[in_nearest_cluster])

In [59]:
# Show the cluster whose head has the smallest edit distance to "crypto-monk".
predict_word(affprop, "crypto-monk", collection_names)

array(['abstract-cryptopunks', 'crypto-angry-dog', 'cryptoburbs',
       'cryptoelvez', 'cryptoogoldpunks', 'cryptopron-1', 'cryptopunk',
       'cryptopunk-', 'cryptopunk-mosaic', 'cryptopunk-original',
       'cryptopunks-', 'cryptopunks-covid', 'cryptopython-v2',
       'cryptosaurs1', 'cryptoskullyz', 'cryptowashington',
       'crytpo-snake', 'dystopunks', 'grumpy-mummy', 'litecryptopunks',
       'lovely-cryptopunks', 'neural-crypto-punks'], dtype='<U49')

# Conclusion
Using affinity propagation, I was able to cluster collection names together. Then, for a given word, I calculated the Levenshtein distance to each cluster head, found the most similar cluster, and displayed that cluster to show the relevant collection names.

In [23]:
class Prediction_Model:
    """Cluster collection names by edit distance and suggest similar names.

    The names are clustered with ``AffinityPropagation`` on a precomputed
    similarity matrix (negated Levenshtein distances). A query is matched to
    the cluster whose head is closest, and that cluster's members are ranked
    by their distance to the query.
    """

    def __init__(self, collection_names):
        """Store the names to cluster; nothing is fitted until ``train``.

        Args:
            collection_names: iterable of collection-name strings.
        """
        self.collection_names = np.array(collection_names)
        self.lev_similarity = []
        self.model = None

    def train(self, damping=0.6, max_iter=400):
        """Build the similarity matrix and fit the clustering model.

        Args:
            damping: forwarded to ``AffinityPropagation``.
            max_iter: forwarded to ``AffinityPropagation``.
        """
        print("Computing Levenshtien Matrix")
        names = self.collection_names
        count = len(names)
        # Levenshtein distance is symmetric and zero on the diagonal, so only
        # one triangle is computed and then mirrored.
        edits = np.zeros((count, count), dtype=int)
        for row in range(count):
            for col in range(row + 1, count):
                edits[row, col] = edits[col, row] = distance.levenshtein(names[row], names[col])
        # Negate: the clustering expects similarities, not distances.
        self.lev_similarity = -1 * edits
        print("Starting Model")
        self.model = AffinityPropagation(affinity="precomputed", damping=damping, max_iter=max_iter, verbose=True)
        # Fit on this instance's matrix, not on a same-named notebook global.
        self.model.fit(self.lev_similarity)

    def predict(self, word, top_n=6):
        """Return the names most similar to ``word`` from its nearest cluster.

        Args:
            word: query string.
            top_n: maximum number of results to return (default 6).

        Returns:
            List of ``(name, levenshtein_distance)`` tuples sorted by ascending
            distance, at most ``top_n`` long.

        Raises:
            RuntimeError: if called before ``train``.
        """
        if self.model is None:
            raise RuntimeError("Prediction_Model.predict() called before train()")
        labels = self.model.labels_
        cluster_ids = np.unique(labels)
        # Edit distance from the query to the head of every cluster.
        head_edits = [
            distance.levenshtein(self.collection_names[self.model.cluster_centers_indices_[cluster_id]], word)
            for cluster_id in cluster_ids
        ]
        # First minimum wins on ties.
        nearest = min(range(len(head_edits)), key=head_edits.__getitem__)
        members = np.unique(self.collection_names[labels == cluster_ids[nearest]])
        ranked = sorted(
            ((str(name), distance.levenshtein(name, word)) for name in members),
            key=lambda pair: pair[1],
        )
        return ranked[:top_n]

    def __str__(self):
        """Return one line per cluster: position, head, and unique members."""
        if self.model is None:
            return "Prediction_Model(untrained, %d names)" % len(self.collection_names)
        lines = []
        for i, cluster_id in enumerate(np.unique(self.model.labels_)):
            cluster_head = self.collection_names[self.model.cluster_centers_indices_[cluster_id]]
            cluster = np.unique(self.collection_names[self.model.labels_ == cluster_id])
            lines.append("cluster %i - *%s:* %s" % (i, cluster_head, ", ".join(cluster)))
        return "".join(line + "\n" for line in lines)

In [72]:
# Wrap the collection names; no clustering is fitted until train() is called.
model = Prediction_Model(collection_names)

In [73]:
# Compute the Levenshtein matrix and fit AffinityPropagation (progress is printed).
model.train()

Computing Levenshtien Matrix
Starting Model
Converged after 105 iterations.


In [74]:
# Persist the trained model to the file 'collection_model'.
with open('collection_model', 'wb') as model_file:
    pickle.dump(model, model_file)

In [24]:
# Reload the pickled model and query it.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
# NOTE(review): the recorded output for this cell is a TypeError from append(), but
# Prediction_Model.predict as defined in this notebook appends a single tuple, so the
# output presumably comes from an older class definition. Re-run to confirm.
with open('collection_model', 'rb') as f:
    y = pickle.load(f)
    
y.predict("ape-age")

TypeError: append() takes exactly one argument (2 given)

In [16]:
# Members of one cluster, ranked by edit distance to the query "ape-age".
res = ['animalpunk-gang', 'ape-gang', 'ape-nfts', 'ape-yacht-1',
       'apesofneon', 'bango-man', 'caredino', 'casarugface',
       'fanggangnft', 'framergence', 'grillzgang', 'hapegang',
       'happy-2022', 'hok-main', 'latte-lounge', 'ntf-bag',
       'rare-cranium-0001', 'saibagangs', 'saitamainu', 'the-cow-gang',
       'the-wild-apes-gang', 'toucan-gang']

# Pair every name with its distance, then order by that distance (stable sort).
res = sorted(
    [(name, distance.levenshtein(name, "ape-age")) for name in res],
    key=lambda pair: pair[1],
)

In [18]:
# Show the six entries with the smallest edit distance to the query.
res[:6]

[('ape-gang', 3),
 ('ape-nfts', 4),
 ('hapegang', 4),
 ('ntf-bag', 5),
 ('ape-yacht-1', 6),
 ('apesofneon', 6)]