In [1]:
import scipy 
import numpy as np
import strsimpy 
from Bio import pairwise2
from Bio.Seq import Seq 
from Bio.pairwise2 import format_alignment 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import dbscan
from strsimpy.levenshtein import Levenshtein
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.longest_common_subsequence import LongestCommonSubsequence
from strsimpy.metric_lcs import MetricLCS
from strsimpy.ngram import NGram
from strsimpy.sorensen_dice import SorensenDice
from strsimpy import SIFT4
from sklearn.metrics.cluster import homogeneity_score
from sklearn.metrics.cluster import completeness_score 
from sklearn.metrics.cluster import v_measure_score




In [2]:
levenshtein = Levenshtein()
normalized_levenshtein = NormalizedLevenshtein()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
fourgram = NGram(4)
dice = SorensenDice(2)
s = SIFT4()
jarowinkler = JaroWinkler()

In [3]:

tagged_df = pd.read_csv (r'/home/gelaw/work-stuff/gocode/src/registry-experimental/consistency/rpc/google/cloud/apigeeregistry/v1/similarity/algorithms /vocab1000.csv')
tagged_df = tagged_df.drop(tagged_df.index[1000:])
word_labels = tagged_df.iloc[:, 0]
word_labels = word_labels.to_numpy()
tagged_words = tagged_df.iloc[:, 1]
tagged_words = tagged_words.to_numpy()

In [4]:
def extract_indices_lv(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return levenshtein.distance(data[i], data[j])
def extract_indices_nlv(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return normalized_levenshtein.distance(data[i], data[j])
def extract_indices_jarowinkler(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return jarowinkler.distance(data[i], data[j])
def extract_indices_lcs(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return lcs.distance(data[i], data[j])
def extract_indices_dice(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return dice.distance(data[i], data[j])
def extract_indices_sift4(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return s.distance(data[i], data[j])  

In [5]:
def compute_predicted_lables(data, algorithm, dbscan_eps, dbscan_min_samples):
    db = dbscan(data, metric=algorithm, eps=dbscan_eps, min_samples=dbscan_min_samples, algorithm='brute')
    return db[1]

In [6]:
data = tagged_words
data = np.arange(len(data)).reshape(-1, 1)

In [7]:

tagged_df = pd.read_csv (r' vocab1000.csv path ')
tagged_df = tagged_df.drop(tagged_df.index[1000:])
word_labels = tagged_df.iloc[:, 0]
word_labels = word_labels.to_numpy()
tagged_words = tagged_df.iloc[:, 1]
tagged_words = tagged_words.to_numpy()
data = tagged_words
X = np.arange(len(data)).reshape(-1, 1)


In [8]:
dbscan_eps_values =  [.1, .2, .3, .4, .5, .6, .7, .8, .9]
dbscan_min_samples = [2, 3, 4, 5, 6, 7, 8, 9]
best_eps = -1
best_min_value = -1
best_score = 0 
for i in dbscan_eps_values:
    for j in dbscan_min_samples:
        lables = compute_predicted_lables(data = X, algorithm = extract_indices_dice, dbscan_eps = i, dbscan_min_samples = j)
        current_score = v_measure_score(word_labels, lables)
        if (current_score > best_score):
            best_score = current_score
            best_eps = i
            best_min_value = j
        #print(i, j, current_score)

In [9]:
print(best_eps, best_min_value, best_score)
optimals = {}
optimals[dice]= best_eps, best_min_value, best_score

0.3 2 0.8887501840948729


In [10]:
best_eps = -1
best_min_value = -1
best_score = 0 
for i in dbscan_eps_values:
    for j in dbscan_min_samples:
        lables = compute_predicted_lables(data = X, algorithm = extract_indices_jarowinkler, dbscan_eps = i, dbscan_min_samples = j)
        current_score = v_measure_score(word_labels, lables)
        if (current_score > best_score):
            best_score = current_score
            best_eps = i
            best_min_value = j
        #print(i, j, current_score)

In [11]:
print(best_eps, best_min_value, best_score)
optimals[jarowinkler]= best_eps, best_min_value, best_score

0.1 2 0.8301625497707991


In [12]:
best_eps = -1
best_min_value = -1
best_score = 0 
for i in dbscan_eps_values:
    for j in dbscan_min_samples:
        lables = compute_predicted_lables(data = X, algorithm = extract_indices_sift4, dbscan_eps = i, dbscan_min_samples = j)
        current_score = v_measure_score(word_labels, lables)
        if (current_score > best_score):
            best_score = current_score
            best_eps = i
            best_min_value = j
        #print(i, j, current_score)

In [13]:
print(best_eps, best_min_value, best_score)
optimals[s]= best_eps, best_min_value, best_score

-1 -1 0


In [None]:
best_eps = -1
best_min_value = -1
best_score = 0 
for i in dbscan_eps_values:
    for j in dbscan_min_samples:
        lables = compute_predicted_lables (data = X, algorithm = extract_indices_lcs, dbscan_eps = i, dbscan_min_samples = j)
        current_score = v_measure_score(word_labels, lables)
        if (current_score > best_score):
            best_score = current_score
            best_eps = i
            best_min_value = j
        #print(i, j, current_score)

In [None]:
print(best_eps, best_min_value, best_score)
optimals[lcs]= best_eps, best_min_value, best_score

-1 -1 0


In [None]:
best_eps = -1
best_min_value = -1
best_score = 0 
def extract_indices_fourgram(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return fourgram.distance(data[i], data[j])  

for i in dbscan_eps_values:
    for j in dbscan_min_samples:
        lables = compute_predicted_lables(data = X, algorithm = extract_indices_fourgram, dbscan_eps = i, dbscan_min_samples = j)
        current_score = v_measure_score(word_labels, lables)
        if (current_score > best_score):
            best_score = current_score
            best_eps = i
            best_min_value = j
        #print(i, j, current_score)

In [None]:
print(best_eps, best_min_value, best_score)
optimals[fourgram]= best_eps, best_min_value, best_score

-1 -1 0


In [None]:
dbscan_eps_values =  [.1, .2, .3, .4, .5, .6, .7, .8, .9]
dbscan_min_samples = [2, 3, 4, 5, 6, 7, 8, 9]
best_eps = -1
best_min_value = -1
best_score = 0 
for i in dbscan_eps_values:
    for j in dbscan_min_samples:
        lables = compute_predicted_lables(data = X, algorithm = extract_indices_nlv, dbscan_eps = i, dbscan_min_samples = j)
        current_score = v_measure_score(word_labels, lables)
        if (current_score > best_score):
            best_score = current_score
            best_eps = i
            best_min_value = j
        #print(i, j, current_score)

In [None]:
print(best_eps, best_min_value, best_score)
optimals[normalized_levenshtein]= best_eps, best_min_value, best_score

-1 -1 0


In [None]:
import csv
with open('final_values.csv', 'w') as f:
    for key in optimals.keys():
        f.write("%s,%s\n"%(key,optimals[key]))