In [191]:
#!/usr/bin/env python
"""
The Needleman-Wunsch Algorithm
==============================
This is a dynamic programming algorithm for finding the optimal alignment of
two strings.
Example
-------
    >>> x = "GATTACA"
    >>> y = "GCATGCU"
    >>> print(nw(x, y))
    G-ATTACA
    GCA-TGCU
LICENSE
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>
"""

import numpy as np

def nw(x, y, match = 1, mismatch = 1, gap = 1):
    nx = len(x)
    ny = len(y)
    # Optimal score at each possible pair of characters.
    F = np.zeros((nx + 1, ny + 1))
    F[:,0] = np.linspace(0, -nx, nx + 1)
    F[0,:] = np.linspace(0, -ny, ny + 1)
    # Pointers to trace through an optimal aligment.
    P = np.zeros((nx + 1, ny + 1))
    P[:,0] = 3
    P[0,:] = 4
    # Temporary scores.
    t = np.zeros(3)
    for i in range(nx):
        for j in range(ny):
            if x[i] == y[j]:
                t[0] = F[i,j] + match
            else:
                t[0] = F[i,j] - mismatch
            t[1] = F[i,j+1] - gap
            t[2] = F[i+1,j] - gap
            tmax = np.max(t)
            F[i+1,j+1] = tmax
            if t[0] == tmax:
                P[i+1,j+1] += 2
            if t[1] == tmax:
                P[i+1,j+1] += 3
            if t[2] == tmax:
                P[i+1,j+1] += 4
    # Trace through an optimal alignment.
    i = nx
    j = ny
    rx = []
    ry = []
    while i > 0 or j > 0:
        if P[i,j] in [2, 5, 6, 9]:
            rx.append(x[i-1])
            ry.append(y[j-1])
            i -= 1
            j -= 1
        elif P[i,j] in [3, 5, 7, 9]:
            rx.append(x[i-1])
            ry.append('-')
            i -= 1
        elif P[i,j] in [4, 6, 7, 9]:
            rx.append('-')
            ry.append(y[j-1])
            j -= 1
    # Reverse the strings.
    rx = ''.join(rx)[::-1]
    ry = ''.join(ry)[::-1]
    return '\n'.join([rx, ry])

In [192]:
import pandas as pd

datos = pd.read_csv('dataframe.csv',sep=',')

In [193]:
datos

Unnamed: 0.1,Unnamed: 0,LanguageId,LanguageName,Latitude,Longitude,WordId,WordModernName1,Phonetic,SpellingAltv1,source_csv
0,0,28001000000,Ref. Spelling,-40.00000,-75.50000,1070,ellos,fëjëŋɘn,feyengün,12-feyengün.csv
1,1,28131006609,Santa Bárbara,-37.67405,-71.80186,1070,ellos,ˌvë.jë.ˈŋɘn,,12-feyengün.csv
2,2,28131007109,Cañete,-37.96722,-73.39282,1070,ellos,cʰɪ.ˈðwɪ.ŋɘn,,12-feyengün.csv
3,3,28131009109,Tirúa,-38.36990,-73.49067,1070,ellos,ˈɸë.jë.ɣ̞ɐ̝,,12-feyengün.csv
4,4,28131009809,Alto Bío Bío,-38.04457,-71.36344,1070,ellos,m̩.ˌvë.jë.ˈŋɘn,,12-feyengün.csv
...,...,...,...,...,...,...,...,...,...,...
1336,1336,28261007209,Huiliches,-39.63992,-71.18686,20610,vino,pʊl.ˈkʊ,,9-pulkü.csv
1337,1337,28271006509,Jacobacci,-41.28813,-69.55719,20610,vino,ɸʊl.ˈkʰʊ,,9-pulkü.csv
1338,1338,28281000209,Cushamen,-42.23392,-71.34994,20610,vino,pʰʊl.ˈkʰʊ,,9-pulkü.csv
1339,1339,28281002209,Futaleufú,-42.95052,-71.18351,20610,vino,ɸʊl.ˈxʊ,,9-pulkü.csv


In [194]:
names = set(datos['LanguageName'])

In [195]:
names

{'Alto Bío Bío',
 'Aluminé',
 'Angol',
 'Cañete',
 'Chalileo',
 'Chol Chol',
 'Cunco',
 'Curarrehue',
 'Cushamen',
 'Dollinco',
 'Ercilla',
 'Freire',
 'Futaleufú',
 'Galvarino',
 'Huiliches',
 'Icalma',
 'Jacobacci',
 'Junín de los Andes',
 'Lago Rosario',
 'Lanco',
 'Lonquimay',
 'Lumaco',
 'Mariquina',
 'Nueva Toltén',
 'Panguipulli',
 'Picunches',
 'Puerto Saavedra',
 'Ref. Spelling',
 'S. Juan de la Costa',
 'San Pablo',
 'Santa Bárbara',
 'Tirúa',
 'Truf Truf',
 'Valdivia',
 'Victoria',
 'Vilcún',
 'Villarrica',
 'Zapala'}

In [196]:
palabras = set(datos['WordModernName1'])

In [197]:
palabras

{'amigo',
 'bajar',
 'boca',
 'bueno',
 'cama',
 'camino',
 'campo',
 'cochayuyo',
 'cola',
 'coser',
 'cuerno',
 'dia',
 'ellos',
 'estrella',
 'fuego',
 'grande',
 'harina_tostada',
 'hoja',
 'lengua',
 'lenya',
 'llorar',
 'luna',
 'mano',
 'matar',
 'negro',
 'noble',
 'nombre',
 'nube',
 'persona_que_ensenya',
 'pie',
 'puerta',
 'rojo',
 'saber',
 'sangre',
 'sol',
 'tengo_hambre',
 'tres',
 'vestido_lana',
 'vino',
 'zorro'}

In [274]:
palabras_elegidas = ['tres'] ## puedes agregar más

In [275]:
dict_datos = {L:{} for L in names}

In [276]:
for L in names:
    for word in palabras_elegidas:
        D = datos[datos['LanguageName']==L]
        try:
            dict_datos[L][word]=list(D[D['WordModernName1']==word]['Phonetic'])[0]
        except IndexError:
            pass#dict_datos[L][word]='unk'

In [277]:
set_palabras = []

for L in names:
    set_palabras+=[list(dict_datos[L].keys())]

In [278]:
set_palabras = list(set.intersection(*map(set,set_palabras)))

In [279]:
set_palabras

['tres']

In [280]:
dict_datos = {name:{word:dict_datos[name][word] for word in set_palabras} for name in names}

In [291]:
dict_datos['San Pablo']

{'tres': 'ˈkʰɘ.lɐ̝'}

In [282]:
dict_datos['S. Juan de la Costa']

{'tres': 'ˈxɘ.lə'}

In [283]:
print(nw('f̞ë.jë.ˈŋɘn','ˈfëj.ŋɘn'))

-f̞ë.jë.ˈŋɘn
ˈf-ë-j--.-ŋɘn


In [284]:
import itertools
pares_localidades = list(itertools.product(list(dict_datos.keys()), list(dict_datos.keys())))

In [285]:
pares_localidades

[('Icalma', 'Icalma'),
 ('Icalma', 'Huiliches'),
 ('Icalma', 'Villarrica'),
 ('Icalma', 'Ercilla'),
 ('Icalma', 'Chol Chol'),
 ('Icalma', 'Cañete'),
 ('Icalma', 'Cunco'),
 ('Icalma', 'Ref. Spelling'),
 ('Icalma', 'Chalileo'),
 ('Icalma', 'Galvarino'),
 ('Icalma', 'Puerto Saavedra'),
 ('Icalma', 'Junín de los Andes'),
 ('Icalma', 'Panguipulli'),
 ('Icalma', 'Freire'),
 ('Icalma', 'Alto Bío Bío'),
 ('Icalma', 'Dollinco'),
 ('Icalma', 'Nueva Toltén'),
 ('Icalma', 'Victoria'),
 ('Icalma', 'Lago Rosario'),
 ('Icalma', 'Zapala'),
 ('Icalma', 'Santa Bárbara'),
 ('Icalma', 'Mariquina'),
 ('Icalma', 'Tirúa'),
 ('Icalma', 'S. Juan de la Costa'),
 ('Icalma', 'Lumaco'),
 ('Icalma', 'Vilcún'),
 ('Icalma', 'Valdivia'),
 ('Icalma', 'Futaleufú'),
 ('Icalma', 'Curarrehue'),
 ('Icalma', 'Jacobacci'),
 ('Icalma', 'Picunches'),
 ('Icalma', 'Lanco'),
 ('Icalma', 'Aluminé'),
 ('Icalma', 'Lonquimay'),
 ('Icalma', 'Angol'),
 ('Icalma', 'Cushamen'),
 ('Icalma', 'Truf Truf'),
 ('Icalma', 'San Pablo'),
 ('Huilic

In [286]:
## aquí guardas las distancias

distancias = {L:{LL:0 for LL in list(dict_datos.keys())} for L in list(dict_datos.keys())}

In [287]:
!pip install jellyfish



In [288]:
import jellyfish

def funcion_distancia(string1,string2):
    #return 1-jellyfish.jaro_winkler_similarity(string1,string2)
    alignment = nw(string1,string2).split('\n')
    string1 = alignment[0]
    string2 = alignment[1]
    return jellyfish.levenshtein_distance(string1,string2)/(max(len(string1),len(string2)))

In [289]:
## tienes q recorrer pares_localidades

for par in pares_localidades:
    D = 0
    words1 = dict_datos[par[0]]
    words2 = dict_datos[par[1]]
    words1and2 = list(set(words1) & set(words2))
    for word in words1and2:
        D += funcion_distancia(dict_datos[par[0]][word],dict_datos[par[1]][word])
    distancias[par[0]][par[1]]=D/len(words1and2)

In [292]:
dict(sorted(distancias['Chol Chol'].items(), key=lambda item: item[1]))

{'Chol Chol': 0.0,
 'Cañete': 0.0,
 'Galvarino': 0.0,
 'Puerto Saavedra': 0.0,
 'Panguipulli': 0.0,
 'Freire': 0.0,
 'Victoria': 0.0,
 'Vilcún': 0.0,
 'Valdivia': 0.0,
 'Curarrehue': 0.0,
 'Angol': 0.0,
 'San Pablo': 0.0,
 'Mariquina': 0.125,
 'Tirúa': 0.125,
 'Lumaco': 0.125,
 'Ercilla': 0.2222222222222222,
 'Cunco': 0.2222222222222222,
 'Chalileo': 0.2222222222222222,
 'Lago Rosario': 0.2222222222222222,
 'Santa Bárbara': 0.2222222222222222,
 'Cushamen': 0.2222222222222222,
 'Nueva Toltén': 0.3,
 'Aluminé': 0.3,
 'Alto Bío Bío': 0.3333333333333333,
 'Zapala': 0.3333333333333333,
 'Futaleufú': 0.3333333333333333,
 'Ref. Spelling': 0.375,
 'Lanco': 0.375,
 'Dollinco': 0.4,
 'Jacobacci': 0.4,
 'Truf Truf': 0.4,
 'Lonquimay': 0.4166666666666667,
 'Icalma': 0.4444444444444444,
 'Junín de los Andes': 0.4444444444444444,
 'Picunches': 0.4444444444444444,
 'Huiliches': 0.45454545454545453,
 'Villarrica': 0.45454545454545453,
 'S. Juan de la Costa': 0.5}

In [272]:
DF = pd.DataFrame.from_dict(distancias)

In [273]:
DF

Unnamed: 0,Icalma,Huiliches,Villarrica,Ercilla,Chol Chol,Cañete,Cunco,Ref. Spelling,Chalileo,Galvarino,...,Curarrehue,Jacobacci,Picunches,Lanco,Aluminé,Lonquimay,Angol,Cushamen,Truf Truf,San Pablo
Icalma,0.0,0.131746,0.088095,0.209503,0.149263,0.282261,0.243341,0.397846,0.164768,0.186116,...,0.141446,0.145413,0.188162,0.249278,0.223276,0.226563,0.155471,0.152579,0.165051,0.404932
Huiliches,0.131746,0.0,0.144426,0.235297,0.119032,0.315613,0.228146,0.428231,0.158787,0.213533,...,0.097458,0.151793,0.212369,0.248196,0.193821,0.268282,0.199956,0.169643,0.173024,0.423851
Villarrica,0.088095,0.144426,0.0,0.23079,0.153358,0.266059,0.229853,0.391071,0.136111,0.220027,...,0.113969,0.103914,0.22601,0.204095,0.196771,0.242306,0.180427,0.144246,0.178345,0.373624
Ercilla,0.209503,0.235297,0.23079,0.0,0.233936,0.364092,0.287706,0.450894,0.241819,0.253041,...,0.226595,0.270444,0.305393,0.378788,0.319764,0.309955,0.250745,0.241901,0.280762,0.423109
Chol Chol,0.149263,0.119032,0.153358,0.233936,0.0,0.2822,0.1928,0.424972,0.180272,0.183078,...,0.100242,0.148526,0.204772,0.235299,0.190193,0.25839,0.177891,0.141015,0.200368,0.371202
Cañete,0.282261,0.315613,0.266059,0.364092,0.2822,0.0,0.349954,0.418066,0.308542,0.212539,...,0.285498,0.280764,0.360083,0.24708,0.331499,0.245031,0.266758,0.26559,0.2885,0.356539
Cunco,0.243341,0.228146,0.229853,0.287706,0.1928,0.349954,0.0,0.447996,0.19241,0.299263,...,0.216638,0.2036,0.313598,0.297405,0.205584,0.352294,0.244072,0.159694,0.240511,0.35762
Ref. Spelling,0.397846,0.428231,0.391071,0.450894,0.424972,0.418066,0.447996,0.0,0.427664,0.39865,...,0.416239,0.395362,0.46124,0.418592,0.429473,0.458719,0.429705,0.377778,0.402069,0.541695
Chalileo,0.164768,0.158787,0.136111,0.241819,0.180272,0.308542,0.19241,0.427664,0.0,0.237567,...,0.121086,0.146609,0.216989,0.260823,0.223763,0.300425,0.200113,0.167942,0.230697,0.381888
Galvarino,0.186116,0.213533,0.220027,0.253041,0.183078,0.212539,0.299263,0.39865,0.237567,0.0,...,0.207243,0.245692,0.269818,0.252391,0.27157,0.169112,0.16364,0.255612,0.214657,0.29652


In [252]:
DF.to_excel('distancias.xlsx')

In [253]:
## construyamos un heatmap :) https://seaborn.pydata.org/generated/seaborn.heatmap.html

import seaborn as sns
import scipy.spatial as sp, scipy.cluster.hierarchy as hc
import matplotlib.pyplot as plt
import numpy as np

def mapa_calor(D):
    
    
    linkage = hc.linkage(sp.distance.squareform(DF), method='ward')
        
    plt.figure(figsize=(10,10))
    cg=sns.clustermap(D,cmap='OrRd',row_linkage=linkage, col_linkage=linkage,linewidth=1,yticklabels=True,xticklabels=True,cbar_kws={'shrink': 0.3})
       
    plt.rcParams.update({'font.size': 12})
    #plt.savefig('heatmap.jpg', format='jpg', transparent=True, bbox_inches='tight',dpi=800)

In [254]:
mapa_calor(distancias)

ValueError: Distance matrix 'X' must be symmetric.

In [135]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

labels_dict = {}

range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
S = []
for n_clusters in range_n_clusters:
    
    cluster = KMeans(n_clusters=n_clusters)#, linkage='ward')
    distArray = sp.distance.squareform(DF.to_numpy())
    cluster_labels = cluster.fit_predict(DF.values)
    labels_dict[n_clusters]=cluster_labels
    silhouette_avg = silhouette_score(DF.values, cluster_labels, metric = 'precomputed')
    S.append(silhouette_avg)
    print("For n_clusters =", n_clusters,"The average silhouette_score is :", silhouette_avg)

For n_clusters = 2 The average silhouette_score is : 0.2815967199263774
For n_clusters = 3 The average silhouette_score is : 0.21095615185928318
For n_clusters = 4 The average silhouette_score is : 0.1005812906379821
For n_clusters = 5 The average silhouette_score is : 0.16450661554367252
For n_clusters = 6 The average silhouette_score is : 0.08215341895174555
For n_clusters = 7 The average silhouette_score is : 0.024783478257200703
For n_clusters = 8 The average silhouette_score is : 0.1005809334924887
For n_clusters = 9 The average silhouette_score is : 0.060479976243193845
For n_clusters = 10 The average silhouette_score is : 0.06906079185743802
