In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial import Voronoi
from scipy.stats import wilcoxon, ranksums
from shapely.geometry import Point, LineString
from shapely.ops import polygonize
from sklearn.cluster import KMeans
import numpy as np
from scipy.stats import wilcoxon
import matplotlib.pyplot as plt
import itertools

In [2]:
freq_frame = pd.read_csv(
    "/src/data/procesados/clean/TWITTER_COLOMBIA_FREQ_CLEAN.csv", 
    encoding='utf-8',
    index_col=0,
    sep ="\t",
    decimal=",",
    quotechar='"'
)

hsics_filter = pd.read_csv(
        "/src/data/procesados/HSIC/HSIC_PVALUE.csv", 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
)

hsics_filter=hsics_filter[hsics_filter.PValue<0.05].index

In [3]:
def filter_words_frame_file(hsic_fil, freq_frame, destino):
  
    words_to_drop = list(set(list(freq_frame.index))-set(hsic_fil))
    freq_frame.drop(labels=words_to_drop, inplace=True)
    vocabulary_size =  pd.DataFrame(freq_frame.sum(), columns=["#VocabularySize#"]).transpose()
    freq_frame=pd.concat([vocabulary_size, freq_frame])
    total = pd.DataFrame(freq_frame.transpose().apply(np.sum), columns=["#TOTALES_COLOMBIA#"])
    total=total.join(freq_frame)
    total=total.iloc[0:]/total.iloc[0]
    total.drop(labels="#VocabularySize#", inplace=True)
    total.to_csv(destino, sep="\t",decimal=",",header=total.columns)
    return total

In [46]:
filter_words=filter_words_frame_file(hsics_filter, freq_frame, "/src/data/procesados/filtered/pvalue005.csv")

In [6]:
def intersect_and_renormalize_vocabularies(origin, city1, city2):
    intersection = origin[[city1,city2]]
    #print(intersection.loc["guindando","Bogotá"])
    intersection = intersection.loc[(intersection != 0).all(1)]
    intersection = intersection.loc[(intersection != 0).any(1)]
    #print(intersection.loc["guindando","Bogotá"])
    vocabulary_size =  pd.DataFrame(intersection.sum(), columns=["#VocabularySize#"]).transpose()
    intersection=pd.concat([vocabulary_size, intersection])
    intersection=intersection.iloc[0:]/intersection.iloc[0]
    intersection.drop(labels="#VocabularySize#", inplace=True)
    return(intersection)

In [5]:
def smallest_vocabulary(origin, city1, city2):
    city1_frame = origin[[city1]]
    city1_frame = city1_frame.loc[(city1_frame != 0).all(1)]
    city1_len = len(city1_frame)
    city2_frame = origin[[city2]]
    city2_frame = city2_frame.loc[(city2_frame != 0).all(1)]
    city2_len = len(city2_frame)
    words_to_drop = []
    if city1_len < city2_len:
        words_to_drop = origin.index-city1_frame.index
    if city1_len > city2_len:
        words_to_drop = origin.index-city2_frame.index
    smallest=intersection = origin[[city1,city2]].drop(labels=words_to_drop)
    #print(intersection.loc["guindando","Bogotá"])
    vocabulary_size =  pd.DataFrame(smallest.sum(), columns=["#VocabularySize#"]).transpose()
    smallest=pd.concat([vocabulary_size, smallest])
    smallest=smallest.iloc[0:]/smallest.iloc[0]
    smallest.drop(labels="#VocabularySize#", inplace=True)
    return(smallest)


In [6]:
def cities_wilcoxon_and_dist(frame,method="wilcox"):
    ranking=frame.rank(method="average")
    groupa=ranking.iloc[:,0].mean()
    groupb=ranking.iloc[:,1].mean()
    groupc=(groupb+groupa)/4
    freq_dist = np.square(frame.iloc[:,0]-frame.iloc[:,1])
    rank_dist = np.square(ranking.iloc[:,0]-ranking.iloc[:,1])
    distancia = np.sqrt(freq_dist+rank_dist).sum()/((len(ranking)-1)*groupc)
    if method=="wilcox":
        n = len(freq_dist.loc[freq_dist!=0])
    elif method == "pratt" or method == "zsplit":
        n = len(freq_dist)
    T, pvalue = wilcoxon(frame.iloc[:,0],frame.iloc[:,1],zero_method=method)
    s = n*(n+1)/2
    remaining_ranksum = s-T
    proportion1 = T/s
    proportion2 = remaining_ranksum/s
    nwil = abs(proportion1-proportion2)
    
    return(distancia, nwil, T, pvalue)    

In [10]:
def wilcoxon_signedrank_matrix(origin, method="wilcox", normalize=False, v_method="All"):
    
    wilcoxon_dataframe = pd.DataFrame(columns=origin.columns, dtype=np.int32)
    pvalue_dataframe = pd.DataFrame(columns=origin.columns, dtype=np.float64)
    norm_wilcoxon_dataframe = pd.DataFrame(columns=origin.columns, dtype=np.float64)
    dist_dataframe = pd.DataFrame(columns=origin.columns, dtype=np.float64)
        
    if v_method=="all":
        destination_folder="/src/data/procesados/wilcoxon_signed/allcorpus/{}/".format(method)
       
    if normalize==True:
        destination_folder="/src/data/procesados/wilcoxon_signed/normalized_allcorpus/{}/".format(method)
        # renormalizar
        totales = pd.DataFrame(origin.apply(np.sum), columns=["#TOTALES#"]).transpose()
        origin = pd.concat([totales, origin])
        origin = origin.iloc[0:]/origin.iloc[0]
        origin.drop(labels=["#TOTALES#"], inplace=True)
    
    for column1 in origin:
        for column2 in origin:
            if v_method=="Intersected":
                destination_folder="/src/data/procesados/wilcoxon_signed/intersected_corpus/{}/".format(method)
                frame=intersect_and_renormalize_vocabularies(origin, column1, column2)
            if v_method=="Smallest":
                destination_folder="/src/data/procesados/wilcoxon_signed/smallest_corpus/{}/".format(method)
                frame=smallest_vocabulary(origin, column1, column2)
            elif v_method=="All":
                frame = origin[[column1,column2]]
            distancia, nwil, T, pvalue = cities_wilcoxon_and_dist(frame,method=method)
            dist_dataframe.loc[column1, column2]=distancia
            pvalue_dataframe.loc[column1, column2]=pvalue
            wilcoxon_dataframe.loc[column1, column2]=T
            norm_wilcoxon_dataframe.loc[column1, column2]=nwil
            print('\r{}:{} w:{} p:{}'.format(column1,column2,T,pvalue), end="\t\t")
    
    wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'stat.csv'),sep="\t",decimal=",",header=wilcoxon_dataframe.columns)
    pvalue_dataframe.to_csv('{}{}'.format(destination_folder,'pvalue.csv'),sep="\t",decimal=",",header=pvalue_dataframe.columns)
    norm_wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'nstat.csv'),sep="\t",decimal=",",header=norm_wilcoxon_dataframe.columns)
    dist_dataframe.to_csv('{}{}'.format(destination_folder,'distancia.csv'),sep="\t",decimal=",",header=dist_dataframe.columns)       

In [15]:
wilcoxon_signedrank_matrix(filter_words, method="wilcox", normalize=True)
wilcoxon_signedrank_matrix(filter_words, method="pratt", normalize=True)
wilcoxon_signedrank_matrix(filter_words, method="zsplit", normalize=True)
print("\nListo All Corpus Normalizado")
wilcoxon_signedrank_matrix(filter_words, method="wilcox", v_method="Intersected")
wilcoxon_signedrank_matrix(filter_words, method="pratt",  v_method="Intersected")
wilcoxon_signedrank_matrix(filter_words, method="zsplit", v_method="Intersected")
print("\nListo All Corpus Intersectado")
wilcoxon_signedrank_matrix(filter_words, method="wilcox", v_method="Smallest")
wilcoxon_signedrank_matrix(filter_words, method="pratt",  v_method="Smallest")
wilcoxon_signedrank_matrix(filter_words, method="zsplit", v_method="Smallest")



Zipaquirá:Zipaquirá w:63508945.5 p:1.0		765020886825		73e-96		-72		08								10										
Listo All Corpus Normalizado
Zipaquirá:Zipaquirá w:47227820.0 p:1.0		1959353053e-45			-47		22		136		-90		36		5		
Listo All Corpus Intersectado
#TOTALES_COLOMBIA#:Apartadó w:16471280.0 p:3.300000093129474e-53		e-66		



Acevedo:Agustín Codazi w:4591610.0 p:2.923128117832626e-57		-113				276e-05		106		



Zipaquirá:Zipaquirá w:63508945.5 p:1.0		274234793e-75		9		-37		-46		5			7		-186													

In [144]:
def filter_words_frame_file(hsic_fil, totals ,rel_freq_file, destino):
    words_data = pd.read_csv(
        rel_freq_file, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    
    words_to_drop = list(set(list(words_data.index))-set(hsic_fil))
    words_data.drop(labels=words_to_drop, inplace=True)
    words_to_drop = list(set(list(totals.index))-set(hsic_fil))
    totals.drop(labels=words_to_drop, inplace=True)
    
    frame = [totals, words_data]
    words_data = pd.concat(frame, axis=1)
    
    words_data.to_csv(destino, sep="\t",decimal=",",header=words_data.columns)  

In [323]:
def cities_to_join(origin):
    join_cities=[]
    results = origin.loc[origin["SmWilcoxP"]>0.05]
    join_df = origin
    count=0
    for row in results.iterrows():
        join_cities.append([row[1]["Ciudad1"], row[1]["Ciudad2"]])
        count+=1
    LL = set(itertools.chain.from_iterable(join_cities)) 
    for a in LL:
        components = [x for x in join_cities if a in x]
        for i in components:
            join_cities.remove(i)
        join_cities += [list(set(itertools.chain.from_iterable(components)))]  
    
    rows_to_change=[]
    
    for element in join_cities:
        new_index = '-'.join(element)
        for row in join_df.iterrows():
            if row[1]["Ciudad1"] in element:  
                rows_to_change.append([row[0],"Ciudad1", new_index])
            if row[1]["Ciudad2"] in element:    
                rows_to_change.append([row[0],"Ciudad2", new_index])
    
    for element in rows_to_change:
        join_df.loc[element[0],element[1]]=element[2]
  
    return join_cities, join_df

In [324]:
def join_dialects(array, origin):
    new_origin = origin
    for element in array:
        new_index = '-'.join(element)
        new_origin[new_index]=origin.loc[:,element].sum(axis=1)/len(element)    
            new_origin.drop(labels=element,axis=1, inplace=True)
    return(new_origin)

IndentationError: unexpected indent (<ipython-input-324-926b2d2b090b>, line 6)

In [329]:
origin = pd.read_csv(
        "/src/data/procesados/filtered/pvalue005.csv",
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )

pv=pd.read_csv(
        "/src/data/procesados/wilcoxon_signed/smallest_corpus/wilcox/pvalue.csv",
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )

dist=gpd.read_file("/src/data/procesados/geo/Ecotones/ecotones.shp", encoding = 'utf-8')
def get_dialects(pvalue_file, freq_origin, iterations=100):
    freq_origin=freq_origin.drop("#TOTALES_COLOMBIA#",axis=1)
    results = origin.loc[origin["SmWilcoxP"]>0.05]
    for i in range(0,iterations):        
        count=0
        cjoin, new_cities=cities_to_join(cities_origin)
        dialects=join_dialects(cjoin, freq_origin)
        for row in cities_origin.iterrows():
            frame=smallest_vocabulary(dialects, row[1]["Ciudad1"], row[1]["Ciudad2"])
            distancia, nwil, T, pvalue = cities_wilcoxon_and_dist(frame,method="wilcox")
            if pvalue > 0.05:
                print("debe reducir un orden")
            new_cities.loc[count,"Ciudad1"]=row[1]["Ciudad1"]
            new_cities.loc[count,"Ciudad2"]=row[1]["Ciudad2"]
            new_cities.loc[count,"SmWilcoxP"]=pvalue
            count+=1
            print("\r{} {}".format(i, row[1]["Ciudad1"], row[1]["Ciudad2"], freq_origin.shape, dialects.shape), end="\t\t\t")
        freq_origin = dialects
        cities_origin = new_cities
    return(cities_origin)
get_dialects(pv, origin, iterations=1)



0 Granada			ra-Tuluá-Buga			



0 Paipa			debe reducir un ordencordia			
0 Ríohacha			debe reducir un orden	ia			
0 Ríohacha			debe reducir un orden
0 Nueva Granada			r			atapé-Concordia			

Unnamed: 0,Ciudad1,Ciudad2,AllPrattW,AllPrattPV,AllWilcoxW,AllWilcoxP,AllZsplitW,AllZsplitP,AllDist,IntPrattW,...,IntZsplitP,IntDist,SmPrattW,SmPrattP,SmWilcoxW,SmWilcoxP,SmZsplitW,SmZsplitP,SmDist,geometry
0,Ipiales,Sibundoy,0.17000,0.00000,0.10477,0.00000,0.14199,0.00000,0.60613,0.17488,...,0.00000,0.50595,0.21397,0.00000,0.21397,1.682491e-59,0.21397,0.00000,0.51431,LINESTRING (-77.12278071650792 0.6620322038305...
1,Ipiales,Mocoa,0.19796,0.00000,,0.81647,0.02724,0.00265,0.59416,0.09489,...,0.00000,0.55892,0.28742,0.00000,0.28742,1.201283e-90,0.28742,0.00000,0.59501,LINESTRING (-76.9325962321071 0.26368502881286...
2,Sibundoy,Mocoa,0.20717,0.00000,0.11407,0.00000,0.17066,0.00000,0.59993,0.25869,...,0.00000,0.50477,0.26337,0.00000,0.26337,2.925895e-76,0.26337,0.00000,0.50502,LINESTRING (-76.95290070573137 0.3267471060884...
3,Mocoa,Florencia,0.21308,0.00000,0.07381,0.00000,0.12940,0.00000,0.69578,0.18851,...,0.00000,0.55087,0.27853,0.00000,0.27853,4.196478e-85,0.27853,0.00000,0.56581,LINESTRING (-75.52466168525021 -0.003966609926...
4,Florencia,Leticia,0.45863,0.00000,0.28345,0.00000,0.35546,0.00000,0.79083,0.23942,...,0.00000,0.69729,0.28427,0.00000,0.28427,6.823613e-51,0.28427,0.00000,0.70666,LINESTRING (-72.97478698898742 -1.480394941055...
5,Mocoa,Acevedo,0.21766,0.00000,0.08277,0.00000,0.12509,0.00000,0.69730,0.18210,...,0.00000,0.54987,0.28942,0.00000,0.28942,9.205518e-92,0.28942,0.00000,0.55145,LINESTRING (-76.06601820636112 1.2305563504966...
6,Acevedo,Florencia,0.06971,0.00000,0.02318,0.02448,0.02430,0.00779,0.57265,0.03770,...,0.00318,0.50180,0.20340,0.00000,0.20340,1.118511e-70,0.20340,0.00000,0.54428,LINESTRING (-75.6235900983002 1.90504973222946...
7,Mocoa,Pitalito,0.20259,0.00000,0.06033,0.00000,0.10821,0.00000,0.69797,0.14859,...,0.00000,0.55036,0.25820,0.00000,0.25820,2.012262e-73,0.25820,0.00000,0.55387,LINESTRING (-76.12502283606987 1.2994932086653...
8,Pitalito,Acevedo,0.43939,0.00000,0.24646,0.00000,0.32326,0.00000,0.12511,0.28452,...,0.00000,0.15589,0.31512,0.00000,0.31512,1.576845e-166,0.31512,0.00000,0.18444,LINESTRING (-75.98271449207347 1.8728658026381...
9,San Agustín,Pitalito,0.44601,0.00000,0.24553,0.00000,0.32751,0.00000,0.10833,0.28945,...,0.00000,0.13646,0.30703,0.00000,0.30703,2.352003e-158,0.30703,0.00000,0.14754,LINESTRING (-76.18253837842444 1.8483475064883...


In [316]:
x=[['Buenaventura', 'Buga'],
 ['Buga', 'Tuluá'],
 ['Sogamoso', 'Yopal'],
 ['Rionegro', 'Guatapé'],
 ['Concordia', 'Medellín'],
 ['Medellín', 'Rionegro'],
 ['Cereté', 'Montería'],
 ['Fonseca', 'Valledupar'],
 ['Santa Marta', 'Fonseca']]

In [322]:
'Buenaventura' in ['Buenaventura', 'Buga', 'Medellín', 'Rionegro']

True