In [3]:
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon, ranksums
import matplotlib.pyplot as plt

In [8]:
TOT_FREQ_ORIGIN = "/src/data/procesados/clean/TWITTER_COLOMBIA_FREQ_CLEAN.csv"
REL_FREQ_ORIGIN = "/src/data/procesados/clean/TWITTER_COLOMBIA_RELATIVE_FREQ_CLEAN.csv"
CITIES_ORIGIN = "/src/data/procesados/geo/filter_cities_coordinates.csv"
HSIC_ORIGIN = "/src/data/procesados/HSIC/HSIC_PVALUE.csv"
DESTINO = "/src/data/procesados/filtered/Filter20MILData.csv"

In [9]:
def colombia_norm_freq(freq_file):
    tot_freqs = pd.read_csv(
        freq_file, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    ).transpose()
    
    total = pd.DataFrame(tot_freqs.apply(np.sum), columns=["#TOTALES_COLOMBIA#"])
    del tot_freqs
    total_norm = (total.iloc[0:]/total.iloc[0]).drop("#TOTAL_WORDS#")
    return(total_norm)

In [10]:
colombia = colombia_norm_freq(TOT_FREQ_ORIGIN)

In [11]:
def hsic_filter(hsic_filter_file):
    hsics = pd.read_csv(
        hsic_filter_file, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    print(len(hsics))
    hsics=hsics[hsics.PValue<0.05]
    print(len(hsics))
    hsic_fil = hsics.index
    return(hsic_fil)

In [12]:
 #hsic_fil=hsic_filter(HSIC_ORIGIN )

20000
15938


In [8]:
def filter_words_frame_file(hsic_fil, totals ,rel_freq_file, destino):
    words_data = pd.read_csv(
        rel_freq_file, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    
    words_to_drop = list(set(list(words_data.index))-set(hsic_fil))
    words_data.drop(labels=words_to_drop, inplace=True)
    words_to_drop = list(set(list(totals.index))-set(hsic_fil))
    totals.drop(labels=words_to_drop, inplace=True)
    
    frame = [totals, words_data]
    words_data = pd.concat(frame, axis=1)
    
    words_data.to_csv(destino, sep="\t",decimal=",",header=words_data.columns)  

In [85]:
#filter_words_frame_file(hsic_fil, colombia, REL_FREQ_ORIGIN, DESTINO)

In [43]:
def corpus_statistis(origin, pueblo1, pueblo2):
    words_data = pd.read_csv(
        origin, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    pueblo1_frame=pd.DataFrame(words_data[pueblo1])
    pueblo2_frame=pd.DataFrame(words_data[pueblo2])
    pueblo1_frame=pueblo1_frame[getattr(pueblo1_frame,pueblo1) != 0]
    pueblo2_frame=pueblo2_frame[getattr(pueblo2_frame,pueblo2) != 0]
    pueblo1_len = len(pueblo1_frame)
    pueblo2_len = len(pueblo2_frame)
    pueblo1_mean = pueblo1_frame.stack().mean()
    pueblo2_mean = pueblo2_frame.stack().mean()
    pueblo1_median = pueblo1_frame.stack().median()
    pueblo2_median = pueblo2_frame.stack().median()
    return(pueblo1_len, pueblo2_len, pueblo1_mean, pueblo2_mean, pueblo1_median, pueblo2_median)

In [45]:
corpus_statistis(DESTINO, "Medellín", "Bogotá")

(15878,
 15915,
 4.8062218751686614e-05,
 4.8013203563900886e-05,
 2.6491270005748604e-06,
 2.6468636822069587e-06)

In [72]:
def wilcoxon_signedrank_matrix_all_corpus(origin, method="wilcox", normalize=False, zeros=False):
    destination_folder="/src/data/procesados/wilcoxon_signed/allcorpus/{}/".format(method)
    words_data = pd.read_csv(
        origin, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    
    wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.int32)
    pvalue_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    norm_wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    dist_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    
    if normalize==True:
        destination_folder="/src/data/procesados/wilcoxon_signed/normalized_allcorpus/{}/".format(method)
        # renormalizar
        totales = pd.DataFrame(words_data.apply(np.sum), columns=["#TOTALES#"]).transpose()
        words_data = pd.concat([totales, words_data])
        words_data = words_data.iloc[0:]/words_data.iloc[0]
        words_data.drop(labels=["#TOTALES#"], inplace=True)
        
    if zeros:
        destination_folder="{}/{}/{}".format(destination_folder,zeros,method)
                   
    for column1 in words_data:
        for column2 in words_data:
            diferencia = pd.DataFrame(words_data[column1]-words_data[column2], columns=["diferencia"])
            diferencia_prom = np.float64(abs(diferencia.sum())/len(diferencia))
            if zeros:
                pueblo1_len, pueblo2_len, pueblo1_mean, pueblo2_mean, pueblo1_median, pueblo2_median=corpus_statistis(origin, column1, column2)
                if zeros=="maxmedian":      
                    if pueblo1_len > pueblo2_len:
                        words_data[column2]=words_data[column2].replace(0,pueblo1_median)
                    if pueblo1_len < pueblo2_len:
                        words_data[column1]=words_data[column1].replace(0,pueblo2_median)        
                if zeros=="conjmedian":
                    words_data[column2]=words_data[column2].replace(0,pueblo1_median)
                    words_data[column1]=words_data[column1].replace(0,pueblo2_median)
            if method == "wilcox":
                ceros = len(diferencia[diferencia.diferencia == 0])
                n = len(diferencia)-ceros
            elif method == "pratt" or method == "zsplit":
                n = len(words_data[column2])
            # s = n*(n+1)/2 este no hay que buscar porque funciona con 4
            s = n*(n+1)/4
            wil, pvalue = wilcoxon(words_data[column1],words_data[column2],zero_method=method)
            nwil = wil/s
            print('\r{}:{} w:{} p:{}'.format(column1,column2,wil,pvalue), end="\t\t")
            pvalue_dataframe.loc[column1, column2]=pvalue
            dist_dataframe.loc[column1, column2]=diferencia_prom
            if pvalue < 0.05:
                wilcoxon_dataframe.loc[column1, column2]=wil
                norm_wilcoxon_dataframe.loc[column1, column2]=nwil
            else:
                wilcoxon_dataframe.loc[column1, column2]=np.nan
                norm_wilcoxon_dataframe.loc[column1, column2]=np.nan
        print('\nTerminado {}'.format(column1), end="\n")
 
    wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'stat.csv'),sep="\t",decimal=",",header=wilcoxon_dataframe.columns)
    pvalue_dataframe.to_csv('{}{}'.format(destination_folder,'pvalue.csv'),sep="\t",decimal=",",header=pvalue_dataframe.columns)
    norm_wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'nstat.csv'),sep="\t",decimal=",",header=norm_wilcoxon_dataframe.columns)
    dist_dataframe.to_csv('{}{}'.format(destination_folder,'distancia.csv'),sep="\t",decimal=",",header=dist_dataframe.columns)
    print('\nTerminado {}/{}'.format(method,zero), end="\n")

In [73]:
#wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="wilcox")
#wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="pratt")
#wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="zsplit")
#print("\nListo All Corpus")
#wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="wilcox", normalize=True)
#wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="pratt", normalize=True)
#wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="zsplit", normalize=True)
#print("\nListo All Corpus Normalizado")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="pratt", zeros="maxmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="zsplit", zeros="maxmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="wilcox", zeros="maxmedian")
print("\nListo All Corpus maxmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="pratt", zeros="conjmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="zsplit", zeros="conjmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="wilcox", zeros="conjmedian")
print("\nListo All Corpus conjmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="pratt", normalize=True, zeros="maxmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="zsplit", normalize=True, zeros="maxmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="wilcox", normalize=True, zeros="maxmedian")
print("\nListo All Corpus normalizado maxmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="pratt", normalize=True, zeros="conjmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="zsplit", normalize=True, zeros="conjmedian")
wilcoxon_signedrank_matrix_all_corpus(DESTINO, method="wilcox", normalize=True, zeros="conjmedian")
print("\nListo All Corpus normalizado conjmedian")

#TOTALES_COLOMBIA#:Zipaquirá w:61600678.0 p:0.0010191849672664555				8e-181		46		
Terminado #TOTALES_COLOMBIA#
Acevedo:Zipaquirá w:40123049.0 p:0.0		0926138456e-170		71025e-87		61		
Terminado Acevedo
Aguachica:Zipaquirá w:45072566.0 p:4.096179230923277e-221		206		165		9			
Terminado Aguachica
Agustín Codazi:Zipaquirá w:47464111.0 p:5.266750664635652e-168		144		29		36		
Terminado Agustín Codazi
Albania:Zipaquirá w:45129316.0 p:8.642386931255364e-220		98		-78		14		
Terminado Albania
Andes:Zipaquirá w:47989502.0 p:2.584245278994446e-157		31		-201		82		
Terminado Andes
Apartadó:Zipaquirá w:44394016.0 p:1.587545257241938e-237		41		-129						
Terminado Apartadó
Arauca:Zipaquirá w:43311338.0 p:6.03853928770277e-265		256		36		-38		
Terminado Arauca
Armenia:Zipaquirá w:52587396.0 p:7.264204786586766e-79		-62		268				
Terminado Armenia
Baranoa:Zipaquirá w:63348554.0 p:0.7824514010289454							05e-224		3			
Terminado Baranoa
Barbosa_Antioquia:Zipaquirá w:51990706.0 p:1.6478466452647962e-87		67		-

KeyboardInterrupt: 

In [78]:
def wilcoxon_signedrank_matrix_intersected_corpus_renormaliced_data(orgin, method="wilcox"):
    destination_folder="/src/data/procesados/wilcoxon_signed/intersected_corpus/{}/".format(method)
    words_data = pd.read_csv(
        orgin, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    
    wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.int32)
    pvalue_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    norm_wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    dist_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    for column1 in words_data:
        # eliminar ceros
        x=words_data[column1]
        for column2 in words_data:
            y=words_data[column2]
            #Intersected corpus
            intersection = pd.concat([x,y], axis=1)  
            intersection = intersection[getattr(intersection,column1) != 0]
            intersection = intersection[getattr(intersection,column2) != 0]
            # renormalizar
            totales = pd.DataFrame(intersection.apply(np.sum), columns=["#TOTALES#"]).transpose()
            intersection = pd.concat([totales, intersection])
            intersection = intersection.iloc[0:]/intersection.iloc[0]
            intersection.drop(labels=["#TOTALES#"], inplace=True)
            x1 = intersection.iloc[0:,0]
            y1 = intersection.iloc[0:,1]
            # calcular ceros para método wilcox:
            if  (column1=="Bogotá" and column2=="Medellín"):
                return(intersection.rank())
            diferencia = pd.DataFrame(x1-y1, columns=["diferencia"])
            diferencia_prom = np.float64(abs(diferencia.sum())/len(diferencia))
            if method == "wilcox":
                ceros = len(diferencia[diferencia.diferencia == 0])
                n = len(diferencia)-ceros
            elif method == "pratt" or method == "zsplit":
                n = len(x1)
            # s = n*(n+1)/2 este no hay que buscar porque funciona con 4
            s = n*(n+1)/4
            wil, pvalue = wilcoxon(x1,y1,zero_method=method)
            nwil = wil/s
            dist_dataframe.loc[x1.name, y1.name]=diferencia_prom
            pvalue_dataframe.loc[x1.name, y1.name]=pvalue
            print('\r{}:{} w:{} p:{}'.format(column1,column2,wil,pvalue), end="\t\t")
            if pvalue < 0.05:
                wilcoxon_dataframe.loc[x1.name, y1.name]=wil
                norm_wilcoxon_dataframe.loc[x1.name, y1.name]=nwil
            else:
                wilcoxon_dataframe.loc[x1.name, y1.name]=np.nan
                norm_wilcoxon_dataframe.loc[x1.name, y1.name]=np.nan
    wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'stat.csv'),sep="\t",decimal=",",header=wilcoxon_dataframe.columns)
    pvalue_dataframe.to_csv('{}{}'.format(destination_folder,'pvalue.csv'),sep="\t",decimal=",",header=pvalue_dataframe.columns)
    norm_wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'nstat.csv'),sep="\t",decimal=",",header=norm_wilcoxon_dataframe.columns)
    dist_dataframe.to_csv('{}{}'.format(destination_folder,'distancia.csv'),sep="\t",decimal=",",header=dist_dataframe.columns)

In [79]:
wilcoxon_signedrank_matrix_intersected_corpus_renormaliced_data(DESTINO, method="pratt")
wilcoxon_signedrank_matrix_intersected_corpus_renormaliced_data(DESTINO, method="wilcox")
wilcoxon_signedrank_matrix_intersected_corpus_renormaliced_data(DESTINO, method="zsplit")

Bogotá:Mariquita w:11515259.0 p:1.1965597254413652e-57				7e-60				27						1			6		



Bogotá:Mariquita w:11515259.0 p:1.1965597254413652e-57				7e-60				27						1			6		

Unnamed: 0,Bogotá,Medellín
que,15857.0,15857.0
la,15856.0,15856.0
y,15855.0,15855.0
el,15854.0,15854.0
a,15853.0,15853.0
no,15851.0,15851.0
en,15852.0,15852.0
me,15849.0,15850.0
es,15850.0,15849.0
mi,15846.0,15843.0


In [22]:
def wilcoxon_signedrank_matrix_united_corpus_renormaliced_data(orgin, method="wilcox", zeros=False):
    destination_folder="/src/data/procesados/wilcoxon_signed/united_corpus/{}/".format(method)
    words_data = pd.read_csv(
        orgin, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    
    wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.int32)
    pvalue_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    norm_wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    dist_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    
    if zeros:
        destination_folder="{}/{}".format(destination_folder,method)
        
    for column1 in words_data:
        # eliminar ceros
        x=pd.DataFrame(words_data[column1])
        x=x[getattr(x,column1) != 0]
        for column2 in words_data:
            y=pd.DataFrame(words_data[column2])
            y=y[getattr(y,column2) != 0]
            #United corpus  
            union = pd.concat([x,y], axis=1)
            # renormalizar
            totales = pd.DataFrame(union.apply(np.sum), columns=["#TOTALES#"]).transpose()
            union = pd.concat([totales, union])
            union = union.iloc[0:]/union.iloc[0]
            union.drop(labels=["#TOTALES#"], inplace=True)
            if zeros:
                pueblo1_len, pueblo2_len, pueblo1_mean, pueblo2_mean, pueblo1_median, pueblo2_median=corpus_statistis(origin, column1, column2)
                if zeros=="maxmedian":
                    if pueblo1_len > pueblo2_len:
                        union[column2]=union[column2].replace(0,pueblo1_median)
                    if pueblo1_len < pueblo2_len:
                        union[column1]=union[column1].replace(0,pueblo2_median)
                if zeros=="conjmedian":
                    union[column2]=union[column2].replace(0,pueblo1_median)
                    union[column1]=union[column1].replace(0,pueblo2_median)
            x1 = union.iloc[0:,0]
            y1 = union.iloc[0:,1]
            # calcular ceros para método wilcox:
            diferencia = pd.DataFrame(x1-y1, columns=["diferencia"])
            diferencia_prom = np.float64(abs(diferencia.sum())/len(diferencia))
            if method == "wilcox":
                ceros = len(diferencia[diferencia.diferencia == 0])
                n = len(diferencia)-ceros
            elif method == "pratt" or method == "zsplit":
                n = len(x1)
            # s = n*(n+1)/2 este no hay que buscar porque funciona con 4
            s = n*(n+1)/4
            wil, pvalue = wilcoxon(x1,y1,zero_method=method)
            nwil = wil/s
            print('\r{}:{} w:{} p:{}'.format(column1,column2,wil,pvalue), end="\t\t")
            if pvalue < 0.05:
                wilcoxon_dataframe.loc[x1.name, y1.name]=wil
                pvalue_dataframe.loc[x1.name, y1.name]=pvalue
                norm_wilcoxon_dataframe.loc[x1.name, y1.name]=nwil
            else:
                wilcoxon_dataframe.loc[x1.name, y1.name]=np.nan
                pvalue_dataframe.loc[x1.name, y1.name]=np.nan
                norm_wilcoxon_dataframe.loc[x1.name, y1.name]=np.nan
            dist_dataframe.loc[x1.name, y1.name]=diferencia_prom
            if column1=="Acevedo" and column2=="Bogotá":
                return(union)
    wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'stat.csv'),sep="\t",decimal=",",header=wilcoxon_dataframe.columns)
    pvalue_dataframe.to_csv('{}{}'.format(destination_folder,'pvalue.csv'),sep="\t",decimal=",",header=pvalue_dataframe.columns)
    norm_wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'nstat.csv'),sep="\t",decimal=",",header=norm_wilcoxon_dataframe.columns)
    dist_dataframe.to_csv('{}{}'.format(destination_folder,'distancia.csv'),sep="\t",decimal=",",header=dist_dataframe.columns)

In [24]:
wilcoxon_signedrank_matrix_united_corpus_renormaliced_data(DESTINO, method="pratt")
wilcoxon_signedrank_matrix_united_corpus_renormaliced_data(DESTINO, method="wilcox")
wilcoxon_signedrank_matrix_united_corpus_renormaliced_data(DESTINO, method="zsplit")

Bogotá:Acevedo w:18606576.0 p:0.0		2.0 p:0.2893647098119938				3			82		6		

Unnamed: 0,Bogotá,Acevedo
a,3.089060e-02,0.028027
aa,7.612008e-06,0.000015
abajo,5.054715e-05,0.000048
abandona,2.133928e-05,0.000015
abandonadas,8.980459e-07,0.000004
abandonado,8.638346e-06,0.000015
abandonando,9.835741e-07,0.000007
abandonar,1.244435e-05,0.000007
abandonas,1.710564e-06,
abandone,3.763240e-06,0.000007


In [147]:
def wilcoxon_signedrank_smallest_corpus(orgin, method="wilcox", normalize=False):
    destination_folder="/src/data/procesados/wilcoxon_signed/smallest/{}/".format(method)
    words_data = pd.read_csv(
        orgin, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    
    wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.int32)
    pvalue_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    norm_wilcoxon_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    dist_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
              
    for column1 in words_data:
        x=pd.DataFrame(words_data[column1])
        x = x[getattr(x,column1) != 0]
        xlen = len(x)
        for column2 in words_data:
            y=pd.DataFrame(words_data[column2])
            y= y[getattr(y,column1) != 0]
            ylen = len(y)
            if xlen > ylen:
                words_to_drop = words_data.index - y.index
                x = pd.DataFrame(words_data[column1]).drop(labels=words_to_drop)
                z=pandas.concat([x,y], axis=1).fillna(0)
            elif xlen < ylen:
                words_to_drop = words_data.index - x.index
                y = pd.DataFrame(words_data[column2]).drop(labels=words_to_drop)
                z=pd.concat([x,y], axis=1).fillna(0)
            elif xlen == ylen:
                z=pd.concat([x,y], axis=1).fillna(0)
            if normalize==True:
                destination_folder="/src/data/procesados/wilcoxon_signed/normalized_smallest/{}/".format(method)
                # renormalizar
                totales = pd.DataFrame(z.apply(np.sum), columns=["#TOTALES#"]).transpose()
                z = pd.concat([totales, z])
                z = z.iloc[0:]/z.iloc[0]
                z.drop(labels=["#TOTALES#"], inplace=True)
            if method == "wilcox":
                diferencia = pd.DataFrame(z[column1]-z[column2], columns=["diferencia"])
                ceros = len(diferencia[diferencia.diferencia == 0])
                n = len(diferencia)-ceros
                return(n)
            elif method == "pratt" or method == "zsplit":
                n = len(words_data[column2])
            # s = n*(n+1)/2 este no hay que buscar porque funciona con 4
            s = n*(n+1)/4
            wil, pvalue = wilcoxon(words_data[column1],words_data[column2],zero_method=method)
            nwil = wil/s
            print('\r{}:{} w:{} p:{}'.format(column1,column2,wil,pvalue), end="\t\t")
            wilcoxon_dataframe.loc[column1, column2]=wil
            pvalue_dataframe.loc[column1, column2]=pvalue
            norm_wilcoxon_dataframe.loc[column1, column2]=nwil
            dist_dataframe.loc[column1, column2]=diferencia_prom
 
    wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'stat.csv'),sep="\t",decimal=",",header=wilcoxon_dataframe.columns)
    pvalue_dataframe.to_csv('{}{}'.format(destination_folder,'pvalue.csv'),sep="\t",decimal=",",header=pvalue_dataframe.columns)
    norm_wilcoxon_dataframe.to_csv('{}{}'.format(destination_folder,'nstat.csv'),sep="\t",decimal=",",header=norm_wilcoxon_dataframe.columns)
    dist_dataframe.to_csv('{}{}'.format(destination_folder,'distancia.csv'),sep="\t",decimal=",",header=dist_dataframe.columns)
    
wilcoxon_signedrank_smallest_corpus(DESTINO, method="wilcox", normalize=False)    

ValueError: cannot reindex from a duplicate axis

# Prueba de parametros

In [10]:
def vocabulary_size_proportion(info_file):
    destination_folder="/src/data/procesados/proportions/vocabulary_proportions.csv"
    words_data = pd.read_csv(
        info_file, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    
    proportion_dataframe = pd.DataFrame(columns=words_data.columns, dtype=np.float64)
    for column1 in words_data:
        # eliminar ceros
        x=pd.DataFrame(words_data[column1], columns=[column1])
        x=x[getattr(x,column1) != 0]
        xlen = len(x)

        for column2 in words_data:
            y=pd.DataFrame(words_data[column2], columns=[column2])
            y=y[getattr(y,column2) != 0]
            ylen = len(y)
            #Max and min lengths
            maxim = max(xlen,ylen)
            minim = min(xlen,ylen)
            proportion = minim/maxim
            proportion_dataframe.loc[column1, column2]=proportion
    proportion_dataframe.to_csv('{}'.format(destination_folder),sep="\t",decimal=",",header=proportion_dataframe.columns)
    
vocabulary_size_proportion("/src/data/procesados/filtered/Filter20MILData.csv")

In [11]:
def filestatistics(file):
    origin = pd.read_csv(
        file, 
        encoding='utf-8',
        index_col=0,
        sep ="\t",
        decimal=",",
        quotechar='"'
    )
    origin = origin.replace(0, np.nan)
    origin = origin.replace(1, np.nan)
    maxim =  origin.stack().max()
    minim = origin.stack().min()
    prom = origin.stack().mean()
    mediana = origin.stack().median()
    stdandar = origin.stack().std(ddof=1)
    return(maxim, minim, prom, mediana, stdandar)

In [12]:
filestatistics("/src/data/procesados/wilcoxon_signed/intersected_corpus/pratt/nstat.csv")

(0.98141837882354643,
 0.4403663233922453,
 0.8093902983137774,
 0.8108697354622103,
 0.09361734749022893)