# Cálculos de HSIC

In [1]:
import sys
# Add the project source root ('/src') to sys.path so local packages such as HSIC can be imported
sys.path.append('/src')

from IPython.core.display import display, HTML
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
import pandas
import re
from HSIC.HSIC import HSIC_no_pval
import time
import matplotlib.pyplot as plt

In [2]:
# Input: per-word table, read as a tab-separated file by hsic_no_pvalue_to_file.
WORDS_ORIGIN = "/src/data/procesados/clean/TWITTER_COLOMBIA_FREQ_CLEAN.csv"
# Input: city table; only the 'ciudad', 'Latitud' and 'Longitud' columns are read.
CITIES_ORIGIN = "/src/data/procesados/geo/filter_cities_coordinates.csv"
# Output path for the HSIC results.
# NOTE(review): in the cells shown, DESTINO is reassigned before the first call to
# hsic_no_pvalue_to_file, so this particular path does not appear to be written.
DESTINO = "/src/data/procesados/HSIC/HSIC_RANK_NORM_SIN_PVALUE_FREQ.csv"

In [3]:
def escribir_cabezote_archivo(destino, fieldnames):
    """Create (or truncate) ``destino`` and write ``fieldnames`` with its header row.

    Despite the parameter name, ``fieldnames`` is a pandas DataFrame: its column
    names become the header line and its rows are written right after it.
    The output is tab-separated and uses a comma as decimal mark.

    Parameters
    ----------
    destino : str
        Path of the file to create; an existing file is overwritten.
    fieldnames : pandas.DataFrame
        Frame whose columns and rows are written.
    """
    # Explicit UTF-8 keeps non-ASCII index labels portable instead of depending
    # on the platform's default encoding. newline='' stops the text layer from
    # translating the line terminators that pandas already emits.
    with open(destino, 'w', encoding='utf-8', newline='') as csvfile:
        fieldnames.to_csv(csvfile, sep="\t", header=True, decimal=",")
        
def escribir_datos_archivo(destino, values):
    """Append the rows of ``values`` to ``destino`` without a header line.

    The rows are written tab-separated with a comma as decimal mark, matching
    the layout produced by ``escribir_cabezote_archivo``.

    Parameters
    ----------
    destino : str
        Path of the file to append to; it is created if it does not exist.
    values : pandas.DataFrame
        Frame whose rows (including the index) are appended.
    """
    # Explicit UTF-8 keeps non-ASCII index labels portable instead of depending
    # on the platform's default encoding. newline='' stops the text layer from
    # translating the line terminators that pandas already emits.
    with open(destino, 'a', encoding='utf-8', newline='') as csvfile:
        values.to_csv(csvfile, sep="\t", header=False, decimal=",")

In [188]:
def hsic_no_pvalue_to_file(words_file, cities_file, destino):
    """Compute HSIC (no p-value) between city coordinates and each word, one output row per word.

    The words file is streamed one row at a time. Each row is stacked under the
    city coordinates and transposed, so that the first two columns hold
    'Latitud' and 'Longitud' and the third holds the word's values.
    ``HSIC_no_pval`` is then evaluated twice with Gaussian kernels: once on the
    raw values (column ``HSIC``) and once on ``log(values + 1)`` (column
    ``HSIC_LIN``). Rows are written to ``destino`` as they are produced.

    Parameters
    ----------
    words_file : str
        Tab-separated file with decimal commas; the first column is the index
        (the word). The second line of the file is skipped.
        Presumably one column per city -- TODO confirm against the data.
    cities_file : str
        Tab-separated file providing the columns 'ciudad', 'Latitud', 'Longitud'.
    destino : str
        Output path; overwritten on the first processed word, appended afterwards.

    Returns
    -------
    tuple
        ``(hsic, hsic_lin)`` for the last processed word, or ``(None, None)``
        when no word was processed.
    """
    words = pandas.read_csv(
        words_file,
        encoding='utf-8',
        chunksize=1,
        index_col=0,
        skiprows=[1],
        sep="\t",
        decimal=",",
        quotechar='"'
    )
    cities = pandas.read_csv(
        cities_file,
        encoding='utf-8',
        sep="\t",
        decimal=",",
        quotechar='"',
        usecols=['ciudad', 'Latitud', 'Longitud'],
        index_col=0
    ).transpose()

    count = 0
    # Bound up front so an input without data rows returns (None, None)
    # instead of failing on unbound names at the return statement.
    hsic = hsic_lin = None

    for chunk in words:
        if chunk.empty:
            # Nothing to index in an empty chunk; skip it.
            continue
        start_time = time.time()
        data = pandas.concat([cities, chunk]).transpose()
        # .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
        cities_vector = data.iloc[:, 0:2].values
        word_vector = data.iloc[:, 2:3].values
        # "Linearise" the values; the +1 keeps zero entries finite under the log.
        linear_word_vector = np.log(word_vector + 1)
        hsic = HSIC_no_pval(cities_vector, word_vector, kernelX="Gaussian", kernelY="Gaussian")
        hsic_lin = HSIC_no_pval(cities_vector, linear_word_vector, kernelX="Gaussian", kernelY="Gaussian")
        # Single-row frame indexed by the word, built directly rather than by
        # enlarging an empty frame.
        row = pandas.DataFrame(
            {'HSIC': [hsic], 'HSIC_LIN': [hsic_lin]},
            index=['{}'.format(chunk.index[0])],
            dtype='float'
        )
        if count == 0:
            # First word: create the file and emit the header line with the row.
            escribir_cabezote_archivo(destino, row)
        else:
            escribir_datos_archivo(destino, row)
        count = count + 1
        elapsed_time = time.time() - start_time
        print ('\rDuracion:{}\t procesados:{}\t palabra:{}\t'.format(elapsed_time,count, chunk.index.values[0]),end='\t')

    return (hsic, hsic_lin)
        

In [189]:
# With absolute frequencies
# Inputs and output for this run; these assignments replace the values set near the top of the notebook.

WORDS_ORIGIN = "/src/data/procesados/clean/TWITTER_COLOMBIA_FREQ_CLEAN.csv"
CITIES_ORIGIN = "/src/data/procesados/geo/filter_cities_coordinates.csv"
DESTINO = "/src/data/procesados/HSIC/HSIC_SIN_PVALUE_FREQ.csv"

In [190]:
# Run the absolute-frequency computation; the cell's displayed value is the
# (HSIC, HSIC_LIN) pair returned for the last processed word.
hsic_no_pvalue_to_file(WORDS_ORIGIN, CITIES_ORIGIN, DESTINO)

Duracion:0.06232762336730957	 procesados:130464	 palabra:casaenelagua								bas		licaigual		



Duracion:0.057520389556884766	 procesados:413040	 palabra:tardata				ntoss				y		te			iaz		tiaportiypensarasenmiaunqueesteslejos		u				hablandohermosofiebreanochededicarimagenes		atarara		

(7.1257991320446221e-05, 7.1257991320446221e-05)

In [191]:
# With relative frequencies
# Only the words input and the output path change; CITIES_ORIGIN keeps the value assigned in an earlier cell.

WORDS_ORIGIN = "/src/data/procesados/clean/TWITTER_COLOMBIA_RELATIVE_FREQ_CLEAN.csv"
DESTINO = "/src/data/procesados/HSIC/HSIC_SIN_PVALUE_REL_FREQ.csv"
def hsic_no_pvalue_to_file(words_file, cities_file, destino):
    """Compute HSIC (no p-value) between city coordinates and each word, one output row per word.

    Variant used for relative frequencies: the "linearised" vector is
    ``log(values * 100000000)``.

    The words file is streamed one row at a time. Each row is stacked under the
    city coordinates and transposed, so that the first two columns hold
    'Latitud' and 'Longitud' and the third holds the word's values.
    ``HSIC_no_pval`` is then evaluated twice with Gaussian kernels: once on the
    raw values (column ``HSIC``) and once on the log-scaled values (column
    ``HSIC_LIN``). Rows are written to ``destino`` as they are produced.

    Parameters
    ----------
    words_file : str
        Tab-separated file with decimal commas; the first column is the index
        (the word). The second line of the file is skipped.
        Presumably one column per city -- TODO confirm against the data.
    cities_file : str
        Tab-separated file providing the columns 'ciudad', 'Latitud', 'Longitud'.
    destino : str
        Output path; overwritten on the first processed word, appended afterwards.

    Returns
    -------
    tuple
        ``(hsic, hsic_lin)`` for the last processed word, or ``(None, None)``
        when no word was processed.
    """
    # Factor applied to the values before taking the logarithm.
    scale = 100000000

    words = pandas.read_csv(
        words_file,
        encoding='utf-8',
        chunksize=1,
        index_col=0,
        skiprows=[1],
        sep="\t",
        decimal=",",
        quotechar='"'
    )
    cities = pandas.read_csv(
        cities_file,
        encoding='utf-8',
        sep="\t",
        decimal=",",
        quotechar='"',
        usecols=['ciudad', 'Latitud', 'Longitud'],
        index_col=0
    ).transpose()

    count = 0
    # Bound up front so an input without data rows returns (None, None)
    # instead of failing on unbound names at the return statement.
    hsic = hsic_lin = None

    for chunk in words:
        if chunk.empty:
            # Nothing to index in an empty chunk; skip it.
            continue
        start_time = time.time()
        data = pandas.concat([cities, chunk]).transpose()
        # .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
        cities_vector = data.iloc[:, 0:2].values
        word_vector = data.iloc[:, 2:3].values
        # "Linearise" the values.
        # NOTE(review): any zero entry becomes log(0) = -inf here, which may
        # distort HSIC_LIN -- confirm whether an offset is intended.
        linear_word_vector = np.log(word_vector * scale)
        hsic = HSIC_no_pval(cities_vector, word_vector, kernelX="Gaussian", kernelY="Gaussian")
        hsic_lin = HSIC_no_pval(cities_vector, linear_word_vector, kernelX="Gaussian", kernelY="Gaussian")
        # Single-row frame indexed by the word, built directly rather than by
        # enlarging an empty frame.
        row = pandas.DataFrame(
            {'HSIC': [hsic], 'HSIC_LIN': [hsic_lin]},
            index=['{}'.format(chunk.index[0])],
            dtype='float'
        )
        if count == 0:
            # First word: create the file and emit the header line with the row.
            escribir_cabezote_archivo(destino, row)
        else:
            escribir_datos_archivo(destino, row)
        count = count + 1
        elapsed_time = time.time() - start_time
        print ('\rDuracion:{}\t procesados:{}\t palabra:{}\t'.format(elapsed_time,count, chunk.index.values[0]),end='\t')

    return (hsic, hsic_lin)


In [192]:
# Run the relative-frequency computation; the cell's displayed value is the
# (HSIC, HSIC_LIN) pair returned for the last processed word.
hsic_no_pvalue_to_file(WORDS_ORIGIN, CITIES_ORIGIN, DESTINO)

Duracion:0.056249141693115234	 procesados:130464	 palabra:casaenelagua								as		licaigual		



Duracion:0.05717825889587402	 procesados:413040	 palabra:tardata				entoss						te						z		iaportiypensarasenmiaunqueesteslejos		tu			hablandohermosofiebreanochededicarimagenes		tatarara				

(2.0840356939155631e-05, 0.0)

In [193]:
# With ranks
# Only the words input and the output path change; CITIES_ORIGIN keeps the value assigned in an earlier cell.

WORDS_ORIGIN = "/src/data/procesados/clean/TWITTER_COLOMBIA_RANK.csv"
DESTINO = "/src/data/procesados/HSIC/HSIC_SIN_PVALUE_RANK.csv"

def hsic_no_pvalue_to_file(words_file, cities_file, destino):
    """Compute HSIC (no p-value) between city coordinates and each word, one output row per word.

    Variant used for ranks: the "linearised" vector is ``log(values)``.

    The words file is streamed one row at a time. Each row is stacked under the
    city coordinates and transposed, so that the first two columns hold
    'Latitud' and 'Longitud' and the third holds the word's values.
    ``HSIC_no_pval`` is then evaluated twice with Gaussian kernels: once on the
    raw values (column ``HSIC``) and once on their logarithm (column
    ``HSIC_LIN``). Rows are written to ``destino`` as they are produced.

    Parameters
    ----------
    words_file : str
        Tab-separated file with decimal commas; the first column is the index
        (the word). The second line of the file is skipped.
        Presumably one column per city -- TODO confirm against the data.
    cities_file : str
        Tab-separated file providing the columns 'ciudad', 'Latitud', 'Longitud'.
    destino : str
        Output path; overwritten on the first processed word, appended afterwards.

    Returns
    -------
    tuple
        ``(hsic, hsic_lin)`` for the last processed word, or ``(None, None)``
        when no word was processed.
    """
    words = pandas.read_csv(
        words_file,
        encoding='utf-8',
        chunksize=1,
        index_col=0,
        skiprows=[1],
        sep="\t",
        decimal=",",
        quotechar='"'
    )
    cities = pandas.read_csv(
        cities_file,
        encoding='utf-8',
        sep="\t",
        decimal=",",
        quotechar='"',
        usecols=['ciudad', 'Latitud', 'Longitud'],
        index_col=0
    ).transpose()

    count = 0
    # Bound up front so an input without data rows returns (None, None)
    # instead of failing on unbound names at the return statement.
    hsic = hsic_lin = None

    for chunk in words:
        if chunk.empty:
            # Nothing to index in an empty chunk; skip it.
            continue
        start_time = time.time()
        data = pandas.concat([cities, chunk]).transpose()
        # .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
        cities_vector = data.iloc[:, 0:2].values
        word_vector = data.iloc[:, 2:3].values
        # "Linearise" the values.
        # NOTE(review): assumes every value is strictly positive (ranks starting
        # at 1); a zero would give -inf -- confirm against the data.
        linear_word_vector = np.log(word_vector)
        hsic = HSIC_no_pval(cities_vector, word_vector, kernelX="Gaussian", kernelY="Gaussian")
        hsic_lin = HSIC_no_pval(cities_vector, linear_word_vector, kernelX="Gaussian", kernelY="Gaussian")
        # Single-row frame indexed by the word, built directly rather than by
        # enlarging an empty frame.
        row = pandas.DataFrame(
            {'HSIC': [hsic], 'HSIC_LIN': [hsic_lin]},
            index=['{}'.format(chunk.index[0])],
            dtype='float'
        )
        if count == 0:
            # First word: create the file and emit the header line with the row.
            escribir_cabezote_archivo(destino, row)
        else:
            escribir_datos_archivo(destino, row)
        count = count + 1
        elapsed_time = time.time() - start_time
        print ('\rDuracion:{}\t procesados:{}\t palabra:{}\t'.format(elapsed_time,count, chunk.index.values[0]),end='\t')

    return (hsic, hsic_lin)

In [194]:
# Run the rank computation.
# NOTE(review): the saved output shows this run stopped with KeyboardInterrupt
# after 7310 words, so the output file from that run is likely incomplete.
hsic_no_pvalue_to_file(WORDS_ORIGIN, CITIES_ORIGIN, DESTINO)

Duracion:0.0830388069152832	 procesados:7310	 palabra:ocupa				imiento			

KeyboardInterrupt: 

In [None]:
# With normalised ranks

# FIXME (author's note "Corregir"): this section is marked as needing correction; the note does not say what.

WORDS_ORIGIN = "/src/data/procesados/clean/TWITTER_COLOMBIA_RANK_NORM.csv"
DESTINO = "/src/data/procesados/HSIC/HSIC_SIN_PVALUE_RANK_NORM.csv"

def hsic_no_pvalue_to_file(words_file, cities_file, destino):
    """Compute HSIC (no p-value) between city coordinates and each word, one output row per word.

    Variant used for normalised ranks: the "linearised" vector is
    ``log(values * 100000000)``.

    The words file is streamed one row at a time. Each row is stacked under the
    city coordinates and transposed, so that the first two columns hold
    'Latitud' and 'Longitud' and the third holds the word's values.
    ``HSIC_no_pval`` is then evaluated twice with Gaussian kernels: once on the
    raw values (column ``HSIC``) and once on the log-scaled values (column
    ``HSIC_LIN``). Rows are written to ``destino`` as they are produced.

    Parameters
    ----------
    words_file : str
        Tab-separated file with decimal commas; the first column is the index
        (the word). The second line of the file is skipped.
        Presumably one column per city -- TODO confirm against the data.
    cities_file : str
        Tab-separated file providing the columns 'ciudad', 'Latitud', 'Longitud'.
    destino : str
        Output path; overwritten on the first processed word, appended afterwards.

    Returns
    -------
    tuple
        ``(hsic, hsic_lin)`` for the last processed word, or ``(None, None)``
        when no word was processed.
    """
    # Factor applied to the values before taking the logarithm.
    scale = 100000000

    words = pandas.read_csv(
        words_file,
        encoding='utf-8',
        chunksize=1,
        index_col=0,
        skiprows=[1],
        sep="\t",
        decimal=",",
        quotechar='"'
    )
    cities = pandas.read_csv(
        cities_file,
        encoding='utf-8',
        sep="\t",
        decimal=",",
        quotechar='"',
        usecols=['ciudad', 'Latitud', 'Longitud'],
        index_col=0
    ).transpose()

    count = 0
    # Bound up front so an input without data rows returns (None, None)
    # instead of failing on unbound names at the return statement.
    hsic = hsic_lin = None

    for chunk in words:
        if chunk.empty:
            # Nothing to index in an empty chunk; skip it.
            continue
        start_time = time.time()
        data = pandas.concat([cities, chunk]).transpose()
        # .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
        cities_vector = data.iloc[:, 0:2].values
        word_vector = data.iloc[:, 2:3].values
        # "Linearise" the values.
        # NOTE(review): any zero entry becomes log(0) = -inf here, which may
        # distort HSIC_LIN -- confirm whether an offset is intended.
        linear_word_vector = np.log(word_vector * scale)
        hsic = HSIC_no_pval(cities_vector, word_vector, kernelX="Gaussian", kernelY="Gaussian")
        hsic_lin = HSIC_no_pval(cities_vector, linear_word_vector, kernelX="Gaussian", kernelY="Gaussian")
        # Single-row frame indexed by the word, built directly rather than by
        # enlarging an empty frame.
        row = pandas.DataFrame(
            {'HSIC': [hsic], 'HSIC_LIN': [hsic_lin]},
            index=['{}'.format(chunk.index[0])],
            dtype='float'
        )
        if count == 0:
            # First word: create the file and emit the header line with the row.
            escribir_cabezote_archivo(destino, row)
        else:
            escribir_datos_archivo(destino, row)
        count = count + 1
        elapsed_time = time.time() - start_time
        print ('\rDuracion:{}\t procesados:{}\t palabra:{}\t'.format(elapsed_time,count, chunk.index.values[0]),end='\t')

    return (hsic, hsic_lin)

In [None]:
# Run the normalised-rank computation.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# appears not to have been run.
hsic_no_pvalue_to_file(WORDS_ORIGIN, CITIES_ORIGIN, DESTINO)