# Train pam2vec

In [1]:
# Load packages
import os
import pandas as pd
import numpy as np
import yaml
import nltk
import re
from gensim import models
from stop_words import get_stop_words
from scipy.spatial import distance


In [2]:
# Load the project configuration (provides config['ROOT_PATH']).
# safe_load only constructs plain Python objects (dict, list, str, numbers),
# whereas a bare yaml.load(f) can instantiate arbitrary Python objects and is
# rejected by PyYAML >= 6 because no Loader is given.
with open('config_file.yaml', 'r') as f:
    config = yaml.safe_load(f)

In [9]:
# Load the cleaned proposals table stored under <ROOT_PATH>/data
proposals_csv = config['ROOT_PATH'] + '/data/' + 'proposals_clean.csv'
df = pd.read_csv(proposals_csv, sep=',', encoding='utf-8')

In [8]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0.1,Unnamed: 0,proposal,origin,scope,district,category,subcategory,author,author_name,created_at,...,total_neutral_comments,total_negative_comments,rejected_message,language,action,proposal_list,title_ca_action,description_ca_action,title_es_action,description_es_action
0,0,3591.0,citizenship,district,Nou Barris,Economia plural,Un nou lideratge públic,5187,VILLARRASA,2/27/2016,...,0.0,0.0,No hi ha competències i l’Ajuntament no dispos...,es,,,,,,
1,1,2747.0,organization,district,Gràcia,Transició ecològica,Medi ambient i espai públic,6285,,2/17/2016,...,0.0,0.0,No podem comprometre’ns a desenvolupar aquesta...,ca,,,,,,
2,2,3158.0,citizenship,district,Horta - Guinardó,Bon viure,Educació i coneixement,512,Solidaritat Catalana per la Independència,2/19/2016,...,3.0,0.0,,ca,5061.0,"['360', '1648', '3158', '6161']",Analitzar la necessitat i la idoneïtat de l'IE...,Estudiar les necessitats i la situació real pe...,Analizar la necesidad y la idoneidad del IES e...,Estudiar las necesidades y la situación real p...
3,3,8968.0,citizenship,city,,Transició ecològica,Mobilitat sostenible,15512,Archie,4/6/2016,...,0.0,0.0,No podem comprometre’ns a desenvolupar aquesta...,ca,,,,,,
4,4,6774.0,citizenship,district,Gràcia,Transició ecològica,Urbanisme per als barris,15153,Xavier Sisternas,3/24/2016,...,0.0,0.0,,ca,2717.0,"['2391', '2407', '2415', '2418', '2420', '2421...",Pla de millora integral i manteniment de l’esp...,"Identificar, prioritzar i dur a terme actuacio...",Plan de mejora integral y mantenimiento del es...,"Identificar, priorizar y llevar a cabo actuaci..."


In [11]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier
# to_csv call - confirm against the file that produced proposals_clean.csv).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [12]:
# Number of rows in the dataset
len(df)

11448

In [13]:
# List the columns that remain after dropping 'Unnamed: 0'
df.columns

Index(['proposal', 'origin', 'scope', 'district', 'category', 'subcategory',
       'author', 'author_name', 'created_at', 'votes', 'comments', 'url',
       'status', 'title_es', 'title_ca', 'description_es', 'description_ca',
       'group_author', 'total_positive_comments', 'total_neutral_comments',
       'total_negative_comments', 'rejected_message', 'language', 'action',
       'proposal_list', 'title_ca_action', 'description_ca_action',
       'title_es_action', 'description_es_action'],
      dtype='object')

In [14]:
# Create series with texts: one "<title>. <description>" string per proposal
# and per action, pooled into a single training corpus for each language.
def _title_plus_description(titles, descriptions):
    """Join two text columns as '<title>. <description>'.

    Each column is de-duplicated on its own before the join, so the `+`
    aligns the two Series on the row index and yields NaN for every row whose
    title OR description was dropped as a duplicate. Missing values that are
    kept become the literal string 'nan' through astype(str).
    """
    return titles.drop_duplicates().astype(str) + '. ' + descriptions.drop_duplicates().astype(str)


text_ca = _title_plus_description(df.title_ca, df.description_ca)
text_ca_actions = _title_plus_description(df.title_ca_action, df.description_ca_action)
# pd.concat replaces Series.append, which is deprecated and was removed in pandas 2.0
text_ca = pd.concat([text_ca, text_ca_actions], ignore_index=True)
# Discard the rows that the index alignment above turned into NaN
text_ca = text_ca.dropna()

text_es = _title_plus_description(df.title_es, df.description_es)
text_es_actions = _title_plus_description(df.title_es_action, df.description_es_action)
text_es = pd.concat([text_es, text_es_actions], ignore_index=True)
text_es = text_es.dropna()

In [15]:
# Functions to delete stopwords and tokenize
def tokenize_ca(text):
    """Split Catalan text into lower-cased tokens without stop words.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance. Elided articles/pronouns ("l'", "d'",
        "m'", "s'"), the clitics "-ne" / "-se", full stops and apostrophes are
        stripped from every token. A token is kept only if it still contains
        at least one ASCII letter and is not a stop word.

    Notes
    -----
    nltk.sent_tokenize / nltk.word_tokenize are called without a language
    argument, so NLTK's default tokenizer models are used.
    """
    # A set gives O(1) membership tests; the original list was scanned once
    # per token.
    stop_words_ca = set(get_stop_words('catalan'))
    # Extra stop words; 'nan' is what str() produces for missing values.
    stop_words_ca.update(['a', 'al', 'als', 'del', 'dels', 'et', 'la', 'que', 'més', 'no', 'nan'])

    # Substring rewrites applied to every token, in this exact order.
    replacements = (
        (" l '", "l'"),
        ("l'", ""),
        ("d'", ""),
        ("m'", ""),
        ("s'", ""),
        (".", ""),
        ("-ne", ""),
        ("-se", ""),
        ("'", ""),
    )

    filtered_tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            token = word.lower()
            for old, new in replacements:
                token = token.replace(old, new)
            # Requires an ASCII letter: drops punctuation, numbers and
            # tokens emptied by the rewrites above.
            if re.search('[a-zA-Z]', token) and token not in stop_words_ca:
                filtered_tokens.append(token)

    return filtered_tokens

def tokenize_es(text):
    """Split Spanish text into lower-cased tokens without stop words.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance, with the characters ' . , ; : ? ! ¡ ¿
        removed from every token. A token is kept only if it still contains
        at least one ASCII letter and is not a stop word.

    Notes
    -----
    nltk.sent_tokenize / nltk.word_tokenize are called without a language
    argument, so NLTK's default tokenizer models are used.
    """
    # A set gives O(1) membership tests; the original list was scanned once
    # per token. 'nan' is what str() produces for missing values.
    stop_words_es = set(get_stop_words('spanish'))
    stop_words_es.add('nan')

    # One translate() call deletes all the characters that the original
    # removed with nine successive replace() passes.
    strip_chars = str.maketrans('', '', "'.,;:?!¡¿")

    filtered_tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            token = word.lower().translate(strip_chars)
            # Requires an ASCII letter: drops punctuation, numbers and
            # tokens emptied by the stripping above.
            if re.search('[a-zA-Z]', token) and token not in stop_words_es:
                filtered_tokens.append(token)

    return filtered_tokens


In [16]:
# Replace every raw string of both corpora by its list of filtered tokens
text_ca = text_ca.map(tokenize_ca)
text_es = text_es.map(tokenize_es)

In [17]:
# Number of documents (proposals + actions) in the Catalan training corpus
len(text_ca)

11660

In [18]:
# Train model: word2vec (one skip-gram model per language)
vector_size = 100   # dimensionality of the word vectors (also read by pam2vec)
window_size = 4     # context window passed to Word2Vec
min_count = 2       # minimum word frequency passed to Word2Vec
workers_num = 10    # number of worker threads
train_epochs = 500  # passes over the corpus in train()


def _init_word2vec(corpus):
    """Return an untrained skip-gram (sg=1) model whose vocabulary is built from `corpus`."""
    model = models.Word2Vec(size=vector_size, window=window_size, min_count=min_count,
                            workers=workers_num, sg=1)
    model.build_vocab(corpus)  # prepare the model vocabulary
    return model


model_ca = _init_word2vec(text_ca)
model_ca.train(text_ca, total_examples=model_ca.corpus_count, epochs=train_epochs)  # train word vectors

model_es = _init_word2vec(text_es)
# Kept as the last expression so the cell still displays train()'s return value
model_es.train(text_es, total_examples=model_es.corpus_count, epochs=train_epochs)  # train word vectors

(130244004, 139296000)

In [19]:
# Spot-check the Catalan model: similarity between two word vectors
model_ca.wv.similarity('bicicleta', 'ciutat')



0.12373164

In [66]:
# Spot-check the Spanish model. As the recorded output shows, this lookup
# raises KeyError because 'rata' is not in the trained vocabulary.
model_es.wv.similarity('rata', 'gato')

KeyError: "word 'rata' not in vocabulary"

In [21]:
#Functions to compute pam2vec 
def normalize(word_vec, axis=None):
    """Scale `word_vec` to unit Euclidean norm.

    Parameters
    ----------
    word_vec : numpy.ndarray
        Vector, or array of vectors, to scale.
    axis : int, optional
        With the default ``None`` the norm is taken over the whole array; for
        a 2-D input that is the Frobenius norm of the matrix, NOT a per-row
        normalization. Pass an axis (e.g. ``axis=1`` for an
        (n_vectors, dim) matrix) to normalize each vector independently.

    Returns
    -------
    numpy.ndarray
        The scaled array. With ``axis=None`` an all-zero input is returned
        unchanged (same object); with an explicit axis, zero vectors are
        left as zeros.
    """
    if axis is None:
        norm = np.linalg.norm(word_vec)
        if norm == 0:
            return word_vec
        return word_vec / norm

    norms = np.linalg.norm(word_vec, axis=axis, keepdims=True)
    # Divide zero-norm vectors by 1 so they stay zero instead of becoming NaN
    return word_vec / np.where(norms == 0, 1, norms)


def pam2vec(text, lang):
    """Return the mean word2vec vector of a tokenized text.

    Parameters
    ----------
    text : iterable of str
        Tokens of the text (the callers pass the output of tokenize_ca /
        tokenize_es).
    lang : str
        'ca' selects the Catalan model; any other value selects the Spanish
        model.

    Returns
    -------
    numpy.ndarray
        Array of length `vector_size`: the mean of the vectors of the tokens
        found in the model vocabulary, after scaling by normalize(). All-NaN
        when no token is in the vocabulary.
    """
    model = model_ca if lang == 'ca' else model_es
    # Go through model.wv explicitly: indexing the Word2Vec object itself
    # (model[...]) is deprecated in gensim 3.x and removed in gensim 4.
    word_vectors = model.wv
    words_list = [word for word in text if word in word_vectors.vocab]

    if not words_list:
        return np.full(vector_size, np.nan)

    # NOTE(review): normalize() receives the whole (n_words, vector_size)
    # matrix, so all word vectors are divided by one common norm instead of
    # being normalized individually. Kept as is so previously computed
    # distances stay comparable - confirm whether this is intended.
    return np.mean(normalize(word_vectors[words_list]), axis=0)

In [23]:
# Compute pam2vec
def _row_vector(row, title_col, description_col, tokenizer, lang):
    """Embed the '<title>. <description>' text of one row with pam2vec."""
    raw_text = str(row[title_col]) + '. ' + str(row[description_col])
    return pam2vec(tokenizer(raw_text), lang)


# One vector per DataFrame row, in row order
pam2vec_proposals_ca, pam2vec_actions_ca = [], []
pam2vec_proposals_es, pam2vec_actions_es = [], []

for _, row in df.iterrows():
    pam2vec_proposals_ca.append(
        _row_vector(row, 'title_ca', 'description_ca', tokenize_ca, 'ca'))
    pam2vec_actions_ca.append(
        _row_vector(row, 'title_ca_action', 'description_ca_action', tokenize_ca, 'ca'))

    pam2vec_proposals_es.append(
        _row_vector(row, 'title_es', 'description_es', tokenize_es, 'es'))
    pam2vec_actions_es.append(
        _row_vector(row, 'title_es_action', 'description_es_action', tokenize_es, 'es'))



In [59]:
# Compute distances between proposals and actions
# Column prefix -> scipy distance function.
# NOTE(review): 'eucl_dist' stores the SQUARED Euclidean distance
# (distance.sqeuclidean); kept unchanged so existing results stay comparable.
distance_funcs = (
    ('cos_dist', distance.cosine),
    ('eucl_dist', distance.sqeuclidean),
    ('manh_dist', distance.cityblock),
)
vectors_by_lang = (
    ('ca', pam2vec_proposals_ca, pam2vec_actions_ca),
    ('es', pam2vec_proposals_es, pam2vec_actions_es),
)

# A row gets distances unless both action titles are missing
has_action = (df['title_es_action'].notnull() | df['title_ca_action'].notnull()).tolist()

# The vector lists were filled in row order, so they are paired with the rows
# by position (zip) rather than by index label, which only coincides with the
# position for a default RangeIndex. Whole columns are assigned at once
# instead of writing single cells with df.loc inside a row loop.
for lang, proposal_vecs, action_vecs in vectors_by_lang:
    for prefix, dist_func in distance_funcs:
        df[prefix + '_' + lang] = [
            round(dist_func(v1, v2), 3) if linked else np.nan
            for linked, v1, v2 in zip(has_action, proposal_vecs, action_vecs)
        ]

In [60]:
# Preview the table with the new distance columns
df.head()

Unnamed: 0,proposal,origin,scope,district,category,subcategory,author,author_name,created_at,votes,...,title_es_action,description_es_action,cos_dist_ca,eucl_dist_ca,jacc_dist_ca,manh_dist_ca,cos_dist_es,eucl_dist_es,jacc_dist_es,manh_dist_es
0,3591.0,citizenship,district,Nou Barris,Economia plural,Un nou lideratge públic,5187,VILLARRASA,2/27/2016,16.0,...,,,,,,,,,,
1,2747.0,organization,district,Gràcia,Transició ecològica,Medi ambient i espai públic,6285,,2/17/2016,8.0,...,,,,,,,,,,
2,3158.0,citizenship,district,Horta - Guinardó,Bon viure,Educació i coneixement,512,Solidaritat Catalana per la Independència,2/19/2016,373.0,...,Analizar la necesidad y la idoneidad del IES e...,Estudiar las necesidades y la situación real p...,0.321,0.012,1.0,0.886,0.305,0.013,1.0,0.916
3,8968.0,citizenship,city,,Transició ecològica,Mobilitat sostenible,15512,Archie,4/6/2016,1.0,...,,,,,,,,,,
4,6774.0,citizenship,district,Gràcia,Transició ecològica,Urbanisme per als barris,15153,Xavier Sisternas,3/24/2016,7.0,...,Plan de mejora integral y mantenimiento del es...,"Identificar, priorizar y llevar a cabo actuaci...",0.474,0.007,1.0,0.66,0.475,0.007,1.0,0.688


In [61]:
# Summary statistics of the numeric columns, including the distances
df.describe()

Unnamed: 0,proposal,votes,comments,total_positive_comments,total_neutral_comments,total_negative_comments,action,cos_dist_ca,eucl_dist_ca,jacc_dist_ca,manh_dist_ca,cos_dist_es,eucl_dist_es,jacc_dist_es,manh_dist_es
count,11357.0,11357.0,11357.0,11357.0,11357.0,11357.0,8708.0,8617.0,8617.0,8708.0,8617.0,8617.0,8617.0,8708.0,8617.0
mean,5499.708638,15.642159,1.733204,0.49027,0.983622,0.072907,5237.74265,0.281162,0.009912,0.997818,0.708722,0.281115,0.010462,0.983614,0.728579
std,3146.184766,49.972377,5.913829,1.963613,3.289605,0.673665,1042.446625,0.130203,0.01139,0.046663,0.364854,0.133168,0.011351,0.126554,0.373378
min,3.0,0.0,0.0,0.0,0.0,0.0,2585.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2775.0,1.0,0.0,0.0,0.0,0.0,5000.5,0.211,0.004,1.0,0.482,0.208,0.004,1.0,0.505
50%,5500.0,5.0,0.0,0.0,0.0,0.0,5597.0,0.291,0.006,1.0,0.644,0.291,0.007,1.0,0.673
75%,8227.0,15.0,1.0,0.0,1.0,0.0,5856.25,0.369,0.012,1.0,0.879,0.374,0.013,1.0,0.911
max,10946.0,1720.0,337.0,51.0,65.0,30.0,6947.0,0.69,0.167,1.0,3.33,0.687,0.172,1.0,3.326


In [62]:
# Save the table with the distance columns under <ROOT_PATH>/data.
# The DataFrame index is written too (pandas default), as an unnamed first column.
dists_csv = config['ROOT_PATH'] + '/data/' + 'proposals_dists.csv'
df.to_csv(dists_csv, sep=',', encoding='utf-8')