## Estimate Cosine Similarity between members and their own party- Grouping all speeches

In [1]:
import pandas as pd
from keras.preprocessing.text import text_to_word_sequence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
### Load data

In [4]:
path_file = '/Users/cblanesg/cam.blanes Dropbox/Camila Blanes/Congressional-dataMX/data/02-outcomes/01-policy_positioning/01-text_analysis/wordfish/01_lda/speeches_20k.csv'

In [16]:
data = pd.read_csv(path_file).drop('Unnamed: 0', axis = 1)

## group by topic and legislator

In [18]:
input_data = data[['id_legislador', 'legislatura', 'inc_party','clean_speech', 'topics20k']]

In [104]:
input_data = input_data.groupby(['id_legislador',
                   'legislatura','topics20k',
                   'inc_party']).agg(''.join).reset_index()

In [105]:
input_data2 = input_data.groupby(['id_legislador',
                   'legislatura',
                   'inc_party']).agg(''.join).reset_index()

## Obtain Cosain Similarity between members and median member of party

### Prepare data

In [20]:
coordinadores = pd.read_excel('/Users/cblanesg/cam.blanes Dropbox/Camila Blanes/Congressional-dataMX/data/01-collection_data/00-id_data/04-coordinadores_parlamentarios/coordinadores_parlamentarios.xlsx')
ids_coordinadores = list(coordinadores.id_legislador)

In [108]:
data_members = input_data[~input_data['id_legislador'].isin(ids_coordinadores)]
data_coordinadores = input_data2[input_data2['id_legislador'].isin(ids_coordinadores)].reset_index().drop('index', axis = 1)
data_coordinadores2 = input_data[input_data['id_legislador'].isin(ids_coordinadores)].reset_index().drop('index', axis = 1)

In [32]:
id_legis = '08f7bbd6-fd10-11ea-83d8-acde48001122'

In [33]:
data_legis = data[data['id_legislador'] == id_legis]

In [41]:
counts = data_legis[['legislatura', 'topics20k']].groupby(['topics20k']).count().reset_index()

In [47]:
top_topics = list(counts.sort_values(by=['legislatura'], ascending=False).head(5).topics20k)

In [54]:
data_legis = data_coordinadores[data_coordinadores['id_legislador'] == id_legis]
data_legis = data_legis[data_legis['topics20k'].isin(top_topics)]

In [55]:
def obtain_main_topics(data_legis):
    counts = data_legis[['legislatura', 'topics20k']].groupby(['topics20k']).count().reset_index()
    top_topics = list(counts.sort_values(by=['legislatura'], ascending=False).head(5).topics20k)
    return(top_topics)

In [109]:
all_dataframes = []
for i in tqdm(range(0, len(data_coordinadores))):
    legislatura = data_coordinadores.loc[i]['legislatura']
    party =  data_coordinadores.loc[i]['inc_party']
    leader =  data_coordinadores.loc[i]['id_legislador']
    
    data_legis = data_coordinadores2[data_coordinadores2['id_legislador'] == leader]
    top_topics = obtain_main_topics(data_legis)
    
    #data_legis = data_legis[data_legis['topics20k'].isin(top_topics)]
    subtopics = input_data[input_data['topics20k'].isin(top_topics)]
    subtopics = subtopics.groupby(['id_legislador',
                   'legislatura','inc_party']).agg(''.join).reset_index()
    subset = subtopics[subtopics['legislatura'] == legislatura][subtopics[subtopics['legislatura'] == legislatura]['inc_party'] == party]
    vectorizer = TfidfVectorizer()
    response = vectorizer.fit_transform(subset.clean_speech)
    
    tfidf = pd.DataFrame(response.toarray().transpose(),
                   index=vectorizer.get_feature_names())
    cosine_similarity_matrix = np.transpose(np.asmatrix(tfidf))*np.asmatrix(tfidf)
    
    df_cosine = pd.DataFrame(cosine_similarity_matrix)
    temp_cosine = pd.concat([pd.DataFrame({'id_legislador':subset.id_legislador}).reset_index(drop=True), df_cosine], axis=1)
        
    df_temp = pd.DataFrame({'id_legislador':subset.id_legislador})
    index_party_leader  = df_temp.reset_index()[df_temp.reset_index()['id_legislador'] == leader].index[0]
        
    cosine_out = temp_cosine[['id_legislador', index_party_leader]]
    cosine_out.columns = ['id_legislador', 'cosine_similarity']
    cosine_out['party_leader'] = np.where(cosine_out['id_legislador'] == leader, 1, 0)
    cosine_out['legislatura'] = legislatura
    cosine_out['inc_party'] = party
    all_dataframes.append(cosine_out)
    

100%|██████████| 37/37 [00:04<00:00,  7.61it/s]


In [110]:
agenda_main_topics = pd.concat(all_dataframes)

In [111]:
agenda_main_topics

Unnamed: 0,id_legislador,cosine_similarity,party_leader,legislatura,inc_party
0,015592cc-fd10-11ea-83d8-acde48001122,0.354132,0,64,Partido Encuentro Social
1,02679372-fd10-11ea-83d8-acde48001122,0.145642,0,64,Partido Encuentro Social
2,03bc2ac6-fd10-11ea-83d8-acde48001122,0.148824,0,64,Partido Encuentro Social
3,04e72c02-fd10-11ea-83d8-acde48001122,0.244931,0,64,Partido Encuentro Social
4,0577b2c2-fd10-11ea-83d8-acde48001122,0.235935,0,64,Partido Encuentro Social
...,...,...,...,...,...
126,fe27dc5c-fe1f-11ea-95ca-acde48001122,0.081063,0,60,Partido Acción Nacional
127,fe58123c-fe1f-11ea-95ca-acde48001122,0.248966,0,60,Partido Acción Nacional
128,fe862276-fe1f-11ea-95ca-acde48001122,0.126217,0,60,Partido Acción Nacional
129,feba2706-fe1f-11ea-95ca-acde48001122,0.266683,0,60,Partido Acción Nacional


In [112]:
agenda_main_topics.to_excel('/Users/cblanesg/cam.blanes Dropbox/Camila Blanes/Congressional-dataMX/data/02-outcomes/01-policy_positioning/01-text_analysis/cosine_similarity/party_agenda_leadership_main-topics.xlsx')