In [1]:
import sys,os
import csv
import pickle
import scipy
import numpy as np
import json
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd

# Computing Scores

Use this nb to compute all the pop. scores for all the parties in each nation. The nb uses all the parties in the test sets, plus all the parties excluded from the training set in the "parties_to_exclude" dictionary from nb "00_generate_bag_of_words.ipynb".

In the __Configuration__ section, fill the "nations_params" dictionary with the kind of classifier used for the corresponding nation, the target score used in the Grid Search and the seed for the random number generators. Check "training_results.json" for possible values.


The output data will be saved in .csv format into the "scores" folder.

__Note__

For Spain we will not compute the score for all the regionalist parties.

# Configuration

In [2]:
# Per-nation settings of the trained model to load.
#   model        -> kind of classifier used for the nation
#   target       -> target score used in the Grid Search
#   random_state -> seed for the random number generators
# The three values are also used to build the file names of the pickled
# model artefacts read from "./models/". The key order inside each entry
# (model, target, random_state) is kept as in the original literal.
nations_params = {
    "AT": dict(model="GradientBoosting", target="AUC", random_state=1),
    "IT": dict(model="GradientBoosting", target="AUC", random_state=1),
    "FR": dict(model="GradientBoosting", target="AUC", random_state=1),
    "ES": dict(model="GradientBoosting", target="AUC", random_state=1),
    "DE": dict(model="GradientBoosting", target="AUC", random_state=1),
    "NL": dict(model="GradientBoosting", target="AUC", random_state=1),
    "IT_speeches": dict(model="GradientBoosting", target="AUC", random_state=1),
    "IT_manual": dict(model="GradientBoosting", target="F1", random_state=1),
}

# Parties data

In [3]:
# Parties listed per nation in the "parties_to_exclude" dictionary; they are
# scored separately from the test-set parties in the "Scores" cell.
parties_to_exclude = {
    "IT": [
        "Forward Italy",
        "PdL",
        "Italy of Values",
        "Casapound",
        "Houses of Freedom",
    ],
    "IT_speeches": [
        "Forward Italy",
        "PdL",
        "Italy of Values",
        "Casapound",
        "Houses of Freedom",
    ],
    "IT_manual": [],
    "FR": [
        "The Greens",
        "French Communist Party",
        "Nouveau Parti Anticapitaliste",
        "Resistons",
        "Debout la France",
    ],
    "AT": ["Peter Pilz List"],
    "NL": [
        "DENK",
        "Party for the Animals",
        "Reformed Political Party",
        "50Plus",
        "Green Left",
    ],
    "ES": ["Citizens"],
    "DE": ["Pirates"],
}



# Parties appended to the excluded parties of "ES" in the "Scores" cell.
spanish_regionalist_parties = [
    "Amaiur",
    "Andalusian Party",
    "Aragonist Council",
    "Basque Country Unite",
    "Basque Nationalist Party",
    "Basque Solidarity",
    "Canarian Coalition",
    "Catalan Republican Left",
    "Citizens",
    "Commitment-Q",
    "Commitment-We can-It is time",
    "Democratic Convergence of Catalonia",
    "Forum Asturias",
    "Future Yes",
    "Galician Nationalist Bloc",
    "In Tide",
    "Navarrese People's Union",
    "Valencian style",
]


# Nations to process, in the order they are listed in nations_params.
nations = [nation for nation in nations_params]
# The strings "0" ... "999"; tokens equal to one of these are dropped from the sentences.
numbers = list(map(str, range(1000)))


# Scores

In [4]:
def _read_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that were produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _read_json(path):
    """Return the parsed content of the JSON file at `path`, closing the file afterwards."""
    with open(path, "r") as handle:
        return json.load(handle)


def _accumulate_scores(flags, party_labels, year_labels,
                       global_scores, global_counts, time_scores, time_counts):
    """Add per-party and per-(party, year) aggregates of `flags` to the four dicts.

    flags        : boolean array, one entry per sentence (True when the predicted
                   probability is above the threshold).
    party_labels : array of party names aligned with `flags`.
    year_labels  : array of years aligned with `flags`.

    For every group the score is the mean of its flags and the count is the
    number of sentences in the group. The dicts are updated in place.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order, so each
    # group is computed exactly once and the output row order is reproducible.
    for party in dict.fromkeys(party_labels):
        rows = np.where(party_labels == party)[0]
        global_scores[party] = np.mean(flags[rows])
        global_counts[party] = len(rows)

    for party, year in dict.fromkeys(zip(party_labels, year_labels)):
        rows = np.where((party_labels == party) & (year_labels == year))[0]
        time_scores[(party, year)] = np.mean(flags[rows])
        time_counts[(party, year)] = len(rows)


# Set version of `numbers` for O(1) membership tests while filtering tokens.
number_tokens = set(numbers)

for nation in nations:

    print("reading model for {}...".format(nation))
    # Read the settings by key instead of unpacking .values(), so the result
    # does not depend on the order of the keys in nations_params.
    config = nations_params[nation]
    model_type = config["model"]
    target = config["target"]
    random_state = config["random_state"]

    model_stem = "./models/{0}_{1}_{2}_{3}".format(nation, model_type, target, random_state)
    params = _read_pickle(model_stem + "_best_model_params.pkl")
    model = _read_pickle(model_stem + "_best_model.pkl")
    indexes_test = _read_pickle(model_stem + "_test_indexes.pkl")
    max_thresh = params["threshold"]

    print("reading test data for {}...".format(nation))
    # Only the test rows are kept. The labels file (Y_*) is not needed for scoring.
    X = _read_pickle("./bow_and_labels/X_{}_sentences.pkl".format(nation))[indexes_test]
    parties = _read_pickle("./bow_and_labels/parties_{}_sentences.pkl".format(nation))[indexes_test]
    years = _read_pickle("./bow_and_labels/years_{}_sentences.pkl".format(nation))[indexes_test]

    print("computing test data scores for {}...".format(nation))
    # A sentence is flagged when its class-1 probability exceeds the stored threshold.
    s = (model.predict_proba(X)[:, 1] > max_thresh)
    global_scores = {}
    global_scores_counts = {}
    score_in_time = {}
    score_in_time_counts = {}
    _accumulate_scores(s, parties, years,
                       global_scores, global_scores_counts,
                       score_in_time, score_in_time_counts)

    print("reading excluded parties scores for {}...".format(nation))

    if "speeches" in nation or "manual" in nation:
        data = _read_json("./datasets/{}_sentences.json".format(nation))
    else:
        data = _read_json("./datasets/{}_manifesto_sentences.json".format(nation))

    ###add party orientation
    # If a party appears in several records, the last record's orientation is kept.
    party_orientation = {record["party"]: record["orientation"] for record in data}

    print("finding all words for {} excluded parties...".format(nation))

    # Parties that get a score from this section.
    scored_parties = set(parties_to_exclude[nation])
    # Parties whose sentences do not contribute to the vocabulary. For Spain the
    # regionalist parties are skipped here as well, but they are not scored.
    vocab_excluded = set(scored_parties)
    if nation == "ES":
        vocab_excluded.update(spanish_regionalist_parties)

    if scored_parties:
        # Word frequencies over the sentences of the non-excluded parties.
        # NOTE(review): this presumably mirrors the vocabulary construction used
        # when the bag of words was generated, so that column indices match the
        # model's features — confirm against "00_generate_bag_of_words.ipynb".
        counts = {}
        for record in data:
            if record["party"] in vocab_excluded:
                continue
            for word in record["clean_text"]:
                if word in number_tokens:
                    continue
                counts[word] = counts.get(word, 0) + 1

        print("generating words indices for {} excluded parties...".format(nation))

        # Keep words seen more than 4 times and longer than 2 characters; the
        # column index of a word is its position in first-appearance order.
        kept_words = [word for word, count in counts.items() if count > 4 and len(word) > 2]
        word_index = {word: column for column, word in enumerate(kept_words)}

        # One row per sentence of the scored parties only, so no unused
        # all-zero rows are allocated or sent to the model.
        scored_records = [record for record in data if record["party"] in scored_parties]

        if scored_records:
            X_excluded = np.zeros((len(scored_records), len(word_index)))
            for row, record in enumerate(scored_records):
                for word in record["clean_text"]:
                    # Number tokens never enter word_index, so no extra filter is needed.
                    column = word_index.get(word)
                    if column is not None:
                        X_excluded[row, column] = 1

            parties_excluded = np.array([record["party"] for record in scored_records])
            years_excluded = np.array([record["year"] for record in scored_records])

            print("computing excluded parties scores for {}...".format(nation))

            s_excluded = (model.predict_proba(X_excluded)[:, 1] > max_thresh)
            _accumulate_scores(s_excluded, parties_excluded, years_excluded,
                               global_scores, global_scores_counts,
                               score_in_time, score_in_time_counts)
        else:
            print("no sentences found for the excluded parties, skipping..")

    else:
        print("no excluded parties, skipping..")

    print("saving scores for {}...".format(nation))

    party_names = list(global_scores)
    global_scores_df = pd.DataFrame({
        "party": party_names,
        "score": [global_scores[party] for party in party_names],
        "orientation": [party_orientation[party] for party in party_names],
        "counts": [global_scores_counts[party] for party in party_names],
    })
    global_scores_df.to_csv("./scores/global_scores_{}.csv".format(nation), index=False)

    time_keys = list(score_in_time)
    score_in_time_df = pd.DataFrame({
        "party": [party for party, _ in time_keys],
        "year": [year for _, year in time_keys],
        "score": [score_in_time[key] for key in time_keys],
        "orientation": [party_orientation[party] for party, _ in time_keys],
        "counts": [score_in_time_counts[key] for key in time_keys],
    })
    score_in_time_df.to_csv("./scores/scores_in_time_{}.csv".format(nation), index=False)

    print()

reading model for IT_speeches...
reading test data for IT_speeches...
computing test data scores for IT_speeches...
reading excluded parties scores for IT_speeches...
finding all words for IT_speeches excluded parties...
generating words indices for IT_speeches excluded parties...
computing excluded parties scores for IT_speeches...
saving scores for IT_speeches...

reading model for IT_manual...
reading test data for IT_manual...
computing test data scores for IT_manual...
reading excluded parties scores for IT_manual...
finding all words for IT_manual excluded parties...
no excluded parties, skipping..
saving scores for IT_manual...

