In [None]:
import sys,os
import csv
import pickle
import scipy
import numpy as np
import json
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import sklearn.model_selection
import sklearn.feature_extraction

# Computing Scores

Use this notebook to compute all the pop. scores for all the parties in each nation. The notebook uses all the parties in the test sets, plus all the parties excluded from the training set in the "parties_to_exclude" dictionary from the notebook "00_generate_bag_of_words.ipynb".

In the __Configuration__ section, fill the "nations_params" dictionary with the kind of classifier used for the corresponding nation, the target score used in the Grid Search and the seed for the random number generators. Check "training_results.json" for possible values.


The output data will be saved in .csv format into the "scores" folder.

__Note__

For Spain we will not compute the score for all the regionalist parties.

# Configuration

In [None]:
# One entry per nation/corpus. Each entry names the classifier kind ("model"),
# the grid-search target score ("target") and the random seed ("random_state");
# together with the nation key these identify the saved model files to read.
# Keep the three keys in this order inside every entry.
nations_params = {
    "AT": dict(model="GradientBoosting", target="AUC", random_state=1),
    "IT": dict(model="GradientBoosting", target="AUC", random_state=1),
    "FR": dict(model="GradientBoosting", target="AUC", random_state=1),
    "ES": dict(model="GradientBoosting", target="AUC", random_state=1),
    "DE": dict(model="GradientBoosting", target="AUC", random_state=1),
    "NL": dict(model="GradientBoosting", target="AUC", random_state=1),
    "IT_speeches": dict(model="GradientBoosting", target="AUC", random_state=1),
    "IT_manual": dict(model="GradientBoosting", target="F1", random_state=15),
}

# Parties data

In [None]:
# Party names, keyed by nation code, that are held out of the train/test split
# and scored separately by the scoring loop.
parties_to_exclude = dict(
    IT=[
        "Forward Italy",
        "PdL",
        "Italy of Values",
        "Casapound",
        "Houses of Freedom",
    ],
    FR=[
        "The Greens",
        "French Communist Party",
        "Nouveau Parti Anticapitaliste",
        "Resistons",
        "Debout la France",
    ],
    AT=[
        "Peter Pilz List",
    ],
    NL=[
        "DENK",
        "Party for the Animals",
        "Reformed Political Party",
        "50Plus",
        "Green Left",
    ],
    ES=[
        "Amaiur",
        "Andalusian Party",
        "Aragonist Council",
        "Basque Country Unite",
        "Basque Nationalist Party",
        "Basque Solidarity",
        "Canarian Coalition",
        "Catalan Republican Left",
        "Citizens",
        "Commitment-Q",
        "Commitment-We can-It is time",
        "Democratic Convergence of Catalonia",
        "Forum Asturias",
        "Future Yes",
        "Galician Nationalist Bloc",
        "In Tide",
        "Navarrese People's Union",
        "Valencian style",
    ],
    DE=[
        "Pirates",
    ],
)

# Nation/corpus codes to process, in the order they are configured.
nations = [nation_code for nation_code in nations_params]
# Decimal strings "0" .. "999"; tokens equal to one of these are discarded.
numbers = [str(n) for n in range(1000)]
# Hashed copy of `numbers` so each membership test in cut_words is O(1)
# instead of a scan over the 1000-element list.
_numbers_lookup = frozenset(numbers)
# Fraction of the kept sentences assigned to the training split.
p_train = 0.7
def cut_words(w_list):
    """Filter a list of tokens.

    Keeps, in their original order, the tokens that are longer than two
    characters and are not one of the strings in ``numbers``.

    Parameters
    ----------
    w_list : iterable of str
        Tokens of one sentence.

    Returns
    -------
    list of str
        The surviving tokens.
    """
    return [w for w in w_list if len(w)>2 and w not in _numbers_lookup]


# Scores

In [None]:
def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed on exit."""
    # NOTE(review): unpickling can execute arbitrary code stored in the file,
    # so only load model artefacts produced by a trusted training run.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _load_json(path):
    """Return the parsed JSON content of ``path``; the file is closed on exit."""
    with open(path, "r") as handle:
        return json.load(handle)


def _binarize(counts):
    """Map a term-count matrix to 0/1 term-presence indicators."""
    return (counts > 0).astype(int)


def _party_rows(flags, parties, orientation_of):
    """Build one output row per party.

    Parameters
    ----------
    flags : boolean array with one thresholded prediction per sentence.
    parties : array of party names aligned with ``flags``.
    orientation_of : dict mapping a party name to its orientation.

    Returns
    -------
    list of dict
        Rows with keys "party", "score" (mean of the party's flags),
        "orientation" and "counts" (number of sentences of the party).
        Parties appear in order of first occurrence, so the row order does
        not depend on string hashing.
    """
    rows = []
    for party in dict.fromkeys(parties):
        selected = flags[parties == party]
        rows.append({
            "party": party,
            "score": np.mean(selected),
            "orientation": orientation_of[party],
            "counts": len(selected),
        })
    return rows


def _party_year_rows(flags, parties, years, orientation_of):
    """Build one output row per distinct (party, year) pair.

    Same inputs as ``_party_rows`` plus ``years``, an array aligned with
    ``flags``. Each distinct pair is aggregated exactly once, in order of
    first occurrence, rather than once per sentence.
    """
    rows = []
    for party, year in dict.fromkeys(zip(parties, years)):
        selected = flags[(parties == party) & (years == year)]
        rows.append({
            "party": party,
            "year": year,
            "score": np.mean(selected),
            "orientation": orientation_of[party],
            "counts": len(selected),
        })
    return rows


# The CSV files are written here; create the folder if it is missing.
os.makedirs("./scores", exist_ok=True)

for nation in nations:

    print("reading model for {}...".format(nation))
    # Look the settings up by key so that the order of the keys inside a
    # configuration entry does not matter.
    settings = nations_params[nation]
    model_type = settings["model"]
    target = settings["target"]
    random_state = settings["random_state"]

    model_prefix = "./models/{0}_{1}_{2}_{3}".format(nation, model_type, target, random_state)
    params = _load_pickle(model_prefix + "_best_model_params.pkl")
    model = _load_pickle(model_prefix + "_best_model.pkl")
    # NOTE(review): the companion "_test_indices.pkl" file is not consulted;
    # the split is regenerated below from the seed — confirm this matches
    # the split used by the training notebook.
    max_thresh = params["threshold"]

    ###########################################

    if nation in ["IT_speeches","IT_manual"]:
        data_path = "./datasets/{}_sentences.json".format(nation)
    else:
        data_path = "./datasets/{}_manifesto_sentences.json".format(nation)
    data = _load_json(data_path)

    # Join the filtered tokens straight into one string per sentence. Going
    # through an intermediate array of variable-length token lists is a
    # ragged-array construction, which NumPy deprecated and later rejects.
    texts = np.array([" ".join(cut_words(record["clean_text"])) for record in data])
    parties = np.array([record["party"] for record in data])
    years = np.array([record["year"] for record in data])
    # If a party occurs with several orientations, the last record wins.
    party_orientation = {record["party"]: record["orientation"] for record in data}

    # IT_speeches reuses the Italian exclusion list; IT_manual excludes nothing.
    if nation == "IT_speeches":
        excluded_names = set(parties_to_exclude["IT"])
    elif nation == "IT_manual":
        excluded_names = set()
    else:
        excluded_names = set(parties_to_exclude[nation])
    to_exclude = np.array([party in excluded_names for party in parties], dtype=bool)

    texts_kept = texts[~to_exclude]
    parties_kept = parties[~to_exclude]
    years_kept = years[~to_exclude]

    # Same arrays, seed and test fraction as before, so the split is unchanged.
    (texts_train, texts_test,
     parties_train, parties_test,
     years_train, years_test) = sklearn.model_selection.train_test_split(
        texts_kept, parties_kept, years_kept,
        random_state=random_state, test_size=1-p_train)

    # The vocabulary is learnt from the training sentences only.
    vectorizer = sklearn.feature_extraction.text.CountVectorizer()
    vectorizer.fit(texts_train)
    X_test = _binarize(vectorizer.transform(texts_test))

    ############################################
    # A sentence is flagged when its class-1 probability exceeds the threshold.
    Y_test = (model.predict_proba(X_test)[:,1]>max_thresh)

    print("computing test data scores for {}...".format(nation))
    party_rows = _party_rows(Y_test, parties_test, party_orientation)
    party_year_rows = _party_year_rows(Y_test, parties_test, years_test, party_orientation)

    if to_exclude.any():
        print("reading excluded parties scores for {}...".format(nation))
        parties_excluded = parties[to_exclude]
        years_excluded = years[to_exclude]
        X_excluded = _binarize(vectorizer.transform(texts[to_exclude]))
        Y_excluded = (model.predict_proba(X_excluded)[:,1]>max_thresh)

        # Excluded parties are appended after the test-set parties.
        party_rows.extend(_party_rows(Y_excluded, parties_excluded, party_orientation))
        party_year_rows.extend(
            _party_year_rows(Y_excluded, parties_excluded, years_excluded, party_orientation))

    print("saving scores for {}...".format(nation))

    global_scores_df = pd.DataFrame(
        party_rows, columns=["party", "score", "orientation", "counts"])
    global_scores_df.to_csv("./scores/global_scores_{}.csv".format(nation), index=False)

    score_in_time_df = pd.DataFrame(
        party_year_rows, columns=["party", "year", "score", "orientation", "counts"])
    score_in_time_df.to_csv("./scores/scores_in_time_{}.csv".format(nation), index=False)

    print()

reading model for IT_manual...


  texts = np.array([cut_words(record["clean_text"]) for record in data])


computing test data scores for IT_manual...
saving scores for IT_manual...

