In [10]:
import sys,os
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import numpy as np
import IPython.display
import pickle
import json

# Data Generation

This notebook generates bag-of-words matrices and labels from the Manifesto datasets of several countries. Each dataset is a JSON file whose records are single sentences. Each record has the following fields:

- party: the party the sentence belongs to
- year: the year of the manifesto
- orientation: can be Left-wing, Right-wing or Other
- text: the text of the sentence
- clean_text: the stemmed text, with stop words removed (the code below iterates over it word by word)

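Parties in the __parties_to_exclude__ dictionary below are those that will be excluded from the training.

Parties in the __populist_parties__ dictionary are those that we will label as 1 in our final data. For the manually annotated dataset, the label is instead read from each record's `is_populist` field.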

In [16]:
# Parties whose sentences are skipped when building the data, keyed by country code.
parties_to_exclude = {
    "IT": ['Forward Italy', 'PdL', 'Italy of Values', 'Casapound', 'Houses of Freedom'],
    "FR": ['The Greens', 'French Communist Party', "Nouveau Parti Anticapitaliste",
           "Resistons", 'Debout la France'],
    "AT": ['Peter Pilz List'],
    "NL": ['DENK', 'Party for the Animals', 'Reformed Political Party', '50Plus', 'Green Left'],
    "ES": ['Amaiur', 'Andalusian Party', 'Aragonist Council', 'Basque Country Unite',
           'Basque Nationalist Party', 'Basque Solidarity', 'Canarian Coalition',
           'Catalan Republican Left', 'Citizens', 'Commitment-Q',
           'Commitment-We can-It is time', 'Democratic Convergence of Catalonia',
           'Forum Asturias', 'Future Yes', 'Galician Nationalist Bloc', 'In Tide',
           "Navarrese People's Union", 'Valencian style'],
    "DE": ['Pirates'],
}

# Parties whose sentences receive label 1, keyed by country code.
populist_parties = {
    "IT": ['Northern League', 'PaP', 'M5S', 'Brothers of Italy'],
    "FR": ['National Front', 'Indomitable France'],
    "AT": ['Austrian Freedom Party', 'Alliance for the Future of Austria',
           'Team Stronach for Austria'],
    "NL": ['Party of Freedom', 'List Pim Fortuyn', 'Socialist Party', 'Forum for Democracy'],
    "ES": ['We can', 'In Common We Can', "Vox"],
    "DE": ['The Left', 'Alternative for Germany'],
}

# Country codes to process, in the order they are declared above.
nations = populist_parties.keys()

# Create the output directory; exist_ok makes re-running this cell a no-op
# and avoids the gap between a separate existence check and the creation.
os.makedirs("./bow_and_labels/", exist_ok=True)

# Manifesto Datasets Bag of words and Labels

In [12]:
# For every country: build a vocabulary from the non-excluded manifesto
# sentences, encode each sentence as a binary bag-of-words row, label it by
# party, and pickle the results under ./bow_and_labels/.
for nation in nations:
    # Context managers close each file as soon as it has been read/written.
    with open("./datasets/{}_manifesto_sentences.json".format(nation), "r") as dataset_file:
        data = json.load(dataset_file)

    print("nation = {}".format(nation))

    # Drop excluded parties once so the counting pass and the encoding pass
    # are guaranteed to see the same sentences in the same order.
    excluded = parties_to_exclude[nation]
    kept_records = [record for record in data if record["party"] not in excluded]
    N_sentences = len(kept_records)

    print("finding all words...")
    counts = {}
    for record in kept_records:
        for word in record["clean_text"]:
            counts[word] = counts.get(word, 0) + 1

    print("generating words indices...")

    # Keep only words seen more than 4 times and longer than 2 characters.
    # Dict order is insertion order, so column indices follow first occurrence.
    counts = {word: c for word, c in counts.items() if c > 4 and len(word) > 2}
    words_list = list(counts)
    word_index = {w: column for column, w in enumerate(words_list)}
    N = len(word_index)

    print("generating bag of words and labels...")

    # X[i, j] is 1 when vocabulary word j appears in sentence i (presence, not count).
    X = np.zeros((N_sentences, N))
    # Y[i] is 1 when sentence i belongs to a party listed in populist_parties.
    Y = np.zeros(N_sentences)
    parties = []
    years = []
    orientations = []
    for i, record in enumerate(kept_records):
        party = record["party"]

        for w in record["clean_text"]:
            j = word_index.get(w)
            if j is not None:  # words filtered out of the vocabulary are skipped
                X[i, j] = 1

        if party in populist_parties[nation]:
            Y[i] = 1
        parties.append(party)
        years.append(record["year"])
        orientations.append(record["orientation"])

    parties = np.array(parties)
    years = np.array(years)
    orientations = np.array(orientations)

    # (path template, object) pairs; each file is closed right after the dump.
    artifacts = [
        ("./bow_and_labels/X_{}_sentences.pkl", X),
        ("./bow_and_labels/Y_{}_sentences.pkl", Y),
        ("./bow_and_labels/parties_{}_sentences.pkl", parties),
        ("./bow_and_labels/years_{}_sentences.pkl", years),
        ("./bow_and_labels/orientations_{}_sentences.pkl", orientations),
        ("./bow_and_labels/word_index_{}.pkl", word_index),
    ]
    for path_template, obj in artifacts:
        with open(path_template.format(nation), "wb") as out_file:
            pickle.dump(obj, out_file)

    print()

print("done")

nation = IT
finding all words...
generating words indices...
generating bag of words and labels...

nation = FR
finding all words...
generating words indices...
generating bag of words and labels...

nation = AT
finding all words...
generating words indices...
generating bag of words and labels...

nation = NL
finding all words...
generating words indices...
generating bag of words and labels...

nation = ES
finding all words...
generating words indices...
generating bag of words and labels...

nation = DE
finding all words...
generating words indices...
generating bag of words and labels...

done


# Italian Speeches Bag of words and Labels

In [13]:
# Build the vocabulary, binary bag-of-words matrix and labels for the Italian
# speeches dataset, then pickle them under ./bow_and_labels/.

# Context managers close each file as soon as it has been read/written.
with open("./datasets/IT_speeches_sentences.json", "r") as dataset_file:
    data = json.load(dataset_file)

# Drop excluded parties once so the counting pass and the encoding pass
# are guaranteed to see the same sentences in the same order.
excluded = parties_to_exclude["IT"]
kept_records = [record for record in data if record["party"] not in excluded]
N_sentences = len(kept_records)

print("finding all words...")
counts = {}
for record in kept_records:
    for word in record["clean_text"]:
        counts[word] = counts.get(word, 0) + 1

print("generating words indices...")

# Keep only words seen more than 4 times and longer than 2 characters.
# Dict order is insertion order, so column indices follow first occurrence.
counts = {word: c for word, c in counts.items() if c > 4 and len(word) > 2}
words_list = list(counts)
word_index = {w: column for column, w in enumerate(words_list)}
N = len(word_index)

print("generating bag of words and labels...")

# X[i, j] is 1 when vocabulary word j appears in sentence i (presence, not count).
X = np.zeros((N_sentences, N))
# Y[i] is 1 when sentence i belongs to a party listed in populist_parties["IT"].
Y = np.zeros(N_sentences)
parties = []
years = []
orientations = []
for i, record in enumerate(kept_records):
    party = record["party"]

    for w in record["clean_text"]:
        j = word_index.get(w)
        if j is not None:  # words filtered out of the vocabulary are skipped
            X[i, j] = 1

    if party in populist_parties["IT"]:
        Y[i] = 1

    parties.append(party)
    years.append(record["year"])
    orientations.append(record["orientation"])

parties = np.array(parties)
years = np.array(years)
orientations = np.array(orientations)

# (path, object) pairs; each file is closed right after the dump.
artifacts = [
    ("./bow_and_labels/X_IT_speeches_sentences.pkl", X),
    ("./bow_and_labels/Y_IT_speeches_sentences.pkl", Y),
    ("./bow_and_labels/parties_IT_speeches_sentences.pkl", parties),
    ("./bow_and_labels/years_IT_speeches_sentences.pkl", years),
    ("./bow_and_labels/orientations_IT_speeches_sentences.pkl", orientations),
    ("./bow_and_labels/word_index_IT_speeches.pkl", word_index),
]
for path, obj in artifacts:
    with open(path, "wb") as out_file:
        pickle.dump(obj, out_file)

print()


finding all words...
generating words indices...
generating bag of words and labels...



# Italian Manual Annotations Bag of words and Labels

In [14]:
# Build the vocabulary, binary bag-of-words matrix and labels for the manually
# annotated Italian dataset, then pickle them under ./bow_and_labels/.
# No party is excluded here, and the label comes from each record's
# "is_populist" field rather than from the party lists.

# Context managers close each file as soon as it has been read/written.
with open("./datasets/IT_manual_sentences.json", "r") as dataset_file:
    data = json.load(dataset_file)

N_sentences = len(data)

print("finding all words...")
counts = {}
for record in data:
    for word in record["clean_text"]:
        counts[word] = counts.get(word, 0) + 1

print("generating words indices...")

# Keep only words seen more than 4 times and longer than 2 characters.
# Dict order is insertion order, so column indices follow first occurrence.
counts = {word: c for word, c in counts.items() if c > 4 and len(word) > 2}
words_list = list(counts)
word_index = {w: column for column, w in enumerate(words_list)}
N = len(word_index)

print("generating bag of words and labels...")

# X[i, j] is 1 when vocabulary word j appears in sentence i (presence, not count).
X = np.zeros((N_sentences, N))
# Y[i] is 1 when the record's "is_populist" value is truthy.
Y = np.zeros(N_sentences)
parties = []
years = []
orientations = []
for i, record in enumerate(data):
    for w in record["clean_text"]:
        j = word_index.get(w)
        if j is not None:  # words filtered out of the vocabulary are skipped
            X[i, j] = 1

    if record["is_populist"]:
        Y[i] = 1
    parties.append(record["party"])
    years.append(record["year"])
    orientations.append(record["orientation"])

parties = np.array(parties)
years = np.array(years)
orientations = np.array(orientations)

# (path, object) pairs; each file is closed right after the dump.
# The orientations file now receives `orientations`; the previous code wrote
# `years` into it by mistake.
artifacts = [
    ("./bow_and_labels/X_IT_manual_sentences.pkl", X),
    ("./bow_and_labels/Y_IT_manual_sentences.pkl", Y),
    ("./bow_and_labels/parties_IT_manual_sentences.pkl", parties),
    ("./bow_and_labels/years_IT_manual_sentences.pkl", years),
    ("./bow_and_labels/orientations_IT_manual_sentences.pkl", orientations),
    ("./bow_and_labels/word_index_IT_manual.pkl", word_index),
]
for path, obj in artifacts:
    with open(path, "wb") as out_file:
        pickle.dump(obj, out_file)

print()


finding all words...
generating words indices...
generating bag of words and labels...

