In [None]:
import os
import pickle
import random
import re
import time

import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from tqdm.auto import tqdm

In [None]:
# Load the gendered word lists: a ';'-separated CSV with a `male` and a `female` column.
words_df = pd.read_csv('./data/male_female_words.csv', sep = ';')

# Flatten both columns into one list, male words first. `dropna()` removes the
# missing cells pandas produces as padding (e.g. when one column is shorter than the other).
# NOTE(review): downstream matching compares these words against lowercased
# text/tokens, so the entries are presumably already lowercase - confirm against the CSV.
word_list = words_df.male.dropna().tolist() + words_df.female.dropna().tolist()

## Tokenization function

In [None]:
def tokenize_speeches(df, decade, sample_size, random_state = None):
    """Sample, filter and tokenize the speeches of one party for one decade.

    Steps:
      1. Keep only rows whose `speech` value is a `str`.
      2. Draw up to `sample_size` speeches per year.
      3. Cheap pre-filter: keep speeches whose lowercased text contains any
         entry of the module-level `word_list` as a substring.
      4. Tokenize with spaCy, lowercase, drop punctuation tokens, strip
         non-word characters and drop tokens that end up empty.
      5. Keep only speeches with at least one token that is exactly equal to
         an entry of `word_list`.
      6. Pickle the result to ./data/output/political/.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `speech`, `party` and `year`. The party label
        is read from the first row that has a string speech.
    decade : str
        Decade label; only used in the output file name and the progress message.
    sample_size : int
        Maximum number of speeches sampled per year. Years with fewer speeches
        contribute all of them instead of raising.
    random_state : optional
        Forwarded to `DataFrame.sample` for every year. The default `None`
        keeps the previous behaviour (NumPy's global random state).

    Returns
    -------
    list[list[str]]
        One list of tokens per retained speech.

    Raises
    ------
    ValueError
        If `df` contains no string speeches.
    """
    # Keep only rows that actually hold text (drops NaN and other non-str values).
    df = df[df['speech'].map(type) == str]

    if df.empty:
        raise ValueError('no string speeches to tokenize for decade ' + decade)

    party = df.party.iloc[0]

    # --- per-year sampling ---------------------------------------------------
    sampled_years = []

    for year in df.year.unique():
        df_year = df[df['year'] == year]

        if (year == 2021) and (party == 'Labour'):
            # Special case kept from the original code: take every speech.
            # NOTE(review): presumably this group has fewer than `sample_size`
            # speeches - confirm whether the cap below makes this branch redundant.
            sampled_years.append(df_year)
        else:
            # Cap n at the number of available rows: `sample` raises when asked
            # for more rows than exist (without replacement).
            n_rows = min(sample_size, len(df_year))
            sampled_years.append(df_year.sample(n = n_rows, random_state = random_state))

    df_samp_year = pd.concat(sampled_years)

    # --- cheap substring pre-filter (avoids running spaCy on irrelevant speeches)
    speeches_lower = df_samp_year.speech.str.lower()

    has_word = [any(word in speech for word in word_list)
                for speech in tqdm(speeches_lower, total = len(speeches_lower))]

    # A boolean ndarray mask keeps the `speech` column even when nothing matches
    # (a plain empty list would be read as an empty column selection).
    df_samp_year_filt = df_samp_year[np.asarray(has_word, dtype = bool)]

    # --- tokenization: lowercase the tokens and remove punctuation ------------
    # Imported lazily so the heavy spaCy dependency is only needed when tokenizing.
    import spacy

    speeches = df_samp_year_filt.speech

    nlp = spacy.load("en_core_web_sm")

    docs = nlp.pipe(speeches, disable = ["ner", "parser", "lemmatizer"])

    word_set = set(word_list)   # O(1) membership tests for the exact-token filter
    tokenized_speeches_filt = []

    # Stream the docs instead of materializing them all in a list first.
    for doc in tqdm(docs, total = len(speeches)):
        stripped = (re.sub(r'\W+', '', token.text.lower())
                    for token in doc if not token.is_punct)

        # Tokens such as '\n' become '' after stripping - drop them so that
        # empty strings do not end up in the corpus.
        tokens = [token for token in stripped if token]

        # Exact-token filter: the substring pre-filter above also lets through
        # speeches where a listed word only appears inside a longer word.
        if not word_set.isdisjoint(tokens):
            tokenized_speeches_filt.append(tokens)

    # --- save filtered tokenized speeches --------------------------------------
    # The file is a pickle despite its '.csv' extension; the path is kept
    # unchanged because `word2vec_model` reads from exactly this location.
    out_path = './data/output/political/tokenized_speeches_filt_' + party + '_' + decade + '.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok = True)

    with open(out_path, 'wb') as fp:   # Pickling
        pickle.dump(tokenized_speeches_filt, fp)

    print(decade + ' + ' + party + ' is done!')

    return tokenized_speeches_filt

### Run tokenization

In [None]:
decades = ['80_90', '90_00', '00_10', '10_21']
parties = ['Conservative', 'Labour']

# Maximum number of speeches sampled per year and party.
n_sample = 10000

# Seed NumPy's global random state so the per-year sampling is reproducible on
# Restart & Run All: `DataFrame.sample` draws from it when no `random_state`
# is passed.
np.random.seed(42)

for decade in decades:
    # One CSV per decade, holding the speeches of all parties.
    df = pd.read_csv('./data/speeches_' + decade + '.csv', sep = ',')

    for party in parties:
        df_party = df[df['party'] == party]

        # Writes the tokenized corpus for this (party, decade) pair to disk;
        # the returned token lists are not needed here.
        tokenize_speeches(df_party, decade, n_sample)

## Model training and save function

In [None]:
def word2vec_model(decade, party):
    """Train a skip-gram Word2Vec model on one party/decade corpus and save it.

    Loads the pickled token lists from
    ./data/output/political/tokenized_speeches_filt_<party>_<decade>.csv,
    trains a 300-dimensional skip-gram model (window 10, min_count 5, one
    worker) and saves it to ./models/political/word2vec_<party>_<decade>.model.

    Parameters
    ----------
    decade : str
        Decade label used in the input and output file names.
    party : str
        Party label used in the input and output file names.

    Raises
    ------
    ValueError
        If the loaded corpus is empty.
    """
    filename = './data/output/political/tokenized_speeches_filt_' + party + '_' + decade + '.csv'

    # The file is a pickle despite its '.csv' extension.
    # SECURITY: unpickling can execute arbitrary code - only load files that
    # were produced by this notebook.
    with open(filename, "rb") as fp:   # Unpickling
        tokenized_speeches_filt = pickle.load(fp)

    # Fail early with a clear message instead of an opaque gensim error.
    if not tokenized_speeches_filt:
        raise ValueError('empty corpus in ' + filename)

    # MODEL hyperparameters
    SIZE      = 300 # dimensions of the embeddings
    SG        = 1   # skip-gram
    WINDOW    = 10  # window size
    N_WORKERS = 1   # number of workers
    MIN_COUNT = 5   # words seen fewer times than this are ignored

    model = Word2Vec(vector_size = SIZE,
                     sg = SG,
                     window = WINDOW,
                     min_count = MIN_COUNT,
                     workers = N_WORKERS)

    # Vocabulary must be built before training; it also sets `corpus_count`.
    model.build_vocab(tokenized_speeches_filt)

    model.train(tokenized_speeches_filt,
                total_examples = model.corpus_count,
                epochs = model.epochs)

    # save model - create the target directory first so a long training run
    # is not lost to a missing folder
    model_path = "./models/political/word2vec_" + party + '_' + decade + ".model"
    os.makedirs(os.path.dirname(model_path), exist_ok = True)
    model.save(model_path)

    print(decade + ' + ' + party + ' model is done!')

### Run model training and save

In [None]:
### SAVE MODEL
# Train and save one Word2Vec model for every (decade, party) combination.

decades = ['80_90', '90_00', '00_10', '10_21']
parties = ['Conservative', 'Labour']

# Decades vary slowest, parties fastest - same order as nested loops.
for decade, party in [(d, p) for d in decades for p in parties]:
    word2vec_model(decade, party)