In [None]:
import pandas as pd
import numpy as np
import time
import pickle
from tqdm.auto import tqdm
import random
import re
import gensim
from gensim.models import Word2Vec
from numpy.linalg import norm

In [None]:
# LISS questions: read the semicolon-delimited CSV into the `liss` DataFrame

liss = pd.read_csv(
    './data/liss_questions.csv',
    sep = ';',
)

In [None]:
# TOKENIZATION - lowercase the tokens and drop punctuation tokens

import spacy
from sklearn.feature_extraction.text import CountVectorizer
import re

liss_questions = liss.question_liss

start_time = time.time()

nlp = spacy.load("en_core_web_sm")

# Run the questions through spaCy with the listed components switched off;
# the code below only reads `.text` and `.is_punct` from each token.
processed_speeches = list(
    tqdm(
        nlp.pipe(liss_questions, disable = ["ner", "parser", "lemmatizer"]),
        total = len(liss_questions),
    )
)

# One token list per question: skip punctuation tokens, lowercase the text and
# strip every run of non-word characters from it.
# NOTE: a token made up only of non-word characters (e.g. '\n') is reduced to
# the empty string '' and stays in the list; it is not dropped.
tokenized_speeches = [
    [re.sub(r'\W+', '', token.text.lower()) for token in doc if not token.is_punct]
    for doc in processed_speeches
]

In [None]:
# Attach each question's token list to `liss` as a new `liss_tokenized` column.
# `tokenized_speeches` is a plain list, so it is matched to the rows by position
# (one entry per row of `liss`). This modifies `liss` in place.
liss['liss_tokenized'] = tokenized_speeches

## Function LISS Similarity - Political Analysis

In [None]:
def _mean_embedding(tokens, model):
    """Return the mean vector of the tokens found in ``model.wv``, or ``None`` if none are found."""
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return None
    return np.mean(vectors, axis = 0)


def _distance_and_cosine(vec_a, vec_b):
    """Return ``(Euclidean distance, cosine similarity)`` of two vectors, NaN where undefined."""
    if vec_a is None or vec_b is None:
        # At least one text had no in-vocabulary token: both metrics are undefined.
        return np.nan, np.nan
    distance = np.linalg.norm(np.subtract(vec_a, vec_b))
    denominator = norm(vec_a) * norm(vec_b)
    if denominator == 0:
        # A zero-length mean vector has no direction, so the cosine is undefined.
        return distance, np.nan
    return distance, np.dot(vec_a, vec_b) / denominator


def liss_similarity_political(decade, party, liss, model, tokenized_speeches_filt):
    """Compare every LISS question with every speech via mean word embeddings.

    Each text is represented by the mean of the ``model.wv`` vectors of its
    tokens; tokens that are not in ``model.wv`` are ignored.

    Parameters
    ----------
    decade : str
        Label of the decade; only used in the completion message.
    party : str
        Party label; copied into the ``party`` column and the completion message.
    liss : pandas.DataFrame
        Must provide the columns ``question_id`` and ``liss_tokenized``
        (a sequence of tokens per question).
    model : object
        Embedding model exposing ``model.wv`` with ``token in model.wv`` and
        ``model.wv[token]`` (e.g. a gensim ``Word2Vec`` model).
    tokenized_speeches_filt : sequence of sequences of str
        One token sequence per speech; a speech's id is its position.

    Returns
    -------
    pandas.DataFrame
        One row per (question, speech) pair, questions in the outer order, with
        the columns ``party``, ``id_question``, ``id_speech``, ``similarity``
        and ``cosine``. ``similarity`` is the Euclidean distance between the
        two mean embeddings (smaller means closer); ``cosine`` is their cosine
        similarity. Both are NaN when either text has no in-vocabulary token.
    """
    # Speech embeddings do not depend on the question, so compute them once
    # instead of once per (question, speech) pair.
    speech_embeddings = [
        _mean_embedding(speech, model)
        for speech in tqdm(tokenized_speeches_filt, total = len(tokenized_speeches_filt))
    ]

    # If a question_id occurs more than once, the tokens of its first row are
    # used for every occurrence.
    tokens_by_question = {}
    for question_id, tokens in zip(liss['question_id'], liss['liss_tokenized']):
        tokens_by_question.setdefault(question_id, tokens)

    liss_similarity_list = []

    for question_id in tqdm(liss['question_id'], total = len(liss)):

        mean_embedding_liss = _mean_embedding(tokens_by_question[question_id], model)

        for speech_id, mean_embedding_filtered in enumerate(speech_embeddings):

            similarity, cosine = _distance_and_cosine(mean_embedding_liss, mean_embedding_filtered)

            liss_similarity_list.append({'party':      party,
                                         'id_question':question_id,
                                         'id_speech':  speech_id,
                                         'similarity': similarity,
                                         'cosine':     cosine})

    # Explicit columns keep the schema stable even when there are no rows.
    df_liss = pd.DataFrame(liss_similarity_list,
                           columns = ['party', 'id_question', 'id_speech', 'similarity', 'cosine'])

    print(f'{party} and {decade}, is done!')
    return df_liss

### Run LISS Similarity - Political Analysis function

In [None]:
decades = ['80_90', '90_00', '00_10', '10_21']
parties = ['Conservative', 'Labour']

# Keep every result frame, keyed by (party, decade), so the similarities stay
# available after the loop instead of being thrown away.
liss_similarity_results = {}

for decade in decades:
    for party in parties:

        # NOTE(review): the model path is rooted at './' while the speeches path
        # below is rooted at '../' - confirm both resolve from the notebook's
        # working directory.
        model = Word2Vec.load(f"./models/political/word2vec_{party}_{decade}.model") # load model

        # NOTE(review): despite the '.csv' extension this file is read as a pickle.
        filename = f'../data/output/political/tokenized_speeches_filt_{party}_{decade}.csv'

        # SECURITY: pickle.load can execute arbitrary code; only load files
        # produced by this project.
        with open(filename, "rb") as fp:   # Unpickling
            tokenized_speeches_filt = pickle.load(fp) # Load tokenized speeches

        liss_similarity_results[(party, decade)] = liss_similarity_political(
            decade, party, liss, model, tokenized_speeches_filt)