# **Imports**

In [1]:
# Change the working directory to `src` before importing — presumably so the
# local packages (Processing, GloVe, Axes) can be resolved; TODO confirm layout.
import os
os.chdir("../..")
os.chdir(r"src")

import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# NOTE(review): the star imports below supply names that this notebook uses
# without defining them (e.g. standard_opening, clean, filter_model,
# barycentre, models_w, tech/reg/pos/neg). Which module provides which name
# cannot be told from this file.
from Processing.text_cleaning import *
from GloVe.weights import *
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")
from Axes.projection_functions import *
from Axes.models import *
from Processing.preprocess_parliament import *

# Step back up one directory — presumably so the relative `data/...` paths
# used further down resolve; TODO confirm.
os.chdir("../")

/Users/alexandrequeant/Desktop/Travail-TSE


# **Computations**

## Newspapers embeddings

In [10]:
def process_embeddings(file_path):
    """Average the sentence embeddings stored in one file, per source.

    Parameters
    ----------
    file_path
        Passed unchanged to ``standard_opening`` (called with ``False`` as its
        second argument) to obtain the input frame.

    Returns
    -------
    pandas.DataFrame
        One row per ``source`` with ``sentence_embedding`` holding the mean
        embedding of that source and ``count`` holding how many rows were
        averaged.
    """
    frame = standard_opening(file_path, False)

    # The embeddings are stored as text; parse each cell, then coerce to a
    # float array.
    # NOTE(review): eval() executes whatever the cell contains — only run this
    # on files you trust.
    parsed = frame['sentence_embedding'].apply(eval)
    frame['sentence_embedding'] = parsed.apply(np.array, args=(float,))

    # A constant 1 per row turns the grouped sum into a row count.
    frame['count'] = 1

    kept_columns = ['sentence_embedding', 'source', 'count']
    per_source = frame[kept_columns].groupby(by='source', as_index=False).sum()

    # Summed embedding divided by the number of rows gives the mean (computed
    # row by row).
    per_source['sentence_embedding'] = per_source.apply(
        lambda row: row['sentence_embedding'] / row['count'], axis=1
    )
    return per_source

In [24]:
# Build one per-source mean-embedding frame for each of the 14 yearly files.
# The file label is "201" followed by the loop index, i.e. 2010..2019 and then
# 20110..20113 — the same labels used by the `embedding ...` columns of df_axes.
dataframes = []
for i in range(14):
    # int() parses the label directly; eval() would run the string as code.
    year = int('201' + str(i))
    file_path = f'data/sentence_embeddings/sentence_embeddings_{year}.csv'
    dataframes.append(process_embeddings(file_path))

## Axis words embeddings

In [25]:
# Concatenate tech, reg, pos and neg (defined outside this notebook) and pass
# the result through `clean` with the 'unigram' option; the last line displays
# how many entries come out.
axes_words = clean(tech + reg + pos + neg, 'unigram')
len(axes_words)

354

In [26]:
def give_embed_anyway(word, model_word, list_of_words):
    """Return the embedding of ``word``, falling back to a zero vector.

    Parameters
    ----------
    word
        The word to look up.
    model_word
        Mapping-like model; indexed as ``model_word[word]``.
    list_of_words
        Candidate words handed to ``filter_model`` together with the model.

    Returns
    -------
    ``model_word[word]`` when ``word`` is among
    ``filter_model(list_of_words, model_word)``; otherwise a float array of
    50 zeros.
    """
    retained_words = filter_model(list_of_words, model_word)
    if word not in retained_words:
        # Placeholder so that every word still gets a vector (length 50).
        return np.zeros(50, dtype=float)
    return model_word[word]

In [27]:
# axes_words_embeddings[m][w] is the vector (or zero placeholder) of the w-th
# axis word under the m-th of the 14 models.
axes_words_embeddings = [
    [give_embed_anyway(word, models_w[model_idx], axes_words) for word in axes_words]
    for model_idx in range(14)
]

In [61]:
# One row per axis word: the word itself ('text') followed by its vector under
# each of the 14 models. Column labels are "embedding 201<i>" for i = 0..13,
# i.e. 'embedding 2010' ... 'embedding 2019', 'embedding 20110' ... 'embedding 20113'.
df_axes = pd.DataFrame(
    zip(axes_words, *axes_words_embeddings),
    columns=['text'] + [f'embedding 201{idx}' for idx in range(14)],
)

## Final formula

In [82]:
# Selects the axis the cells below work on: 1 -> tech / reg word lists,
# 2 -> pos / neg word lists (see project_variation_on_axis). It also picks the
# pole vector used for scaling, via poles[k][axis-1].
axis = 2

In [83]:
# poles[m] holds, for the m-th model, two direction vectors: the barycentre of
# the positive seed words minus the barycentre of the negative seed words, first
# for the (pos_1, neg_1) lists and then for (pos_2, neg_2). Seed words are first
# passed through filter_model together with that model. The vectors are left
# un-normalised here.
poles = []

for model_idx in range(len(models_w)):
    yearly_model = models_w[model_idx]

    pole_pair = []
    for positive_seeds, negative_seeds in ((pos_1, neg_1), (pos_2, neg_2)):
        positive_kept = filter_model(positive_seeds, yearly_model)
        negative_kept = filter_model(negative_seeds, yearly_model)
        pole_pair.append(
            barycentre(positive_kept, yearly_model) - barycentre(negative_kept, yearly_model)
        )

    poles.append(pole_pair)

In [84]:
# For every year index k in 3..8 (years 2013..2018) and every source, measure
# how much each axis word contributes to the source's move between year k and
# year k+1:
#
#     dot(unit(mean_embedding[k+1]) - unit(mean_embedding[k]), word_vector[k])
#         / norm(poles[k][axis-1])
#
# The result is stored in dataframes[k] as one new column per axis word, and a
# 'year' column is added. Finally the six frames are stacked into `df`.
for k in range(3, 9):
    #dataframes[k] = dataframes[k].drop(dataframes[k][dataframes[k]['source'] == 'par'].index)
    frame_now = dataframes[k]
    frame_next = dataframes[k + 1]

    # Unit-length mean embedding of every source in the following year, keyed
    # by source so the two years are paired by source rather than by row index.
    next_unit = {
        src: emb / np.linalg.norm(emb)
        for src, emb in zip(frame_next['source'], frame_next['sentence_embedding'])
    }
    missing = [src for src in frame_now['source'] if src not in next_unit]
    if missing:
        raise KeyError(
            f"sources {missing} are present for year index {k} "
            f"but absent for year index {k + 1}"
        )

    # The displacement of each source does not depend on the axis word, so it
    # is computed once per year instead of once per (word, source) pair.
    diffs = [
        next_unit[src] - emb / np.linalg.norm(emb)
        for src, emb in zip(frame_now['source'], frame_now['sentence_embedding'])
    ]
    pole_norm = np.linalg.norm(poles[k][axis - 1])

    # Column 0 of df_axes is 'text'; column k+1 holds the vectors of model k.
    embedding_col = df_axes.columns[k + 1]

    for i in df_axes.index:
        word = df_axes[embedding_col][i]
        frame_now[str(df_axes['text'][i])] = [np.dot(diff, word) / pole_norm for diff in diffs]

    frame_now['year'] = int(2010 + k)

# Stack the six yearly frames in one call (row indices are not reset).
df = pd.concat([dataframes[k] for k in range(3, 9)])

# Visualisation

In [85]:
def _rank_word_variations(words, row_df):
    """Pair each word with the first value of its column in ``row_df``, highest first.

    A word is printed and left out of the ranking when ``row_df`` has no column
    of that name or has no rows. Values are stored under the word they belong
    to, so a skipped word cannot shift the values of the words that follow it.
    """
    found = {}
    for word in words:
        try:
            found[word] = row_df[word].tolist()[0]
        except (KeyError, IndexError):
            # KeyError: no column for this word; IndexError: no matching row.
            print(word)
    return sorted(found.items(), key=lambda item: item[1], reverse=True)


def see_variation_on_axis(source:str, year:int, df):
    """Rank the axis words by their stored variation for one source and year.

    Parameters
    ----------
    source : str
        Value to match in the ``source`` column of ``df``.
    year : int
        Value to match in the ``year`` column of ``df``.
    df : pandas.DataFrame
        Frame with ``source`` and ``year`` columns plus one column per axis word.

    Returns
    -------
    tuple
        ``(sorted_var_tech, sorted_var_reg, sorted_var_pos, sorted_var_neg)``.
        Each element is a list of ``(word, value)`` pairs sorted by value in
        descending order, built from ``clean(<list>, 'unigram')`` for the
        ``tech``, ``reg``, ``pos`` and ``neg`` word lists respectively. The
        value is taken from the first row matching ``source`` and ``year``.
        Words that cannot be looked up are printed and omitted.
    """
    selected = df.loc[(df['source'] == source) & (df['year'] == year)]

    return tuple(
        _rank_word_variations(clean(word_list, 'unigram'), selected)
        for word_list in (tech, reg, pos, neg)
    )

In [86]:
def project_variation_on_axis(source:str, year:int, df, number_of_words) :
    """Show two bar charts of the top-ranked words for one source and year.

    The notebook-level ``axis`` variable decides what is drawn: with
    ``axis == 1`` the panels show the tech ranking (row 1) and the reg ranking
    (row 2); with ``axis == 2`` they show the pos ranking (row 1) and the neg
    ranking (row 2). For any other value the figure is shown without traces.

    Parameters
    ----------
    source : str
        Source to plot; forwarded to ``see_variation_on_axis``.
    year : int
        Year to plot; the title refers to the step from ``year`` to ``year+1``.
    df : pandas.DataFrame
        Frame forwarded to ``see_variation_on_axis``.
    number_of_words
        How many leading entries of each ranking to draw.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # (trace name, position in the tuple returned by see_variation_on_axis)
    panels_by_axis = {
        1: (('var_tech', 0), ('var_reg', 1)),
        2: (('var_pos', 2), ('var_neg', 3)),
    }
    panels = panels_by_axis.get(axis, ())

    fig = make_subplots(rows=2, cols=1)

    if panels:
        # Compute the rankings once; they are the same for both panels.
        rankings = see_variation_on_axis(source, year, df)
        for row_idx, (trace_name, ranking_idx) in enumerate(panels, start=1):
            top_entries = rankings[ranking_idx][:number_of_words]
            fig.add_trace(
                go.Bar(
                    x=[word for word, _ in top_entries],
                    y=[value for _, value in top_entries],
                    name=trace_name,
                ),
                row=row_idx, col=1)

    fig.update_layout(height = 500, title_text= str(number_of_words) + " words most responsible for the move of " + str(source) + " towards the respective poles between year " + str(year) + " and " + str(year+1))

    fig.show()

In [87]:
# Plot, for source 'par', up to 30 top-ranked words per panel for the step
# from 2017 to 2018.
project_variation_on_axis('par', 2017, df, 30)

# **Evidence that matrix A varies little across years**

## Matrix A

In [13]:
# For every model, build three unit-length axis vectors: the barycentre of the
# positive seed words minus the barycentre of the negative seed words, divided
# by its norm. l_1, l_2 and l_3 collect the vectors for the seed pairs
# (pos_1, neg_1), (pos_2, neg_2) and (pos_3, neg_3), one entry per model.
# NOTE(review): this cell iterates over `models_words`, while the cells above
# use `models_w` — confirm both names are meant to exist.
l_1, l_2, l_3 = [], [], []
seed_pairs = ((pos_1, neg_1), (pos_2, neg_2), (pos_3, neg_3))

for model in models_words :
    for (positive_seeds, negative_seeds), collected in zip(seed_pairs, (l_1, l_2, l_3)):
        positive_kept = filter_model(positive_seeds, model)
        negative_kept = filter_model(negative_seeds, model)

        direction = barycentre(positive_kept, model) - barycentre(negative_kept, model)
        collected.append(direction / np.linalg.norm(direction))

In [14]:
# Dot product of each axis vector with the one of the preceding model. The
# vectors in l_1, l_2, l_3 are unit length, so this is their cosine similarity.
l1 = [np.dot(following, current) for current, following in zip(l_1, l_1[1:])]
l2 = [np.dot(following, current) for current, following in zip(l_2, l_2[1:])]
l3 = [np.dot(following, current) for current, following in zip(l_3, l_3[1:])]

In [15]:
# One panel per axis, each plotting the consecutive-model cosine series against
# the x values 2011..2023.
fig = make_subplots(rows=1, cols=3)

cosine_series = (
    (l1, 'Axis vector cosine variation for tech VS regulation'),
    (l2, 'Axis vector cosine variation for positive VS negative'),
    (l3, 'Axis vector cosine variation for cognition VS affect'),
)

for col_idx, (cosines, trace_name) in enumerate(cosine_series, start=1):
    fig.add_trace(
        go.Scatter(
            x=[2011 + offset for offset in range(13)],
            y=cosines,
            mode='markers+lines',
            showlegend=False,
            line_shape='spline',
            name=trace_name,
        ),
        row=1, col=col_idx)

fig.update_layout(height=300, width=800)

fig.show()