# **Import**

In [1]:
import os
os.chdir("../..")
os.chdir(r"src")

import json
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from Processing.text_cleaning import *
from GloVe.weights import *
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
from Axes.projection_functions import *
from Axes.models import *
from Axes.filter_words import *
from Processing.preprocess_parliament import *

os.chdir("../")

/Users/alexandrequeant/Desktop/Travail-TSE


# **Word projections with weights**

In [2]:
def process_year_data(year, model_words_year):
    """Build the per-word projection table for one yearly corpus.

    Reads ``data/words/Finalwords_{year}.json`` and
    ``data/vocabs/vocab_{year}.json`` (paths relative to the current working
    directory), scales every vocabulary vector by its word weight and projects
    each word on the two axes built from the globals ``pos_1``/``neg_1`` and
    ``pos_2``/``neg_2``.

    Parameters
    ----------
    year : int
        Corpus label used in the file names. Labels above 2019 are shifted by
        18090 before being stored (e.g. 20110 is stored as 2020).
    model_words_year
        Object indexable by word that returns that word's vector
        (presumably the embedding model trained on this corpus -- confirm).

    Returns
    -------
    pandas.DataFrame
        Columns 'text', 'embedding', 'cos axe 1', 'cos axe 2' and 'year',
        one row per vocabulary entry.
    """
    with open(f'data/words/Finalwords_{year}.json') as words_file:
        corpus_words = json.load(words_file)

    # NOTE(review): weighting scheme and the meaning of `a` live in GloVe.weights.
    word_weights = get_weights_word2vec(corpus_words, a=1e-3)

    with open(f'data/vocabs/vocab_{year}.json') as vocab_file:
        vocabulary = json.load(vocab_file)

    weighted_vectors = []
    for token in vocabulary:
        weighted_vectors.append(word_weights[token] * model_words_year[token])

    table = pd.DataFrame(zip(vocabulary, weighted_vectors), columns=['text', 'embedding'])

    # Both axis vectors are built before any projection is computed.
    axis_by_column = {
        'cos axe 1': axis_vector(pos_1, neg_1, model_words_year),
        'cos axe 2': axis_vector(pos_2, neg_2, model_words_year),
    }
    for column, axis_v in axis_by_column.items():
        table[column] = table['text'].apply(
            cosine_with_axis, axis_v=axis_v, model_sentences=model_words_year
        )

    # Labels are produced as '201' + index, so 20110 and beyond are mapped
    # back onto 2020 and later.
    if year <= 2019:
        stored_year = year
    else:
        stored_year = year - 18090
    table['year'] = stored_year

    return table

# Build one projection table per yearly corpus.
# `models_w` is expected to be an already-loaded sequence holding one model per
# corpus, indexed by position 0..13 (it is not defined in this cell).
# Corpus labels are '201' followed by the index: 2010..2019, then 20110..20113.
all_dfs = []
for i in range(14):  # Adjust range as needed
    # int() parses the label directly; eval() would execute arbitrary code.
    year = int(f'201{i}')
    df_year = process_year_data(year, models_w[i])
    all_dfs.append(df_year)

# **Real embedding variations**

In [4]:
def var_embed_real(word:str, df1, df2, cos_axe:str):
    """Return the change of ``cos_axe`` for ``word`` between two frames.

    Parameters
    ----------
    word : str
        Value looked up in the 'text' column of both frames.
    df1 : pandas.DataFrame
        Earlier frame (the value that is subtracted).
    df2 : pandas.DataFrame
        Later frame (the value that is subtracted from).
    cos_axe : str
        Name of the column to compare, e.g. 'cos axe 1'.

    Returns
    -------
    float or None
        ``df2`` value minus ``df1`` value, using the first matching row in each
        frame. ``None`` when the word is absent from either frame, when a
        required column is missing, or when the two values cannot be
        subtracted.
    """
    try:
        current_value = df2.loc[df2['text'] == word][cos_axe].values[0]
        previous_value = df1.loc[df1['text'] == word][cos_axe].values[0]
        return current_value - previous_value
    except (IndexError, KeyError, TypeError):
        # IndexError: no row for `word`; KeyError: missing column;
        # TypeError: non-numeric/missing cell. Anything else (including
        # KeyboardInterrupt) is a real problem and must not be hidden.
        return None

In [5]:
# Attach year-over-year variation columns ('var cos axe 1', 'var cos axe 2')
# to every frame except the first, comparing each frame with its predecessor.
# NOTE(review): all_dfs[0] has no predecessor and therefore gets no 'var ...'
# columns.
for previous_df, current_df in zip(all_dfs, all_dfs[1:]):
    for cos_axe in ('cos axe 1', 'cos axe 2'):
        deltas = current_df['text'].apply(
            var_embed_real,
            df1=previous_df,
            df2=current_df,
            cos_axe=cos_axe,
        )
        current_df[f'var {cos_axe}'] = deltas

In [6]:
# Inspect the frame at index 6 (corpus label 2016) to check the new 'var ...' columns.
all_dfs[6]

Unnamed: 0,text,embedding,cos axe 1,cos axe 2,year,var cos axe 1,var cos axe 2
0,lesley,[[[[[ 0.85344989 -0.76419199 0.69369632 -0.30...,0.242352,0.125835,2016,0.008122,-0.088206
1,itinerari,[[[[[ 5.53582532e-01 7.42409179e-01 -2.306312...,0.069193,0.308554,2016,0.146832,0.327236
2,stag,[[[[[-0.1054389 -0.72052369 -0.48361916 -0.20...,-0.018865,-0.014295,2016,0.194558,0.021773
3,ingeni,[[[[[ 4.32699895e-01 -2.24511721e-01 8.596688...,0.010645,-0.064990,2016,0.130969,0.013774
4,morein,[[[[[-0.6001104 -0.44765165 0.40003361 0.40...,0.025486,-0.027517,2016,-0.089606,0.050730
...,...,...,...,...,...,...,...
19495,cosmet,[[[[[ 1.03527253 0.32627772 -2.09583277 -0.59...,-0.272678,0.187423,2016,-0.206885,0.073061
19496,systema,[[[[[-0.43356704 0.55048663 0.24307671 -1.00...,-0.213749,0.096239,2016,-0.237050,0.032960
19497,comet,[[[[[ 0.83031857 0.0263708 0.05200302 -0.30...,0.065103,0.010565,2016,0.184930,-0.138672
19498,tenth,[[[[[ 1.08642054 0.30102415 0.53701357 0.40...,0.305026,0.170527,2016,0.163375,0.111473


## New filter

In [7]:
# Pass both keyword lists through the project's `clean` helper in 'unigram'
# mode and drop duplicates (element order is not preserved).
# NOTE(review): both names are rebound to their cleaned version, so re-running
# this cell feeds already-cleaned words through `clean` a second time.
def _unique_cleaned(raw_words):
    return list(set(clean(raw_words, 'unigram')))

events_keywords = _unique_cleaned(events_keywords)
new_topics = _unique_cleaned(new_topics)

In [8]:
def is_in_keywords(word):
    """Tell whether ``word`` appears in either keyword list.

    Looks the word up in the module-level ``new_topics`` first and then in
    ``events_keywords``; returns ``True`` on the first hit, ``False`` otherwise.
    """
    return word in new_topics or word in events_keywords

In [9]:
def process_yearly_data(df, year, min_count=100):
    """Keep only the frequent keyword rows of one yearly table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'text' column. A 'word count' column is added to this
        frame in place.
    year : int
        Label used to open ``data/words/Finalwords_{year}.json`` (relative to
        the current working directory). The file is expected to contain a flat
        JSON list of tokens -- confirm against the preprocessing step.
    min_count : int, default 100
        A word is kept only when it occurs strictly more than ``min_count``
        times in the word file.

    Returns
    -------
    pandas.DataFrame
        New frame (independent of ``df``) holding the rows whose word is both
        frequent enough and accepted by ``is_in_keywords``; it carries the
        extra columns 'word count' and 'in keywords'.
    """
    # Load the words from the file
    with open(f'data/words/Finalwords_{year}.json') as f:
        words = json.load(f)

    # Calculate word counts
    word_counts = Counter(words)

    # Apply the word count to the dataframe
    df['word count'] = df['text'].apply(lambda word: word_counts.get(word, 0))

    # Take an explicit copy so the column added below is written to an
    # independent frame rather than to a slice of `df`.
    df_filtered = df[df['word count'] > min_count].copy()

    # astype(bool) keeps the mask boolean even when no row survives the
    # frequency filter, so the empty result still has every column.
    df_filtered['in keywords'] = df_filtered['text'].apply(is_in_keywords).astype(bool)

    # Filter by 'in keywords'
    df_keywords = df_filtered[df_filtered['in keywords']]

    return df_keywords

# Replace every yearly frame by its frequent-keyword subset.
# Requires `is_in_keywords` and `process_yearly_data` to be defined.
# Corpus labels are '201' followed by the index: 2010..2019, then 20110..20113.
for i in range(14):  # Adjust the range according to your needs
    # int() parses the label directly; eval() would execute arbitrary code.
    year = int(f'201{i}')
    all_dfs[i] = process_yearly_data(all_dfs[i], year)

# **Visualization**

In [10]:
def get_top_variations(df_keywords, axis, number):
    """Return the ``number`` largest and ``number`` smallest variations on an axis.

    The frame is ordered by the column ``'var cos axe {axis}'``; the first
    returned frame holds the highest values (descending), the second the lowest
    values (ascending). Both keep only 'text', 'year' and the variation column.
    """
    variation_column = f'var cos axe {axis}'
    kept_columns = ['text', 'year', variation_column]

    def _extreme_rows(ascending):
        ordered = df_keywords.sort_values(by=[variation_column], ascending=ascending)
        return ordered.head(number)[kept_columns]

    var_up = _extreme_rows(False)
    var_down = _extreme_rows(True)
    return var_up, var_down

In [21]:
def vizualize_top_variations(df_keywords, axis_1, axis_2=None, variation_1 = 'up', variation_2 = 'down', number=20):
    """Show two stacked bar charts of the most extreme embedding variations.

    Parameters
    ----------
    df_keywords : pandas.DataFrame
        Frame with 'text', 'year' and the 'var cos axe {axis}' columns.
    axis_1 : int
        Axis plotted in the top panel.
    axis_2 : int, optional
        Axis plotted in the bottom panel; falls back to ``axis_1`` when omitted.
    variation_1 : str, default 'up'
        'down' shows the largest decreases in the top panel; any other value
        shows the largest increases.
    variation_2 : str, default 'down'
        'up' shows the largest increases in the bottom panel; any other value
        shows the largest decreases.
    number : int, default 20
        Number of words per panel.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    var_up_1, var_down_1 = get_top_variations(df_keywords, axis_1, number)

    if axis_2:
        var_up_2, var_down_2 = get_top_variations(df_keywords, axis_2, number)
    else:
        var_up_2, var_down_2 = var_up_1, var_down_1
        axis_2 = axis_1

    # Pick the data of each panel and label it after what it really shows.
    if variation_1 == 'down':
        top_rows, top_name = var_down_1, 'Decreasing'
    else:
        top_rows, top_name = var_up_1, 'Increasing'
    if variation_2 == 'up':
        bottom_rows, bottom_name = var_up_2, 'Increasing'
    else:
        bottom_rows, bottom_name = var_down_2, 'Decreasing'

    # The title reports the year(s) present in the data instead of a fixed one.
    years = sorted(set(df_keywords['year'].dropna().tolist()))
    if not years:
        year_label = 'unknown year'
    elif len(years) == 1:
        year_label = str(years[0])
    else:
        year_label = f"{years[0]}-{years[-1]}"

    fig = make_subplots(rows=2, cols=1)

    # Top panel
    fig.add_trace(go.Bar(x=top_rows['text'], y=top_rows[f'var cos axe {axis_1}'], name=top_name), row=1, col=1)

    # Bottom panel
    fig.add_trace(go.Bar(x=bottom_rows['text'], y=bottom_rows[f'var cos axe {axis_2}'], name=bottom_name), row=2, col=1)

    fig.update_layout(title_text= f"Extreme embedding variation in {year_label} on axis {axis_1} and {axis_2}")
    fig.update_layout(autosize=False, width=1000, height=800)

    fig.show()

In [18]:
# Frame at index 6 (corpus label 2016) used for the plot below.
df_t = all_dfs[6]

In [25]:
# Top panel: largest increases on axis 1; bottom panel: largest decreases on axis 1.
vizualize_top_variations(df_t, 1, 1, 'up', 'down')