<a href="https://colab.research.google.com/github/ayusinelnik/narratives-at-conflict/blob/main/06_Keywords_Pairs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up

In [None]:
import csv
import pandas as pd
import numpy as np
import re

from google.colab import drive
drive.mount('/content/drive/', force_remount = True)

Mounted at /content/drive/


# Dataset Read

# True multiling pairs

In [None]:
# this excel is the manually checked/cleaned version

test = pd.read_excel('/content/drive/MyDrive/Research_Thesis_Bocconi_2023/multiling_true_pairs.xlsx')

In [None]:
# this is the same as above
test = pd.read_json('/content/drive/MyDrive/Research_Thesis_Bocconi_2023/train_test/data_test_4_lang.json', lines=True, orient = 'records')

# Remove Test from Train

In [None]:
# reduces data to train set (= there are no true pairs here)
# NOTE(review): `data` is not created anywhere above this cell in the visible
# notebook, so it must already exist in the kernel. TODO: add the load step.
# Rows whose 'original_article_link' also appears in the test set are dropped.
values_to_remove = test['original_article_link'].tolist()
data = data[~data['original_article_link'].isin(values_to_remove)]


In [None]:
data['lang_auto'].value_counts()

ru    7786
es     659
it     460
fr     406
Name: lang_auto, dtype: int64

## Truncate Texts

In [None]:
from math import ceil

def truncate_text(text, max_words=225):
    """Return the first ``max_words`` whitespace-separated words of ``text``.

    Words are re-joined with single spaces, so runs of whitespace are collapsed.

    Non-string values (e.g. NaN for a missing article body) are returned as
    their ``str()`` form instead of raising ``AttributeError``; this matches
    the second ``truncate_text`` definition further down the notebook.
    """
    if not isinstance(text, str):
        return str(text)
    words = text.split()
    return ' '.join(words[:max_words])

# Add a 'truncated_text' column to the test set: each article cut to its first 225 words
test['truncated_text'] = test['text'].apply(lambda x: truncate_text(x, max_words=225))

# Generate Multilingual Pairs for Test Set

In [None]:
from itertools import combinations

# Build every unordered pair of articles that share the same 'original_index'.
# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and made the loop quadratic.
pair_records = []

# sort=False keeps groups in order of first appearance, like iterating .unique().
for index, subset_df in test.groupby('original_index', sort=False):
    # Plain (text, lang) tuples avoid label-based .loc lookups, which return
    # DataFrames instead of rows when the index contains duplicate labels.
    articles = subset_df[['truncated_text', 'lang_auto']].itertuples(index=False, name=None)

    for (text_1, lang_1), (text_2, lang_2) in combinations(articles, 2):
        pair_records.append({'original_index': index,
                             'text_1': text_1, 'lang_1': lang_1,
                             'text_2': text_2, 'lang_2': lang_2})

all_pairs_df = pd.DataFrame(pair_records,
                            columns=['original_index', 'text_1', 'lang_1', 'text_2', 'lang_2'])

# Filter out pairs where the languages are the same
test_pairs = all_pairs_df[all_pairs_df['lang_1'] != all_pairs_df['lang_2']]


In [None]:
test_pairs

# Keywords and Embeddings Imports

In [None]:
!pip install yake
import yake

In [None]:
!pip install "tensorflow-text==2.11.*"
!pip install bokeh
!pip install simpleneighbors[annoy]
!pip install tqdm

In [None]:
!pip install --upgrade tensorflow-text

In [None]:
#@title Setup common imports and functions

import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise

from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange

def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         x_axis_label='Embeddings 1', y_axis_label='Embeddings 2',
                         width=1000, height=600,
                         xaxis_font_size='12pt', yaxis_font_size='12pt'):
  """Show a Bokeh heatmap of pairwise similarity between two embedding sets.

  Args:
    embeddings_1: embeddings plotted along the x axis, one per entry of labels_1.
    embeddings_2: embeddings plotted along the y axis, one per entry of labels_2.
    labels_1: tick labels for embeddings_1 (same length as embeddings_1).
    labels_2: tick labels for embeddings_2 (same length as embeddings_2).
    plot_title: title drawn above the heatmap.
    x_axis_label, y_axis_label: axis captions.
    width, height: figure size passed to bokeh.plotting.figure.
    xaxis_font_size, yaxis_font_size: tick label font sizes.

  Returns:
    The Bokeh figure (it is also displayed inline via bokeh.io.show).

  Raises:
    AssertionError: if an embedding set and its labels differ in length.
  """
  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
  cosine = sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
                                                      embeddings_2)
  # Floating-point error can push a cosine marginally outside [-1, 1], where
  # arccos returns NaN (and would blank the colour scale); clip first.
  sim = 1 - np.arccos(np.clip(cosine, -1.0, 1.0))/np.pi

  # Long-format table: one row per (label_1, label_2) cell of the heatmap.
  cells = [(labels_1[i], labels_2[j], sim[i][j])
           for i in range(len(embeddings_1))
           for j in range(len(embeddings_2))]
  df = pd.DataFrame(cells,
                    columns=['embeddings_1', 'embeddings_2', 'sim'])

  # Colour scale spans the observed similarity range of this plot only.
  mapper = bokeh.models.LinearColorMapper(
      palette=[*reversed(bokeh.palettes.YlOrRd[9])], low=df.sim.min(),
      high=df.sim.max())

  p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                            x_axis_location="above",
                            y_range=[*reversed(labels_2)],
                            width=width, height=height,
                            tools="save", toolbar_location='below', tooltips=[
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim{0.2f}')], x_axis_label=x_axis_label, y_axis_label=y_axis_label)

  p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
         fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

  p.title.text_font_size = '12pt'
  p.title.align = 'center'
  p.title.text_font_style = 'bold'

  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  p.min_border_right = 300

  # NOTE(review): the bar is titled 'Cosine Similarity' but the plotted values
  # are the arccos-based similarity computed above; title left unchanged.
  color_bar = bokeh.models.ColorBar(color_mapper=mapper, location=(0, 0), title='Cosine Similarity')
  p.add_layout(color_bar, 'right')

  bokeh.io.output_notebook()
  bokeh.io.show(p)
  return p

In [None]:

# TF Hub handle of the model used for every keyword embedding in this notebook.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'

# NOTE(review): presumably the tensorflow_text import in the setup cell is what
# makes this module loadable (it is not used directly anywhere) — confirm.
model = hub.load(module_url)

def embed_text(input):
  """Run the loaded TF Hub model on `input` and return its output.

  Callers in this notebook pass a list of keyword strings and feed the result
  to cosine_similarity. The parameter name shadows the `input` builtin; it is
  left unchanged so keyword-argument callers keep working.
  """
  return model(input)

# Keywords Similarity: Produce Results on Hyperparameters

Long execution time (about 1h on CPU)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from itertools import product
import yake

# Hyperparameter grid: the search below evaluates every combination of these
# four lists (4 * 4 * 4 * 4 = 256 settings).
max_ngram_sizes = [1, 3, 5, 10]
deduplication_thresholds = [0, 0.2, 0.5, 0.9]
numOfKeywords_values = [5, 10, 15, 20]
# Keywords are kept only if their min-max normalised score exceeds this value.
normalized_score_thresholds = [0.2, 0.5, 0.7, 0.9]


# Read as globals by calculate_similarity: text1 is processed as language1 and
# text2 as language2, so the pair table must be ordered accordingly.
language1 = "es"
language2 = "it"

def _filtered_keywords(text, language, max_ngram_size, deduplication_threshold, numOfKeywords, normalized_score_threshold):
    """Extract YAKE keywords from ``text`` and keep those scoring above the threshold.

    YAKE scores are min-max scaled and inverted (``1 - scaled``) before being
    compared with ``normalized_score_threshold``. Returns a list of keyword
    strings, which is empty when YAKE finds no keywords.
    """
    deduplication_algo = 'seqclst'

    kw_extractor = yake.KeywordExtractor(lan=language, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, top=numOfKeywords, n=max_ngram_size, features=None)
    keywords = kw_extractor.extract_keywords(text)

    # MinMaxScaler raises on an empty array, so bail out before scaling.
    if not keywords:
        return []

    scores = [[score] for _, score in keywords]
    normalized_scores = (1 - MinMaxScaler().fit_transform(scores)).flatten().tolist()

    return [keyword
            for (keyword, _), normalized_score in zip(keywords, normalized_scores)
            if normalized_score > normalized_score_threshold]


def calculate_similarity(text1, text2, max_ngram_size, deduplication_threshold, numOfKeywords, normalized_score_threshold):
    """Mean pairwise cosine similarity between the keyword embeddings of two texts.

    ``text1`` is processed with the module-level ``language1`` and ``text2``
    with ``language2``. Keywords are extracted with YAKE, filtered by their
    normalised score, embedded with ``embed_text`` and compared all-against-all.

    Returns:
        The mean of the keyword-by-keyword cosine similarity matrix, or NaN
        when either text yields no keywords after filtering. Previously this
        case raised from inside the scaler/encoder and aborted the whole
        (long-running) grid search.
    """
    keyword_list1 = _filtered_keywords(text1, language1, max_ngram_size, deduplication_threshold, numOfKeywords, normalized_score_threshold)
    keyword_list2 = _filtered_keywords(text2, language2, max_ngram_size, deduplication_threshold, numOfKeywords, normalized_score_threshold)

    if not keyword_list1 or not keyword_list2:
        return float('nan')

    result1 = embed_text(keyword_list1)
    result2 = embed_text(keyword_list2)

    # Calculate cosine similarity between the two language results
    similarity_matrix = cosine_similarity(result1, result2)
    overall_similarity = np.mean(similarity_matrix)

    return overall_similarity

# Evaluate every hyperparameter combination on all es/it test pairs and record
# the mean pair similarity obtained with each setting.
grid_records = []

hyperparameter_grid = product(max_ngram_sizes, deduplication_thresholds, numOfKeywords_values, normalized_score_thresholds)

for max_ngram_size, deduplication_threshold, numOfKeywords, normalized_score_threshold in hyperparameter_grid:
    # One similarity score per text pair under the current setting.
    pair_scores = [
        calculate_similarity(row['text_1'], row['text_2'], max_ngram_size, deduplication_threshold, numOfKeywords, normalized_score_threshold)
        for _, row in test_pairs_es.iterrows()
    ]

    record = {}
    record['max_ngram_size'] = max_ngram_size
    record['deduplication_threshold'] = deduplication_threshold
    record['numOfKeywords'] = numOfKeywords
    record['normalized_score_threshold'] = normalized_score_threshold
    record['mean_similarity'] = np.mean(pair_scores)
    grid_records.append(record)

# Best-scoring settings first.
pair_results = pd.DataFrame(grid_records).sort_values(by='mean_similarity', ascending=False)


## Find the best common param combination and each lang pair best params

In [None]:
import pandas as pd
import glob


path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/'
filename_pattern = 'param_results_alltxt_*.json'
# glob returns files in arbitrary order; sort so the report is reproducible.
file_paths = sorted(glob.glob(path + filename_pattern))

if not file_paths:
    raise FileNotFoundError(f"No files matching {filename_pattern} found in {path}")

columns_to_exclude = ['mean_similarity'] #add normalized_score_threshold if needed

# One results table per language pair, each tagged with an 'identifier' string
# built from its hyperparameter columns so settings can be matched across files.
dataframes = []
for file_path in file_paths:
    df = pd.read_json(file_path, orient='records', lines=True).reset_index(drop=True)
    df['identifier'] = df.drop(columns=columns_to_exclude).astype(str).agg('-'.join, axis=1)
    dataframes.append(df)


# Hyperparameter settings present in EVERY file. The previous expression only
# looked at the first identifier of each additional file, so almost every
# shared setting was silently left out of the comparison.
common_identifiers = set.intersection(*(set(df['identifier']) for df in dataframes))


# For each shared setting, rank by its WORST mean similarity across files
# (maximin): a setting is only as good as its weakest language pair. The list
# keeps its historical name, but the stored value is that per-setting minimum.
best_rows, highest_mean_similarities = [], []

for identifier in sorted(common_identifiers):
    rows = [df[df['identifier'] == identifier] for df in dataframes]

    per_file_similarities = [row['mean_similarity'].values[0] for row in rows]
    worst_case_similarity = min(per_file_similarities)

    if worst_case_similarity > 0:
        highest_mean_similarities.append(worst_case_similarity)
        best_rows.append((identifier, rows))
    else:
        print(f"Skipping identifier {identifier} as its lowest mean similarity across files is non-positive.")
        print(f"Mean Similarities: {per_file_similarities}")


sorted_indices = sorted(range(len(highest_mean_similarities)), key=lambda k: highest_mean_similarities[k], reverse=True)


# Report the five best settings together with their score in each file.
for i in range(min(5, len(sorted_indices))):
    index = sorted_indices[i]
    identifier, rows = best_rows[index]

    print(f"\nTop {i + 1} Row Identifier: {identifier}")
    print(f"Lowest Mean Similarity Across Files: {highest_mean_similarities[index]}")


    for j, row in enumerate(rows):
        file_name = file_paths[j].split('/')[-1]  # Extract file name from the path
        print(f"Similarity in {file_name}: {row['mean_similarity'].values[0]}")
        print(f"Index in {file_name}: {row.index[0]}")


# Assign Keywords by Common Best Hyperparameters

In [None]:
from math import ceil
import pandas as pd

def truncate_text(text, max_words=225):
    """Cut ``text`` down to its first ``max_words`` whitespace-separated words.

    The kept words are joined with single spaces. Anything that is not a
    string is returned as its ``str()`` representation, untouched.
    """
    if not isinstance(text, str):
        return str(text)
    return ' '.join(text.split()[:max_words])

data['truncated_text'] = data['text'].apply(lambda x: truncate_text(x, max_words=225))

In [None]:
# Drop rows with no article text and renumber the index. For such rows the
# 'truncated_text' computed in the previous cell holds only the str() form of
# the missing value (see truncate_text), so they carry no usable text.
data = data.dropna(subset=['text']).reset_index(drop=True)

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import yake


deduplication_algo = 'seqclst'
max_ngram_size = 1
deduplication_threshold = 0.5
numOfKeywords = 15


def extract_keywords_and_normalize(text, language):
    """Map each YAKE keyword of ``text`` to an inverted, min-max scaled score.

    The extractor is configured from the module-level settings
    (``deduplication_threshold``, ``deduplication_algo``, ``numOfKeywords``,
    ``max_ngram_size``). Raw YAKE scores are min-max scaled and replaced by
    ``1 - scaled``. Returns ``{keyword: normalized_score}``, or an empty dict
    when no keywords are found.
    """
    extractor = yake.KeywordExtractor(lan=language, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, top=numOfKeywords, n=max_ngram_size, features=None)
    scored_keywords = extractor.extract_keywords(text)

    if not scored_keywords:
        return {}

    # Column vector of raw scores, as the scaler expects 2-D input.
    raw_scores = np.array([score for _, score in scored_keywords]).reshape(-1, 1)
    inverted_scores = (1 - MinMaxScaler().fit_transform(raw_scores)).flatten().tolist()

    return {
        keyword: inverted_score
        for (keyword, _), inverted_score in zip(scored_keywords, inverted_scores)
    }

data['keywords_auto'] = data.apply(lambda row: extract_keywords_and_normalize(row['truncated_text'], row['lang_auto']), axis=1)


In [None]:
# Remove articles for which no keywords were extracted, then renumber the rows.
empty_list_mask = data['keywords_auto'].apply(len) == 0
data = data.loc[~empty_list_mask].reset_index(drop=True)

Repeat for rows with fewer than 5 keywords

In [None]:
# Remove articles with fewer than 5 extracted keywords, then renumber the rows.
empty_list_mask = data['keywords_auto'].apply(len) < 5
data = data.loc[~empty_list_mask].reset_index(drop=True)

Filters out Stopwords in Keywords

In [None]:
import pandas as pd
import os

stoplist_files = {
    'ru': '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/stop_words/stopwords_ru.txt',
    'es': '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/stop_words/stopwords_es.txt',
    #'en': '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/stop_words/stopwords_en.txt',
    #'de': '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/stop_words/stopwords_de.txt',
    'it': '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/stop_words/stopwords_it.txt',
    'fr': '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/stop_words/stopwords_fr.txt'
}

# Read each language's stop-word file into a set: one entry per line, with
# surrounding whitespace stripped.
stoplists = {}
for lang, stoplist_file in stoplist_files.items():
    with open(stoplist_file, mode='r', encoding='utf-8') as handle:
        stoplists[lang] = set(map(str.strip, handle))


def remove_stopwords(keywords_dict, language):
    """Return a copy of ``keywords_dict`` without stop-word keywords.

    A keyword is dropped when its lower-cased form is in
    ``stoplists[language]``. Anything that is not a dict yields an empty dict.
    """
    if isinstance(keywords_dict, dict):
        return {
            keyword: score
            for keyword, score in keywords_dict.items()
            if keyword.lower() not in stoplists[language]
        }
    return {}

data['keywords_auto'] = data.apply(lambda row: remove_stopwords(row['keywords_auto'], row['lang_auto']), axis=1)

# Filter Keywords

Keep only the 5 highest-scoring keywords in each row

In [None]:
def keep_top_keys(keywords_dict, top_n=5):
    """Keep the ``top_n`` highest-scoring keywords of a ``{keyword: score}`` dict.

    Entries are ranked by score, highest first, rather than taken in insertion
    order, so the result is correct even when the dict is not already sorted.
    Ties keep their original relative order, so dicts that are already sorted
    by descending score give the same result as before.

    Args:
        keywords_dict: mapping of keyword to normalised score.
        top_n: how many keywords to keep (default 5).

    Returns:
        A new dict with at most ``top_n`` entries, ordered by descending score;
        an empty dict when ``keywords_dict`` is not a dict.
    """
    if not isinstance(keywords_dict, dict):
        return {}

    # sorted() is stable, also with reverse=True, so equal scores stay in order.
    ranked = sorted(keywords_dict.items(), key=lambda item: item[1], reverse=True)

    return dict(ranked[:top_n])

data['keywords_auto'] = data['keywords_auto'].apply(keep_top_keys)

# Text Pairs by New Keywords

Transform the dicts into lists, remove the scores

## Find Pairs

In [None]:
# Quick fix if the dates appear as long integers (Unix timestamps in milliseconds)

import datetime

def convert_timestamp_to_date(timestamp):
    """Convert a Unix timestamp in milliseconds to a 'DD/MM/YYYY' string (UTC).

    Uses a timezone-aware conversion; ``datetime.utcfromtimestamp`` is
    deprecated since Python 3.12. The formatted output is unchanged.
    """
    moment = datetime.datetime.fromtimestamp(timestamp / 1000, tz=datetime.timezone.utc)
    return moment.strftime('%d/%m/%Y')

data['Date_Of_Publication'] = data['Date_Of_Publication'].apply(convert_timestamp_to_date)


In [None]:
# Quick manual fix for dates == 0. The keywords and other html columns are empty too, should come back and check/fix

# NOTE(review): `row_with_numeric_zero` is not used again in the visible
# notebook; it only captures the affected rows for manual inspection.
row_with_numeric_zero = data[data['Date_Of_Publication'] == 0]
# Cast to string so the zero placeholder can be matched as '0' and replaced by
# a hard-coded date in the DD/MM/YYYY format parsed in the next cell.
data['Date_Of_Publication'] = data['Date_Of_Publication'].astype(str)
data = data.replace({'Date_Of_Publication': {'0': '10/07/2020'}})


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import datetime

data['Date_Of_Publication'] = pd.to_datetime(data['Date_Of_Publication'], format='%d/%m/%Y')


time_window = datetime.timedelta(weeks=4) # +- 4 weeks

fr_articles = data[data['lang_auto'] == 'fr'] # Check the language ###################
ru_articles = data[data['lang_auto'] == 'ru']

# Russian keyword embeddings are reused across French articles instead of being
# recomputed for every candidate. Keyed by row label, so this relies on
# `data` having a unique index (it is reset after the keyword filtering above).
ru_embedding_cache = {}

# Matches are collected in a list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
matched_pairs = []

# the code loops through the french articles
for fr_index, fr_row in fr_articles.iterrows():
    fr_date = fr_row['Date_Of_Publication']
    fr_keywords = fr_row['keywords_auto']

    # list() yields the keyword strings whether keywords are stored as a
    # {keyword: score} dict or already as a list; the encoder needs a list.
    fr_keyword_list = list(fr_keywords)
    if not fr_keyword_list:
        continue  # nothing to embed, so no match can be scored

    # check russian matches
    ru_candidates = ru_articles[(ru_articles['Date_Of_Publication'] >= fr_date - time_window) & (ru_articles['Date_Of_Publication'] <= fr_date + time_window)]

    if ru_candidates.empty:
        continue

    # Loop-invariant: embed the French keywords once per French article.
    fr_result = embed_text(fr_keyword_list) # The algorithm from MUSE

    best_similarity = -1
    best_ru_row = None

    for ru_index, ru_row in ru_candidates.iterrows():
        if ru_index not in ru_embedding_cache:
            ru_keyword_list = list(ru_row['keywords_auto'])
            ru_embedding_cache[ru_index] = embed_text(ru_keyword_list) if ru_keyword_list else None

        ru_result = ru_embedding_cache[ru_index]
        if ru_result is None:
            continue  # candidate has no keywords left to compare

        similarity_matrix = cosine_similarity(fr_result, ru_result)
        overall_similarity = np.mean(similarity_matrix)

        if overall_similarity > best_similarity: # Update best similarity
            best_similarity = overall_similarity
            best_ru_row = ru_row

    if best_ru_row is not None:
        matched_pairs.append({
            'ru_text': best_ru_row['text'],
            'fr_text': fr_row['text'],
            'ru_keywords': best_ru_row['keywords_auto'],
            'fr_keywords': fr_keywords,
            'similarity': best_similarity
        })

result_df = pd.DataFrame(matched_pairs, columns=['ru_text', 'fr_text', 'ru_keywords', 'fr_keywords', 'similarity'])


# Visualize one Example of Keyword Matching

In [None]:

from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


deduplication_algo = 'seqclst'
max_ngram_size = 1
deduplication_threshold = 0.9
numOfKeywords = 15


def _scored_keywords(text, language):
    """Extract YAKE keywords from ``text`` and attach inverted min-max scores.

    Returns ``(keywords, normalized, filtered)`` where ``keywords`` is the raw
    YAKE output, ``normalized`` is a list of
    ``(keyword, score, normalized_score)`` tuples and ``filtered`` keeps only
    the tuples whose normalized score is above 0.0.
    """
    kw_extractor = yake.KeywordExtractor(lan=language, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, top=numOfKeywords, n=max_ngram_size, features=None)
    keywords = kw_extractor.extract_keywords(text)

    scores = [[score] for _, score in keywords]
    normalized_scores = (1 - MinMaxScaler().fit_transform(scores)).flatten().tolist()

    normalized = [(keyword, score, normalized_score)
                  for (keyword, score), normalized_score in zip(keywords, normalized_scores)]
    filtered = [item for item in normalized if item[2] > 0.0]
    return keywords, normalized, filtered


# NOTE(review): the columns 'fr_truncated_text' / 'es_truncated_text' are not
# created anywhere in the visible notebook — confirm which frame `data` is here.
# The variable names below are historical: text_sp / spanish_keywords /
# sp_result hold the FRENCH side, text_ru / russian_keywords / ru_result the
# SPANISH side. They are kept because the plotting cell refers to them.
language = "fr"
text_sp = data['fr_truncated_text'][57]
keywords_1, normalized_keyword_scores_1, filtered_normalized_keyword_scores_1 = _scored_keywords(text_sp, language)

language = "es"
text_ru = data['es_truncated_text'][57]
keywords_2, normalized_keyword_scores_2, filtered_normalized_keyword_scores_2 = _scored_keywords(text_ru, language)

# Compare the first five keywords kept on each side.
spanish_keywords = [item[0] for item in filtered_normalized_keyword_scores_1[:5]]
russian_keywords = [item[0] for item in filtered_normalized_keyword_scores_2[:5]]

sp_result = embed_text(spanish_keywords)
ru_result = embed_text(russian_keywords)

similarity_matrix = cosine_similarity(sp_result, ru_result)
overall_similarity = np.mean(similarity_matrix)

#print(f"Overall Similarity: {overall_similarity}")

In [None]:
# NOTE: despite their names, sp_result / spanish_keywords hold the French
# keywords and ru_result / russian_keywords the Spanish ones (see the
# `language` settings in the cell above), hence the French (x) and Spanish (y)
# axis labels.
figure = visualize_similarity(sp_result, ru_result, spanish_keywords, russian_keywords, 'French to Spanish Keyword Similarity Matrix', x_axis_label='French Keywords', y_axis_label='Spanish Keywords', width=800, height=400,
                         xaxis_font_size='10pt', yaxis_font_size='10pt')