In [None]:
# In Python:
import os
os.environ["https_proxy"] = "http://proxy.fr.cfm.fr:6060"
#from nltk.tokenize import sent_tokenize
from datasets import Dataset
import numpy as np
import pandas as pd
import regex as re
import torch
from sentence_transformers import SentenceTransformer, util
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import json

# Parse the JSON configuration file into the `config_` dictionary.
with open("/mnt/live/user/yzhong/config_transformer.json", "r") as f:
    config_ = json.loads(f.read())

print(config_)

In [None]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Splits English contractions off the preceding word, removes parenthesised
    asides, pads ``,``, ``!``, ``(``, ``)`` and ``?`` with spaces, collapses
    whitespace runs and strips both ends.

    Args:
        string: Text to clean.
        tolower: When true (the default), lower-case the result.

    Returns:
        The cleaned string.
    """
    #string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Detach contractions so they become separate tokens ("it's" -> "it 's").
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    # Remove things in parentheses (previously they were replaced by " 'd",
    # which injected a spurious token into the text).
    string = re.sub(r"\(.*?\)", " ", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Pad punctuation with spaces. The replacements are plain characters: the
    # former " \( "-style replacements wrote a literal backslash into the text.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [None]:
# Record the length of every text so the dataset can be sorted by length before batching
def compute_length(batch, text='headline'):
    """Return ``{'length': [...]}`` holding ``len(item)`` for each item of ``batch[text]``."""
    lengths = list(map(len, batch[text]))
    return {'length': lengths}
def sentence_embeddings(batch, model, text='translation'):
    """Encode ``batch[text]`` with ``model.encode`` under ``torch.no_grad()``.

    Returns a dict with a single ``'embeddings'`` key holding whatever
    ``model.encode`` returned for the batch.
    """
    sentences = batch[text]
    with torch.no_grad():
        vectors = model.encode(sentences)
    return dict(embeddings=vectors)

In [None]:
# The model identifier and the cache directory both come from the JSON config.
MODEL_ID = config_['SENTENCE_TRANSFORMER']
model = SentenceTransformer(
    MODEL_ID,
    device='cuda',
    cache_folder=config_['model_dir'],
)
# Batch size handed to `Dataset.map` when computing the embeddings.
batch_size = 256

Read data

In [None]:
def sim_dataframe(similarity_matrix, df_e, df_c):
    """Attach the closest English headlines to every Chinese headline.

    Args:
        similarity_matrix: One list of hits per query; each hit is a mapping
            with ``'corpus_id'`` and ``'score'`` keys (presumably the output of
            ``util.semantic_search`` - confirm against the caller).
        df_e: English news frame with ``id``, ``headline`` and ``last_update``
            columns. ``corpus_id`` values are looked up in its ``id`` column.
        df_c: Chinese news frame with an ``id`` column.

    Returns:
        ``df_c`` merged on ``id`` with the columns ``Sim1..3`` (headline),
        ``date1..3`` (``last_update``) and ``cos1..3`` (score). Slots without a
        hit, or whose ``corpus_id`` is absent from ``df_e``, stay missing.

    Note:
        The ``id`` written for a query is its position in
        ``similarity_matrix``, so ``df_c['id']`` has to follow that numbering.
    """
    columns = ['id', 'Sim1', 'Sim2', 'Sim3', 'date1', 'date2', 'date3', 'cos1', 'cos2', 'cos3']
    top_k = 3

    # Build the id -> value lookups once instead of filtering df_e per query.
    english = df_e.drop_duplicates(subset='id').set_index('id')
    headline_by_id = english['headline'].to_dict()
    date_by_id = english['last_update'].to_dict()

    # Collect plain dicts and create the frame once: row-wise DataFrame.append
    # is quadratic and no longer exists in pandas 2.x.
    rows = []
    for query_id, hits in enumerate(similarity_matrix):
        new_row = {'id': query_id}
        # Slicing tolerates queries that returned fewer than `top_k` hits.
        for rank, hit in enumerate(hits[:top_k], start=1):
            corpus_id = hit['corpus_id']
            if corpus_id in headline_by_id:
                new_row[f'Sim{rank}'] = headline_by_id[corpus_id]
                new_row[f'date{rank}'] = date_by_id[corpus_id]
            new_row[f'cos{rank}'] = hit['score']
        rows.append(new_row)

    sim_df = pd.DataFrame(rows, columns=columns).astype({'id': 'int64'})
    # Merge and return only after *all* queries were processed (the merge and
    # return used to sit inside the loop, so only the first query survived).
    return pd.merge(df_c, sim_df, on='id')

In [None]:
def find_match(df1, df2):
    """Match every translated Chinese headline to its closest English headlines.

    Args:
        df1: Chinese news frame with ``id`` and ``translation`` columns.
        df2: English news frame with ``headline`` and ``last_update`` columns.

    Returns:
        The frame produced by ``sim_dataframe``: the cleaned Chinese frame
        merged with the top-3 English matches.

    Notes:
        * Neither input frame is modified; the function works on copies.
        * Rows of ``df2`` with a null in *any* column are dropped, and the
          remaining rows are numbered 0..n-1 in a new ``id`` column.
        * NOTE(review): ``sim_dataframe`` labels the i-th query with id ``i``
          while the query embeddings are ordered by ascending ``df1['id']``;
          this presumably relies on ``df1['id']`` being 0..n-1 - confirm.
    """
    # df1 is the Chinese frame, df2 the English one.
    english = df2.dropna().reset_index(drop=True)
    english = english.reset_index().rename(columns={'index': 'id'})

    chinese = df1.copy()
    chinese["translation"] = chinese["translation"].apply(clean_str)
    english["headline"] = english["headline"].apply(clean_str)

    emb1 = _embed_text_column(Dataset.from_dict(chinese), "translation")
    emb2 = _embed_text_column(Dataset.from_dict(english), "headline")

    similarity_matrix = util.semantic_search(emb1, emb2, top_k=3)
    return sim_dataframe(similarity_matrix, english, chinese)


def _embed_text_column(dataset, text):
    """Embed the ``text`` column of ``dataset``; rows of the result follow ascending ``id``."""
    unused = [name for name in dataset.column_names if name not in (text, "id")]
    slim = dataset.remove_columns(unused)
    # Sort longest-first before encoding so each map batch holds texts of similar length.
    slim = slim.map(lambda x: compute_length(x, text=text), batched=True).sort('length', reverse=True)
    # Sort back by `id` afterwards so embedding row order follows the ids again.
    slim = slim.map(
        lambda x: sentence_embeddings(x, model, text=text),
        batched=True,
        batch_size=batch_size,
    ).sort('id')
    return torch.tensor(slim['embeddings'])


In [None]:
import os
import glob
import pandas as pd

# Define the paths to the Chinese and English news data folders
chinese_folder_path = '/mnt/research-live/user/yzhong/Chinese news data'
english_folder_path = '/mnt/research-live/user/yzhong/English_news_data'

# Loop over the per-year folders of the Chinese news data
for year_folder in os.listdir(chinese_folder_path):
    print(year_folder)
    year_path = os.path.join(chinese_folder_path, year_folder)
    # Only directories are treated as year folders
    if not os.path.isdir(year_path):
        continue
    for file_name in os.listdir(year_path):
        # Only the monthly 'bloomberg_news_chinese' parquet files are processed
        if not (file_name.endswith('.parquet') and 'bloomberg_news_chinese' in file_name):
            continue
        # The year comes from the folder, the month from the last '_' field of the file name
        year = year_folder
        month = file_name.split('_')[-1].split('.')[0]
        month_str = f"{int(month):02d}"
        # English news for the same month live under <year>/<zero-padded month>
        english_folder_year_month = os.path.join(english_folder_path, year, month_str)
        print(english_folder_year_month)
        # glob returns an empty list when the folder is missing or holds no parquet file
        file_list = glob.glob(os.path.join(english_folder_year_month, '*.parquet'))
        if not file_list:
            # Skip the month: matching against an undefined df_e, or against the
            # df_e left over from the previous month, would be wrong.
            print(f"No English news found in {english_folder_year_month}; skipping {file_name}")
            continue
        df_c = pd.read_parquet(os.path.join(year_path, file_name))
        # Concatenate all the English parquet files of the month into one dataframe
        df_e = pd.concat([pd.read_parquet(f) for f in file_list], ignore_index=True)
        merge_df = find_match(df_c, df_e)
        # Write the matches under a separate name so the loop variable is not clobbered
        output_path = f"/mnt/research-live/user/yzhong/Matching_news_data/bloomberg_news_chinese_matching_{year}_{month_str}.parquet"
        merge_df.to_parquet(output_path)



In [None]:
# Scratch cell: prints a fixed string (presumably a kernel liveness check - safe to delete).
print('hi')

In [None]:
import glob
import pandas as pd

# Manual single-month run (April 2011): read the Chinese file and every
# English parquet file of that month.
df_c = pd.read_parquet('/mnt/research-live/user/yzhong/Chinese news data/2011/bloomberg_news_chinese_2011_4.parquet')
file_list = glob.glob('/mnt/research-live/user/yzhong/English_news_data/2011/04/*.parquet')
dfs = [pd.read_parquet(file_path) for file_path in file_list]
df_e = pd.concat(dfs, axis=0)

In [None]:
# Normalise both text columns with clean_str.
# NOTE(review): this runs before the cell that drops null rows from df_e;
# clean_str hands each value to re.sub, so a non-string headline would raise
# here - confirm the column has no nulls or run the dropna first.
df_c.loc[:, "translation"] = df_c["translation"].apply(clean_str)
df_e.loc[:, "headline"] = df_e["headline"].apply(clean_str)

In [None]:
# Drop rows containing any null, renumber the rows from 0 and expose that
# numbering as an `id` column.
df_e = (
    df_e.dropna()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})
)

In [None]:
# Wrap both frames as `datasets.Dataset` objects and report their sizes.
ds_e = Dataset.from_dict(df_e)
ds_t = Dataset.from_dict(df_c)
ne, nt = len(ds_e['headline']), len(ds_t['translation'])
print (f'English news: {ne}')
print (f'Chinese news: {nt}')

In [None]:
# Reduce each dataset to its text column plus `id`.
cols_to_remove = list(ds_t.column_names)
for kept in ("translation", "id"):
    cols_to_remove.remove(kept)
ds_t1 = ds_t.remove_columns(cols_to_remove)

cols_to_remove = list(ds_e.column_names)
for kept in ("headline", "id"):
    cols_to_remove.remove(kept)
ds_e1 = ds_e.remove_columns(cols_to_remove)
ds_e1

In [None]:
# For each dataset: add the text length, sort longest-first, embed in batches,
# then sort by `id` again.
ds_t1 = (
    ds_t1
    .map(lambda x: compute_length(x, text='translation'), batched=True)
    .sort('length', reverse=True)
    .map(lambda x: sentence_embeddings(x, model, text='translation'), batched=True, batch_size=batch_size)
    .sort('id')
)
ds_e1 = (
    ds_e1
    .map(lambda x: compute_length(x, text='headline'), batched=True)
    .sort('length', reverse=True)
    .map(lambda x: sentence_embeddings(x, model, text='headline'), batched=True, batch_size=batch_size)
    .sort('id')
)

In [None]:
# Despite their names, these two variables hold the `embeddings` columns.
originated_input_ids = ds_e1['embeddings']
translated_input_ids = ds_t1['embeddings']
emb1, emb2 = torch.tensor(translated_input_ids), torch.tensor(originated_input_ids)
# Chinese translations are the queries, English headlines the corpus; keep the top 3 hits.
similarity_matrix = util.semantic_search(emb1, emb2, top_k=3)

In [None]:
# Display the raw search result: one list of hits per query, each hit carrying
# the 'corpus_id' and 'score' keys read by the cells below.
similarity_matrix

In [None]:
# Build a dataframe holding the three closest English neighbours of each sentence.
sim_columns = ['id', 'Sim1', 'Sim2', 'Sim3', 'date1', 'date2', 'date3', 'cos1', 'cos2', 'cos3']
# id -> headline / last_update lookups, built once instead of filtering df_e per query
english_by_id = df_e.drop_duplicates(subset='id').set_index('id')
headline_by_id = english_by_id['headline'].to_dict()
date_by_id = english_by_id['last_update'].to_dict()

# Collect plain dicts and create the frame once: row-wise DataFrame.append is
# quadratic and no longer exists in pandas 2.x.
sim_rows = []
for i, hits in enumerate(similarity_matrix):
    # The query's position in `similarity_matrix` is used as its id
    new_row = {'id': i}
    # Slicing tolerates queries that returned fewer than three hits
    for j, hit in enumerate(hits[:3], start=1):
        corpus_id = hit['corpus_id']
        if corpus_id in headline_by_id:
            new_row[f'Sim{j}'] = headline_by_id[corpus_id]
            new_row[f'date{j}'] = date_by_id[corpus_id]
        new_row[f'cos{j}'] = hit['score']
    sim_rows.append(new_row)

sim_df = pd.DataFrame(sim_rows, columns=sim_columns)
    

In [None]:
# Display the neighbour table.
sim_df 

In [None]:
# Join the neighbour table onto the Chinese frame via the shared `id` column.
merged_df = df_c.merge(sim_df, on='id')

In [None]:
# Create an empty dataframe to store the three closest neighbours of each sentence.
# Running this rebinds `sim_df`, so any table built earlier under that name is discarded.
sim_df = pd.DataFrame(columns=['id', 'Sim1', 'Sim2', 'Sim3', 'date1', 'date2', 'date3','cos1', 'cos2', 'cos3'])

In [None]:

# NOTE(review): `top_three_indices` and `cosine_similarities` are not defined
# in any cell visible in this notebook, so this cell fails on a fresh kernel.
# It looks like an earlier variant of the neighbour table based on a dense
# similarity matrix - confirm whether it can be deleted.

# Collect one record per row of `top_three_indices` and build the frame once:
# row-wise DataFrame.append is quadratic and no longer exists in pandas 2.x.
sim_records = []
for i, top_three in enumerate(top_three_indices):
    # Get the corresponding rows from `df_e`, ordered like `top_three`
    matches = df_e.loc[df_e['id'].isin(top_three)]
    matches = matches.reindex(index=top_three).reset_index(drop=True)

    # The position of the current row is used as its id
    new_row = {'id': i}

    # Add the headline, date and similarity of every match to the record
    for j, match_row in matches.iterrows():
        new_row[f'Sim{j+1}'] = match_row['headline']
        new_row[f'date{j+1}'] = match_row['last_update']
        new_row[f'cos{j+1}'] = cosine_similarities[i][top_three[j]]

    sim_records.append(new_row)

# Reuse the column layout of the existing `sim_df`; its previous rows are replaced.
sim_df = pd.DataFrame(sim_records, columns=sim_df.columns)

In [None]:
# Number of hits stored for the first query.
len(similarity_matrix[0])

In [None]:
# Save the search result as a .npy file in the current working directory.
# NOTE(review): `similarity_matrix` is indexed above as lists of dicts, so NumPy
# presumably stores it as a pickled object array (loading would then need
# allow_pickle=True) - confirm. The file name says "cosine_similarities" but
# the content is the top-k hit list.
np.save("cosine_similarities.npy", similarity_matrix)
# Save the same object again as a compressed .npz file
np.savez_compressed("cosine_similarities.npz", similarity_matrix )

In [None]:
import os
import pandas as pd

# Make sure the root folder for the monthly files exists
os.makedirs("/mnt/live/user/yzhong/Chinese news data", exist_ok=True)

# Load the full Chinese news table
# NOTE(review): files are written under /mnt/live/..., while the matching loop
# reads from /mnt/research-live/... - confirm both point to the same storage.
df = pd.read_parquet("/mnt/live/user/yzhong/bloomberg_news_chinese.parquet")

# Derive helper `year` / `month` columns from the 'date' column
date_index = pd.DatetimeIndex(df['date'])
df['year'], df['month'] = date_index.year, date_index.month

# One folder per year, one parquet file per (year, month)
for year in df['year'].unique():
    year_folder_path = os.path.join("/mnt/live/user/yzhong/Chinese news data", str(year))
    os.makedirs(year_folder_path, exist_ok=True)
    in_year = df['year'] == year
    for month in df.loc[in_year, 'month'].unique():
        # The helper columns are not written to disk
        month_df = df.loc[in_year & (df['month'] == month)].drop(columns=['year', 'month'])
        filepath = os.path.join(year_folder_path, f"bloomberg_news_chinese_{year}_{month}.parquet")
        month_df.to_parquet(filepath)


In [None]:
# (rows, columns) of the Chinese news frame.
df_c.shape

In [None]:
# (rows, columns) of the English news frame.
df_e.shape