## Functions and libraries

In [1]:
# en python:
import os
os.environ["https_proxy"] = "http://proxy.fr.cfm.fr:6060"
import sys
# adding src to the system path
sys.path.insert(0, '/home/yzhong/notebooks/getting_started')
import yaml
from src.extract_chinese_data import get_bloomberg_chinese_news, get_bloomberg_english_news
from datasets import Dataset
import numpy as np
import pandas as pd
import regex as re
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt
import faiss
from laserembeddings import Laser

Picked up JAVA_TOOL_OPTIONS: -Djava.io.tmpdir=/opt/tmp/tmp
Picked up JAVA_TOOL_OPTIONS: -Djava.io.tmpdir=/opt/tmp/tmp
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/27 07:58:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/27 07:58:41 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).




In [2]:
# Root directory of the author's datasets on the research share.
# NOTE(review): the cells visible in this notebook hard-code their paths instead of
# building them from this constant — confirm whether it is still used elsewhere.
DATA_PATH = "/mnt/research-live/user/yzhong/" 

In [2]:
# Strip ASCII spaces
def remove_space(text):
    """Return ``text`` with every ASCII space character (U+0020) deleted.

    Other whitespace (tabs, newlines, full-width spaces) is left untouched.
    """
    return "".join(text.split(' '))
# Full-width to half-width
def full_to_half(sentence):
    """Convert full-width characters in ``sentence`` to their half-width form.

    The ideographic space U+3000 becomes an ASCII space, and code points
    U+FF01..U+FF5E are shifted down by 0xFEE0 onto ASCII '!'..'~'.
    Every other character is copied unchanged.
    """
    converted = []
    for char in sentence:
        code_point = ord(char)
        if code_point == 0x3000:
            # Full-width space has no fixed offset; map it directly.
            code_point = 0x20
        elif 0xFF01 <= code_point <= 0xFF5E:
            # Remaining full-width forms sit at a constant offset from ASCII.
            code_point -= 0xFEE0
        converted.append(chr(code_point))
    return "".join(converted)
# Remove unwanted symbols and parenthesised text
def remove_special_chars(text):
    """Delete a fixed set of symbols, then delete ``( ... )`` groups.

    The symbols ``& * + - / < = > ? @ ^ _ | ~`` are removed first. Afterwards
    every shortest span from ``(`` to the next ``)`` on the same line is
    removed. Only half-width (ASCII) parentheses are recognised.
    """
    without_symbols = re.sub(r'[&*+\-\/<=>?@\^_|~]', '', text)
    return re.sub(r'\(.*?\)', '', without_symbols)
def add_chinese_period(string):
    """Append the Chinese full stop '。' to ``string``.

    The original author notes this is done to stabilise the translation.
    """
    return string + '。'
# Tokenize a sentence
def clean_str_english(string, tolower=True):
    """Clean and loosely tokenize an English sentence.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. split common contractions off their host word ("it's" -> "it 's");
      2. drop parenthesised asides such as "(TSLA)";
      3. delete the symbols ``& * + - / < = > ? @ ^ _ | ~``;
      4. pad commas, exclamation marks and any leftover parentheses with spaces;
      5. collapse runs of whitespace, optionally lower-case, and strip.

    Args:
        string: The sentence to clean.
        tolower: When True (default) the result is lower-cased.

    Returns:
        The cleaned sentence.
    """
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    # Remove things in parentheses. Replace with a space (collapsed below) so the
    # neighbouring words are not glued together. The previous code substituted
    # " 'd" here, which injected a spurious token into every such headline.
    string = re.sub(r'\(.*?\)', " ", string)
    string = re.sub(r'[&*+\-\/<=>?@\^_|~]', '', string)

    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # Unbalanced parentheses survive the removal above; pad them as separate
    # tokens without the literal backslash the old replacement strings produced.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    # No rule for '?' is needed: it is already deleted by the symbol pass above.
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [3]:
def cosinus_similarity(emb1, emb2):
    """Pairwise cosine similarity between the rows of two embedding matrices.

    Both inputs are converted to tensors, L2-normalised with ``F.normalize``
    (default arguments) and multiplied, so entry ``[i, j]`` is the cosine
    between row ``i`` of ``emb1`` and row ``j`` of ``emb2``.

    Returns:
        A NumPy array with one row per ``emb1`` row and one column per ``emb2`` row.
    """
    left_unit = F.normalize(torch.tensor(emb1))
    right_unit = F.normalize(torch.tensor(emb2))
    similarity = torch.matmul(left_unit, right_unit.t())
    return similarity.numpy()

In [4]:
def cosine_similarity(a, b=None):
    """Compute the cosine similarity between the rows of two arrays.

    If ``b`` is None, the similarity of ``a`` with itself is returned.
    The inputs are never modified; the previous implementation normalised
    them in place, silently rescaling the caller's data (and raising on
    integer arrays).

    Args:
        a (np.array): dimension (n_a, p)
        b (np.array): dimension (n_b, p), optional

    Returns:
        np.array of dimension (n_a, n_b) (or (n_a, n_a) when ``b`` is None).
    """
    a = np.asarray(a, dtype=float)
    # Row norms computed directly; no need to build the full (n, n) Gram matrix.
    a_unit = a / np.linalg.norm(a, axis=1, keepdims=True)
    if b is None:
        b_unit = a_unit
    else:
        b = np.asarray(b, dtype=float)
        b_unit = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a_unit @ b_unit.T

In [5]:
import json
# Read the JSON config file into the module-level dictionary `config_`.
# Later cells look up 'TRANSLATION_MODEL', 'TRANSLATION_MODEL_ADVANCE',
# 'SENTENCE_TRANSFORMER' and 'model_dir' in it.
with open("/home/yzhong/notebooks/getting_started/config_transformer.json", "r") as f:
    config_ = json.load(f)

# Echo the loaded settings so the run records which models were used.
print(config_)

{'TRANSLATION_MODEL': 'Helsinki-NLP/opus-mt-zh-en', 'SENTENCE_TRANSFORMER': 'all-MiniLM-L6-v2', 'TRANSLATION_MODEL_ADVANCE': 'alirezamsh/small100', 'model_dir': '/opt/tmp/yzhong/models/'}


## Prerequisites: read English and Chinese news from the database

This notebook requires the following folder structure:
- English_news_data/
  - 2011/
    - 01/
      - bloomberg_news_english_2011_01_01.parquet
      - bloomberg_news_english_2011_01_02.parquet
      - ...
    - 02/
      - bloomberg_news_english_2011_02_01.parquet
      - bloomberg_news_english_2011_02_02.parquet
      - ...
    - ...
      - ...
  - 2022/
    - 01/
      - bloomberg_news_english_2022_01_01.parquet
      - bloomberg_news_english_2022_01_02.parquet
      - ...
    - 02/
      - bloomberg_news_english_2022_02_01.parquet
      - bloomberg_news_english_2022_02_02.parquet
      - ...

Code for writing all the Bloomberg English news into this folder structure:

In [None]:
# Root directory of the per-day English news archive.
english_news_root = "/mnt/live/user/yzhong/English_news_data"

# Let pandas enumerate the calendar (month lengths, leap years) instead of
# re-deriving it by hand; the range covers every day of 2011 through 2022.
for current_day in pd.date_range("2011-01-01", "2022-12-31", freq="D"):
    year_str = f"{current_day.year}"
    month_str = f"{current_day.month:02d}"
    day_str = f"{current_day.day:02d}"

    # Create <root>/<YYYY>/<MM>/ (and any missing parents) on first use.
    month_dir = os.path.join(english_news_root, year_str, month_str)
    os.makedirs(month_dir, exist_ok=True)

    # Construct the date string in the format expected by the loader.
    date_str = f"{year_str}/{month_str}/{day_str}"

    # Load the data for the current date
    df_b = get_bloomberg_english_news(date=date_str)

    # Save the data to a parquet file in the appropriate directory
    filename = f'bloomberg_news_english_{year_str}_{month_str}_{day_str}.parquet'
    filepath = os.path.join(month_dir, filename)
    df_b.to_parquet(filepath)

For the Chinese news, it was possible to fetch and write everything at once, so an extra separation step (splitting into daily files) is done later.

In [None]:
# Fetch the Chinese news dataframe (with tickers) in a single call; the
# wildcard date "*/*/*" is passed instead of one specific day.
# NOTE(review): the original comment mentions setting the language to Simplified
# Chinese, but no language argument is passed here — presumably this is handled
# inside get_bloomberg_chinese_news; confirm.
df_b = get_bloomberg_chinese_news(date="*/*/*")
# Persist the full result as one parquet file.
df_b.to_parquet('/mnt/research-live/user/yzhong/bloomberg_news_chinese_trickers.parquet')

## Chinese news data

### Translation

Model: 

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu" # cuda:0 selects the first GPU; cuda:1 would select the second
# Translation checkpoint name comes from the config file.
MODEL_ID = config_['TRANSLATION_MODEL']
# Tokenizer and seq2seq model share the cache directory configured in config_['model_dir'].
tokenizer = AutoTokenizer.from_pretrained( MODEL_ID, use_fast=True, cache_dir=config_['model_dir'] )
model = AutoModelForSeq2SeqLM.from_pretrained( MODEL_ID, cache_dir=config_['model_dir'] ).to(device)

Prepare the dataset

In [None]:
df = pd.read_parquet("/mnt/research-live/user/yzhong/bloomberg_news_chinese.parquet")
# Build the Simplified-Chinese frame as one chain that returns new objects.
# The previous version assigned a column to, and ran inplace operations on,
# a slice of `df`, which triggers SettingWithCopyWarning and makes the cell
# depend on pandas' copy/view behaviour.
df_c = (
    df.loc[df.language == 'CHINESE_SIMP']
    .assign(headline_org=lambda frame: frame["headline"])  # keep the raw headline
    .dropna()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})  # positional id used later to restore order
)
# NOTE(review): Process_clean is not defined in the visible part of this
# notebook — confirm where it comes from.
df_c = Process_clean(df_c, lg='zh')

Run the model 

In [None]:
# Number of headlines translated per forward pass.
batch_size : int = 32
# Prepare dataset
ds = Dataset.from_dict(df_c)
# Sort by descending length before batching, then translate and restore the
# original order through the 'id' column.
# NOTE(review): `compute_length` and `translate` are called here as free functions,
# but the visible notebook only defines them as methods of
# TranslationSentenceEmbeddingPipeline — confirm where these come from.
ds = ds.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)
ds = ds.map(lambda x: translate(x, tokenizer, model, text='headline'), batched=True, batch_size=batch_size).sort('id')

Store the dataset with its translations

In [None]:
# Switch the dataset to pandas output so slicing returns a DataFrame.
ds.set_format("pandas")
df_c_t = ds[:]
# NOTE(review): this writes under /mnt/live/... with the name
# 'bloomberg_news_chinese_translation.parquet', while the embeddings section reads
# '/mnt/research-live/.../bloomberg_news_chinese_trickers_translation.parquet' — confirm
# that these are meant to be different files.
df_c_t.to_parquet('/mnt/live/user/yzhong/bloomberg_news_chinese_translation.parquet')

### Embeddings

Prepare the dataset

In [None]:
df_c = pd.read_parquet('/mnt/research-live/user/yzhong/bloomberg_news_chinese_trickers_translation.parquet')
# Keep the result of dropna: the previous code discarded it, so rows without a
# headline were never actually removed.
df_c = df_c.dropna(subset=['headline']).reset_index(drop=True)
# Positional id, used to restore the row order after length-sorted batching.
df_c = df_c.reset_index().rename(columns={'index': 'id'})
# NOTE(review): `clean_str` is not defined in the visible notebook (only
# `clean_str_english` is) — confirm which cleaner is intended.
df_c.loc[:, "translation"] = df_c.translation.apply(lambda x: clean_str(x))
ds_t = Dataset.from_dict(df_c)
# Keep only the columns needed for embedding.
cols_to_remove = [col for col in ds_t.column_names if col not in ("translation", "id")]
ds_t1 = ds_t.remove_columns(cols_to_remove)

Prepare the model 

In [None]:
# Sentence-embedding checkpoint name comes from the config file.
MODEL_ID = config_['SENTENCE_TRANSFORMER']
# NOTE(review): the device is hard-coded to 'cuda' here, unlike the translation cell
# which falls back to the CPU — this rebinds the global `model`.
model = SentenceTransformer(MODEL_ID,device='cuda',cache_folder=config_['model_dir'] )
# Use the map method to apply the mapping function to the dataset in batches
batch_size =256*4

Obtain the embeddings

In [None]:
# Sort by descending length before batching, embed the translations, then
# restore the original order through the 'id' column.
# NOTE(review): `sentence_embeddings(x, model, text=...)` does not match the signature
# of the `sentence_embeddings(ds, model, batch_size, lg)` function defined later in
# this notebook, and `compute_length` is only visible as a class method — confirm
# which helpers this cell expects.
ds_t1 = ds_t1.map(lambda x: compute_length(x, text='translation'), batched=True).sort('length', reverse=True)
ds_t1 = ds_t1.map(lambda x: sentence_embeddings(x ,model, text='translation'), batched=True, batch_size=batch_size).sort('id')

In [None]:
translated_input_ids = ds_t1['embeddings']
# Attach the embeddings before saving: ds_t1 is sorted by 'id', which is the
# positional index of df_c, so the rows line up. The previous cell saved df_c
# without this column even though the file is the "_emb" one (the English
# pipeline does store its 'embedding' column).
df_c['embedding'] = list(translated_input_ids)
df_c.to_parquet('/mnt/research-live/user/yzhong/bloomberg_news_chinese_trickers_translation_emb.parquet')

Write data into structured folders. 

This part of the code partitions the dataframe into one file per day, using the same folder structure as the English files.

In [None]:
# Root folder of the per-day Chinese archive.
chinese_news_root = "/mnt/live/user/yzhong/Chinese_news_data_t"
os.makedirs(chinese_news_root, exist_ok=True)

# Read the data
df = pd.read_parquet("/mnt/research-live/user/yzhong/bloomberg_news_chinese_trickers_translation_emb.parquet")

# Calendar day of each article, used only as the grouping key (no helper
# columns are added to df, so nothing has to be deleted before saving).
publication_day = pd.to_datetime(df['date']).dt.normalize()

# One pass over the data: groupby yields each day's rows directly instead of
# re-filtering the whole frame for every year/month/day combination.
for day, day_df in df.groupby(publication_day):
    year_str = f"{day.year}"
    # Zero-pad month and day so folders and files follow the same layout as
    # the English archive (e.g. 2011/01/..._2011_01_02.parquet).
    month_str = f"{day.month:02d}"
    day_str = f"{day.day:02d}"

    month_folder_path = os.path.join(chinese_news_root, year_str, month_str)
    os.makedirs(month_folder_path, exist_ok=True)

    filename = f"bloomberg_news_chinese_{year_str}_{month_str}_{day_str}.parquet"
    filepath = os.path.join(month_folder_path, filename)
    day_df.to_parquet(filepath)

## English news data

### Obtain the embeddings for all the English data

In [None]:

folder_path = "/mnt/live/user/yzhong/English_news_data"
all_files = []
# Walk <root>/<year>/<month>/*.parquet; anything that is not a directory (or
# not a parquet file at the last level) is skipped.
for year_folder in os.listdir(folder_path):
    year_path = os.path.join(folder_path, year_folder)
    if not os.path.isdir(year_path):
        continue
    for month_folder in os.listdir(year_path):
        month_path = os.path.join(year_path, month_folder)
        if not os.path.isdir(month_path):
            continue
        for file_name in os.listdir(month_path):
            if not file_name.endswith(".parquet"):
                continue
            file_path = os.path.join(month_path, file_name)

            # Load one day, add a positional id and clean the headlines.
            df = pd.read_parquet(file_path)
            df = df.reset_index().rename(columns={'index': 'id'})
            df.loc[:, "headline"] = df.headline.apply(lambda x: clean_str(x))

            # Keep only 'headline' and 'id' for the embedding pass.
            ds = Dataset.from_dict(df)
            cols_to_remove = ds.column_names
            cols_to_remove.remove("headline")
            cols_to_remove.remove("id")
            ds_e = ds.remove_columns(cols_to_remove)

            # Longest first for batching, then restore the order by id.
            ds_e = ds_e.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)
            ds_e = ds_e.map(lambda x: sentence_embeddings(x, model,text='headline'), batched=True, batch_size=batch_size).sort('id')

            # Store the embeddings next to the data; the source file is overwritten.
            headline_input_ids = ds_e['embeddings']
            df['embedding'] = headline_input_ids
            df.to_parquet(file_path)


## Matching

In [6]:
def get_bloomberg_news(lang, date):
    """Load Bloomberg news for one language and add a positional ``id`` column.

    Rows whose ``headline`` or ``tickers`` is null are dropped, the index is
    reset, and the new 0..n-1 index is exposed as the column ``id`` so the
    data can be re-sorted later.

    Parameters:
        lang (str): Either 'English' or 'Chinese'.
        date (str): Date of the articles in the format 'YYYY/MM/DD'.

    Returns:
        The filtered news DataFrame for the requested language.

    Raises:
        ValueError: If ``lang`` is neither 'English' nor 'Chinese'.
    """
    # Pick the loader first; the post-processing is the same for both languages.
    if lang == 'English':
        load_news = get_bloomberg_english_news
    elif lang == 'Chinese':
        load_news = get_bloomberg_chinese_news
    else:
        raise ValueError("Invalid language. Must be either 'English' or 'Chinese'.")

    news = load_news(date).dropna(subset=['headline', 'tickers'])
    news = news.reset_index(drop=True).reset_index()
    return news.rename(columns={'index': 'id'})

In [107]:
# Dump each English headline followed by its tickers, one value per line.
# NOTE(review): `df_e` is only assigned in cells further down the notebook, so this
# cell depends on out-of-order execution; the same output file is also rewritten
# later by a cell that writes headlines only.
with open('/mnt/research-live/user/yzhong/english_titles_0417.txt', 'w') as f:
    for row in df_e.itertuples():
        f.write(str(row.headline) + '\n')
        f.write(str(row.tickers) + '\n')

In [7]:
def preprocess_chinese_headline(string):
    """Run the Chinese headline clean-up pipeline and return the result.

    The steps are applied in this order: append '。', remove special symbols
    and parenthesised text, remove ASCII spaces, convert full-width characters
    to half-width.
    """
    # NOTE(review): full-width conversion runs last, so full-width parentheses,
    # symbols and spaces are not seen by the earlier removal steps — confirm
    # that this ordering is intended.
    pipeline = (add_chinese_period, remove_special_chars, remove_space, full_to_half)
    for step in pipeline:
        string = step(string)
    return string

In [8]:
def laser_cosine(chinese_sentences,english_sentences,laser):
    """Cosine similarity between LASER embeddings of Chinese and English text.

    Args:
        chinese_sentences: Chinese input passed to ``laser.embed_sentences`` with lang='zh'.
        english_sentences: English input passed to ``laser.embed_sentences`` with lang='en'.
        laser: Object exposing ``embed_sentences(sentences, lang=...)``.

    Returns:
        The matrix returned by ``cosinus_similarity`` (Chinese rows x English columns).
    """
    zh_vectors = laser.embed_sentences(chinese_sentences, lang='zh')
    en_vectors = laser.embed_sentences(english_sentences, lang='en')
    return cosinus_similarity(zh_vectors, en_vectors)

In [9]:
# Compatibility shims: (re)define the aliases np.object, np.bool, np.float, np.int
# and np.typeDict on the numpy module by pointing them at the builtin types /
# np.sctypeDict.
# NOTE(review): presumably needed because one of the imported libraries still refers
# to aliases that the installed NumPy no longer provides — confirm which dependency
# requires this.
np.object = object    
np.bool = bool  
np.float = float  
np.int = int    
np.typeDict =np.sctypeDict

In [10]:
def prepare_translation_dataset(df,model, lg='chinese'):
    """Build the dataset for the translation task.

    Thin wrapper that delegates to ``model.prepare_dataset`` with
    ``task_type='translation'`` and returns its result unchanged.
    """
    return model.prepare_dataset(df, task_type='translation', lg=lg)
def prepare_embedding_dataset(df,model, lg):
    """Build the dataset for the sentence-embedding task.

    Thin wrapper that delegates to ``model.prepare_dataset`` with
    ``task_type='sentence_embedding'`` and returns its result unchanged.
    """
    return model.prepare_dataset(df, task_type='sentence_embedding', lg=lg)
def translation(df_c, model,device,batch_size):
    """Translate the 'headline' column of ``df_c`` and return the translations.

    The input is first turned into a translation dataset, then translated in
    batches of ``batch_size`` with ``model.translate``. The result is sorted by
    'id' before its 'translation' column is returned.
    """
    def _translate_batch(batch):
        return model.translate(batch, device, text='headline')

    prepared = prepare_translation_dataset(df_c, model)
    translated = prepared.map(_translate_batch, batched=True, batch_size=batch_size)
    return translated.sort('id')['translation']
def sentence_embeddings(ds,model, batch_size,lg):
    """Embed one text column of ``ds`` and return the 'embeddings' column.

    English datasets are embedded from 'headline'; any other language is
    embedded from 'translation'. The mapped dataset is sorted by 'id' before
    the embeddings are read.
    """
    text_column = 'headline' if lg == 'english' else 'translation'

    def _embed_batch(batch):
        return model.sentence_embeddings(batch, text=text_column)

    embedded = ds.map(_embed_batch, batched=True, batch_size=batch_size).sort('id')
    return embedded['embeddings']

def translation_embedding(df_c, df_e,model,device, batch_size_translate, batch_size_embeddings):
    """Translate the Chinese headlines and embed both news sets.

    Adds a 'translation' and an 'embedding' column to ``df_c`` and an
    'embedding' column to ``df_e``; both frames are modified in place and also
    returned.

    Args:
        df_c: Chinese news frame with 'id' and 'headline' columns.
        df_e: English news frame with 'id' and 'headline' columns.
        model: Pipeline object providing prepare_dataset / translate / sentence_embeddings.
        device: Device forwarded to ``model.translate``.
        batch_size_translate: Batch size for translation.
        batch_size_embeddings: Batch size for sentence embedding.

    Returns:
        The tuple ``(df_c, df_e)``.
    """
    # translation() prepares the dataset itself. The previous code prepared it
    # here as well and passed the result in, so every headline was preprocessed
    # twice (e.g. two '。' appended).
    df_c['translation'] = translation(df_c, model, device, batch_size_translate)

    ds_sc = prepare_embedding_dataset(df_c, model, lg='chinese')
    ds_se = prepare_embedding_dataset(df_e, model, lg='english')

    df_c['embedding'] = sentence_embeddings(ds_sc, model, batch_size_embeddings, lg='chinese')
    df_e['embedding'] = sentence_embeddings(ds_se, model, batch_size_embeddings, lg='english')
    return df_c, df_e


def translation_cosine(chinese_emb, english_emb):
    """Cosine similarity between one Chinese embedding and several English ones.

    ``chinese_emb`` is given a leading dimension with ``unsqueeze(0)``; both
    sides are L2-normalised with ``F.normalize`` and multiplied.

    Returns:
        A NumPy array with a single row holding one similarity per English embedding.
    """
    query = F.normalize(torch.tensor(chinese_emb).unsqueeze(0))
    candidates = F.normalize(torch.tensor(english_emb))
    return torch.matmul(query, candidates.t()).numpy()

In [11]:
class TranslationSentenceEmbeddingPipeline:
    """
    A class for performing translation and sentence embedding tasks using pretrained models.

    It bundles a seq2seq translation model with its tokenizer and a
    SentenceTransformer model, and offers helpers to prepare Hugging Face
    datasets and to process them batch by batch through ``Dataset.map``.
    """
    CACHE_DIR = config_['model_dir']  # constant for cache directory
    MAX_LENGTH = 512  # constant for maximum sequence length (tokenizer truncation)

    def __init__(
        self,
        translation_model_name: str,
        sentence_embedding_model_name: str,
        device
    ) -> None:
        """
        Initialize the TranslationSentenceEmbeddingPipeline class.

        Args:
            translation_model_name: The name of the pretrained translation model to use.
            sentence_embedding_model_name: The name of the pretrained sentence embedding model to use.
            device: The device to use for computations (e.g. 'cpu' or 'cuda').
        """
        self.translation_model = AutoModelForSeq2SeqLM.from_pretrained(
            translation_model_name,
            cache_dir=self.CACHE_DIR
        ).to(device)
        self.translation_tokenizer = AutoTokenizer.from_pretrained(
            translation_model_name,
            use_fast=True,
            cache_dir=self.CACHE_DIR
        )
        self.sentence_embedding_model = SentenceTransformer(
            sentence_embedding_model_name,
            device=device,
            cache_folder=self.CACHE_DIR
        )

    def compute_length(
        self,
        batch,
        text: str = 'headline'
    ) -> dict:
        """
        Compute the length of each sentence in a batch.

        Args:
            batch: Mapping from column name to a list of values.
            text: Name of the column whose items are measured.

        Returns:
            ``{'length': [...]}`` with ``len(item)`` for every item of ``batch[text]``.
        """
        return {'length': [len(item) for item in batch[text]]}

    def prepare_dataset(
        self,
        df,
        task_type: str,
        lg: str = 'chinese'
    ) -> Dataset:
        """
        Prepare the input data for a translation or sentence embedding task.

        The selected text column is cleaned, a 'length' column is added, and
        the dataset is sorted by descending length so batches contain items of
        similar size.

        Args:
            df: A pandas DataFrame containing the input data ('id' plus the text column).
            task_type: The type of task to perform ('translation' or 'sentence_embedding').
            lg: The language of the input text (default is 'chinese'). Only used
                for 'sentence_embedding': 'english' embeds 'headline', anything
                else embeds 'translation'.
        Returns:
            A Hugging Face Dataset object containing the preprocessed input data.
        Raises:
            ValueError: If ``task_type`` is not one of the supported values.
        """
        if task_type == 'translation':
            text_col = 'headline'
            cleaner = preprocess_chinese_headline
        elif task_type == 'sentence_embedding':
            text_col = 'headline' if lg == 'english' else 'translation'
            cleaner = clean_str_english
        else:
            # Previously an unknown task fell through and failed later with an
            # UnboundLocalError on `ds`.
            raise ValueError(
                "Invalid task_type. Must be either 'translation' or 'sentence_embedding'."
            )

        ds = Dataset.from_dict({
            "id": df["id"],
            text_col: df[text_col]
        })
        ds = ds.map(lambda x: {
            text_col: cleaner(x[text_col]),
            'id': x['id']
        })
        # Use this instance's compute_length; the old embedding branch called it
        # on a global named `model`, which only worked by accident of notebook state.
        ds = ds.map(
            lambda x: self.compute_length(x, text=text_col), batched=True
        ).sort('length', reverse=True)
        return ds

    def translate(self, batch, device, text='headline'):
        """
        Translate one batch of texts.

        Args:
            batch: Mapping from column name to a list of strings.
            device: Device the tokenized inputs are moved to before generation.
            text: Name of the column to translate.

        Returns:
            ``{'translation': [...]}`` with the decoded output for every input.
        """
        tokenized_batch = self.translation_tokenizer(
            batch[text],
            padding="longest",
            truncation=True,
            max_length=self.MAX_LENGTH,
            return_tensors="pt"
        )
        # Forward only the tensors the model actually accepts.
        inputs = {
            k: v.to(device) for k, v in tokenized_batch.items()
            if k in self.translation_tokenizer.model_input_names
        }
        with torch.no_grad():
            translation = self.translation_model.generate(**inputs)
        return {
            'translation': self.translation_tokenizer.batch_decode(
                translation, skip_special_tokens=True
            )
        }

    def sentence_embeddings(self, batch, text='headline'):
        """
        Embed one batch of texts with the sentence embedding model.

        Args:
            batch: Mapping from column name to a list of strings.
            text: Name of the column to embed.

        Returns:
            ``{'embeddings': ...}`` holding the output of ``encode`` for the batch.
        """
        with torch.no_grad():
            embeddings = self.sentence_embedding_model.encode(batch[text])
        return {'embeddings': embeddings}

In [18]:
# Load the Chinese and English news of 2023-04-17 used as the test day below.
df_c=pd.read_parquet('/mnt/research-live/user/yzhong/Chinese_news_data_bloomberg/2023/04/bloomberg_news_Chinese_2023_04_17.parquet')
df_e=pd.read_parquet('/mnt/research-live/user/yzhong/English_news_data_bloomberg/2023/04/bloomberg_news_English_2023_04_17.parquet')

In [12]:
def find_matches_tickers(tickers,last_update,df_e):
    """Find English articles sharing a ticker and published within two hours.

    Args:
        tickers: Iterable of ticker strings to look for.
        last_update: Timestamp of the reference (Chinese) article.
        df_e: English news frame with a comma-separated 'tickers' column and a
            datetime 'last_update' column.

    Returns:
        The matching rows of ``df_e`` with a fresh 0..n-1 index and an extra
        'delay' column: the absolute time difference to ``last_update`` in minutes.
    """
    # Absolute publication delay in minutes, computed once and reused below.
    delay_minutes = (last_update - df_e['last_update']).dt.total_seconds().abs() / 60

    # Tickers are matched as whole comma-separated entries and are escaped, so
    # that "GS" no longer matches "GSK@LN" and regex metacharacters inside a
    # ticker are taken literally.
    wanted = [re.escape(t.strip()) for t in tickers if t and t.strip()]
    if wanted:
        pattern = r'(?:^|,)\s*(?:' + '|'.join(wanted) + r')\s*(?:,|$)'
        # na=False: rows without tickers can never match (and must not break the mask).
        has_ticker = df_e['tickers'].str.contains(pattern, regex=True, na=False)
    else:
        # An empty ticker list would otherwise build an empty pattern that matches every row.
        has_ticker = pd.Series(False, index=df_e.index)

    selected = has_ticker & (delay_minutes < 120)
    matches = df_e[selected].reset_index(drop=True)
    matches['delay'] = delay_minutes[selected].to_numpy()
    return matches

def assign_match_headline_cos(df_c,row,matrix,matches):
    """Record the best cosine match for one Chinese row, in place.

    The candidate with the highest score in the first row of ``matrix`` is
    looked up in ``matches`` and its headline, score, date and suid are written
    to the ``match_cs_*`` columns of ``df_c`` at ``row.Index``.
    """
    # Column index of the top score in the first (only) row.
    best = np.argsort(-matrix, axis=1)[0][0]
    writes = (
        ('match_cs_headline', lambda: matches.at[best, 'headline']),
        ('match_cs_cos', lambda: matrix[0][best]),
        ('match_cs_date', lambda: matches.at[best, 'last_update']),
        ('match_cs_suid', lambda: matches.at[best, 'suid']),
    )
    for column, read_value in writes:
        df_c.at[row.Index, column] = read_value()
    
def assign_match_headline_faiss(df_c,row,index,score,matches):
    """Record an already selected match for one Chinese row, in place.

    ``index`` is the label of the chosen row in ``matches`` and ``score`` its
    similarity; headline, score, date and suid are written to the
    ``match_faiss_*`` columns of ``df_c`` at ``row.Index``.
    """
    target = row.Index
    writes = (
        ('match_faiss_headline', lambda: matches.at[index, 'headline']),
        ('match_faiss_cos', lambda: score),
        ('match_faiss_date', lambda: matches.at[index, 'last_update']),
        ('match_faiss_suid', lambda: matches.at[index, 'suid']),
    )
    for column, read_value in writes:
        df_c.at[target, column] = read_value()
    




def assign_match_delay(matches, df_c, row):
    """Record the candidate with the smallest delay for one Chinese row, in place.

    The row of ``matches`` with the minimum 'delay' provides the headline, date
    and suid written to the ``match_delay_*`` columns of ``df_c`` at ``row.Index``.
    """
    closest = matches['delay'].idxmin()
    for target_column, source_column in (
        ('match_delay_headline', 'headline'),
        ('match_delay_date', 'last_update'),
        ('match_delay_suid', 'suid'),
    ):
        df_c.at[row.Index, target_column] = matches.at[closest, source_column]

In [40]:
def match_headline_test(method):
    """Match the Chinese headlines of 2023-04-17 to English ones (test run).

    Both news sets are read from fixed parquet files. For every Chinese row,
    English candidates sharing a ticker within the time window are collected
    and the one with the highest cosine similarity is stored in the
    ``match_cs_*`` columns; 'match' flags rows that had at least one candidate.

    Args:
        method: 'Laser' (LASER embeddings of the headlines) or 'Translation'
            (translate, then sentence-embed).

    Returns:
        The Chinese DataFrame with the match columns filled in.

    Raises:
        ValueError: If ``method`` is neither 'Laser' nor 'Translation'.
    """
    df_c = pd.read_parquet('/mnt/research-live/user/yzhong/Chinese_news_data_bloomberg/2023/04/bloomberg_news_Chinese_2023_04_17.parquet')
    df_e = pd.read_parquet('/mnt/research-live/user/yzhong/English_news_data_bloomberg/2023/04/bloomberg_news_English_2023_04_17.parquet')
    df_c['match'] = False

    if method == 'Laser':
        laser = Laser()
    elif method == 'Translation':
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        model = TranslationSentenceEmbeddingPipeline(config_['TRANSLATION_MODEL'], config_['SENTENCE_TRANSFORMER'],device)
        df_c, df_e = translation_embedding(df_c, df_e,model,device, batch_size_translate=32, batch_size_embeddings=256)
    else:
        raise ValueError("Invalid method. Must be either 'Laser' or 'Translation'.")

    use_laser = method == 'Laser'
    for row in df_c.itertuples():
        # Unique tickers of this article.
        unique_tickers = list(set(row.tickers.split(', ')))
        matches = find_matches_tickers(unique_tickers, row.last_update, df_e)
        if len(matches) == 0:
            continue
        df_c.at[row.Index, 'match'] = True
        if use_laser:
            c = laser_cosine(row.headline, matches.headline.to_list(), laser)
        else:
            c = translation_cosine(row.embedding, matches.embedding.to_list())
        assign_match_headline_cos(df_c, row, c, matches)

    return df_c

In [20]:
def laser_faiss(chinese_sentence,english_sentences,laser):
    """Find the English sentence closest to a Chinese one using LASER and faiss.

    The English sentences are embedded, scaled to unit length and added to a
    ``faiss.IndexFlatIP`` index; the Chinese sentence is embedded and scaled the
    same way and searched with ``k=1``.

    Returns:
        ``(position, score)``: the position of the best English sentence in
        ``english_sentences`` and the score reported by the index.
    """
    # Unit-length English vectors.
    candidate_vectors = laser.embed_sentences(english_sentences, lang='en')
    candidate_vectors = candidate_vectors / np.linalg.norm(candidate_vectors, axis=1)[:, None]

    # Build an index using faiss
    index = faiss.IndexFlatIP(candidate_vectors.shape[1])
    index.add(candidate_vectors)

    # Unit-length query vector for the single Chinese sentence.
    query_vector = laser.embed_sentences([chinese_sentence], lang=['zh'])[0]
    query_vector = query_vector / np.linalg.norm(query_vector)

    scores, positions = index.search(query_vector.reshape(1, -1), k=1)
    return positions[0][0], scores[0][0]

In [37]:
# Reload the 2023-04-17 Chinese and English news so the matching below starts
# from the files on disk.
df_c=pd.read_parquet('/mnt/research-live/user/yzhong/Chinese_news_data_bloomberg/2023/04/bloomberg_news_Chinese_2023_04_17.parquet')
df_e=pd.read_parquet('/mnt/research-live/user/yzhong/English_news_data_bloomberg/2023/04/bloomberg_news_English_2023_04_17.parquet')

In [38]:
def match_headline_test_avectickers(df_c,df_e,method):
    """Match Chinese headlines to English ones, restricted to shared tickers.

    For every Chinese row, English candidates sharing a ticker within the time
    window are collected and the most similar one is stored in the
    ``match_faiss_*`` columns of ``df_c``; 'match' is set to True for rows that
    had at least one candidate.

    Args:
        df_c: Chinese news frame ('tickers', 'last_update', 'headline', ...).
        df_e: English news frame with the same columns plus 'suid'.
        method: 'Laser' (LASER + faiss search on cleaned headlines) or
            'Translation' (translate, sentence-embed, cosine similarity).

    Returns:
        The Chinese DataFrame with the match columns filled in.

    Raises:
        ValueError: If ``method`` is neither 'Laser' nor 'Translation'.
    """
    if method == 'Laser':
        laser = Laser()
    elif method == 'Translation':
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        model = TranslationSentenceEmbeddingPipeline(config_['TRANSLATION_MODEL_ADVANCE'], config_['SENTENCE_TRANSFORMER'],device)
        df_c, df_e = translation_embedding(df_c, df_e,model,device, batch_size_translate=32, batch_size_embeddings=256)
    else:
        raise ValueError("Invalid method. Must be either 'Laser' or 'Translation'.")

    for row in df_c.itertuples():
        tickers = list(set(row.tickers.split(', ')))  # unique tickers of this article
        matches = find_matches_tickers(tickers, row.last_update, df_e)
        if len(matches) == 0:
            continue
        df_c.at[row.Index, 'match'] = True
        if method == 'Laser':
            chinese_sentence = preprocess_chinese_headline(row.headline)
            english_sentences = matches.headline.apply(lambda x: clean_str_english(x)).to_list()
            index, score = laser_faiss(chinese_sentence, english_sentences, laser)
        else:
            # Derive the best candidate and its score from the similarity row.
            # The previous Translation branch computed the similarities but never
            # set `index`/`score`, so the assignment below raised a NameError.
            c = translation_cosine(row.embedding, matches.embedding.to_list())
            index = int(np.argmax(c[0]))
            score = c[0][index]
        assign_match_headline_faiss(df_c, row, index, score, matches)

    return df_c

In [39]:
# Run the ticker-restricted matching with the LASER method; `df_c` is rebound to the result.
df_c=match_headline_test_avectickers(df_c,df_e,'Laser')

In [41]:
# Write the matched English headline of every Chinese row, one per line.
with open('/mnt/research-live/user/yzhong/match_similarity_laser_0417.txt', 'w') as f:
    for row in df_c.itertuples():
        f.write(str(row.match_faiss_headline) + '\n')

In [57]:
# Write the translation of every row of `df_t`, one per line.
# NOTE(review): `df_t` is only assigned in the cell that follows, so this cell
# depends on out-of-order execution.
with open('/mnt/research-live/user/yzhong/match_similarity_translation_0417.txt', 'w') as f:
    for row in df_t.itertuples():
        f.write(str(row.translation) + '\n')

In [None]:
# match_headline_test_avectickers takes (df_c, df_e, method); the previous call
# passed only the method name and raised a TypeError.
# NOTE(review): `df_c` may already hold the LASER match columns at this point —
# reload it first if a clean frame is wanted.
df_t = match_headline_test_avectickers(df_c, df_e, 'Translation')

In [None]:
# Write the `match_cs_headline` value of every Chinese row, one per line.
# NOTE(review): this targets the same file as the earlier `match_faiss_headline` dump and
# overwrites it; `match_cs_headline` is only produced by the functions that call
# assign_match_headline_cos — confirm that `df_c` comes from one of those.
with open('/mnt/research-live/user/yzhong/match_similarity_laser_0417.txt', 'w') as f:
    for row in df_c.itertuples():
        f.write(str(row.match_cs_headline) + '\n')

In [44]:
def match_max_cos(c, df_c, df_e):
    """Attach to each Chinese row its most similar English article.

    Args:
        c: Similarity matrix with one row per Chinese article and one column
            per English article.
        df_c: Chinese news frame with an 'id' column. Row ``i`` of ``c`` is
            joined to the row with ``id == i``.
        df_e: English news frame with 'id', 'headline', 'tickers' and
            'last_update' columns. Column ``j`` of ``c`` is resolved to the row
            with ``id == j``.

    Returns:
        ``df_c`` merged on 'id' with the columns 'Sim1' (headline), 'tickers1',
        'date1' and 'cos1' (similarity) of the best English match.

    Raises:
        KeyError: If a best-match column index has no row with that id in ``df_e``.
    """
    c = np.asarray(c)
    # Column index of the maximum value for each row.
    max_indices = np.argmax(c, axis=1)
    row_ids = np.arange(len(max_indices))

    # Resolve all best-match ids in one lookup; the first row wins for
    # duplicated ids, as in the previous row-by-row search.
    english_by_id = df_e.drop_duplicates(subset='id', keep='first').set_index('id')
    best = english_by_id.loc[max_indices]

    # Build the result frame in one go: DataFrame.append was removed in
    # pandas 2.0 and growing a frame row by row is quadratic.
    sim_df = pd.DataFrame({
        'id': row_ids,
        'Sim1': best['headline'].to_numpy(),
        'tickers1': best['tickers'].to_numpy(),
        'date1': best['last_update'].to_numpy(),
        'cos1': c[row_ids, max_indices],
    })
    merged_df = pd.merge(df_c, sim_df, on='id')

    return merged_df


In [45]:
def match_headline_test_sanstickers(df_c,df_e,method):
    """Match Chinese headlines to English ones without any ticker filter.

    Every Chinese headline is compared with every English headline and the
    most similar English article is attached through ``match_max_cos``.

    Args:
        df_c: Chinese news frame with 'id' and 'headline' columns.
        df_e: English news frame with 'id', 'headline', 'tickers' and 'last_update'.
        method: 'Laser' (LASER embeddings of the cleaned headlines) or
            'Translation' (translate, sentence-embed, cosine similarity).

    Returns:
        The Chinese frame merged with the best-match columns.

    Raises:
        ValueError: If ``method`` is neither 'Laser' nor 'Translation'.
    """
    # Validate up front, like the other matching functions; an unknown method
    # used to return the input frame silently unchanged.
    if method not in ('Laser', 'Translation'):
        raise ValueError("Invalid method. Must be either 'Laser' or 'Translation'.")

    if method == 'Laser':
        laser = Laser()
        chinese_sentences = df_c.headline.apply(lambda x: preprocess_chinese_headline(x)).to_list()
        english_sentences = df_e.headline.apply(lambda x: clean_str_english(x)).to_list()
        c = laser_cosine(chinese_sentences, english_sentences, laser)
    else:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        model = TranslationSentenceEmbeddingPipeline(config_['TRANSLATION_MODEL_ADVANCE'], config_['SENTENCE_TRANSFORMER'],device)
        df_c, df_e = translation_embedding(df_c, df_e,model,device, batch_size_translate=32, batch_size_embeddings=256)
        c = cosinus_similarity(df_c['embedding'].to_list(), df_e['embedding'].to_list())

    return match_max_cos(c, df_c, df_e)

In [46]:
# Run the all-against-all matching with the LASER method; `df_c` is rebound to the result.
df_c=match_headline_test_sanstickers(df_c,df_e,'Laser')

In [101]:
# Write every English headline, one per line.
# NOTE(review): same output file as the earlier cell that also writes the tickers;
# whichever cell runs last determines the file's content.
with open('/mnt/research-live/user/yzhong/english_titles_0417.txt', 'w') as f:
    for row in df_e.itertuples():
        f.write(str(row.headline) + '\n')

In [125]:
# Show the body text of the article(s) whose headline equals the given string.
df_c[df_c['headline']=='汇市/利率简报:美元连续第二日走高 升息预期提振美债收益率'].body.to_list()

['\n记者 Anya Andrianova\n     【彭博】-- 美元连续第二天上涨,美联储进一步收紧政策的可能性\n提升了美国国债收益率。所有G-10货币均兑美元下跌。\n     重点关注:\n\n  * 中国GDP\n  * 印尼央行利率决定\n\n     前日市场(截至纽约时间下午5点左右):\n\n  * 彭博美元即期指数上涨0.4%\n    * 对冲基金押注美元近三年来最长的周度跌势将\n      逆转\n  * 亚太区G-10货币\n    * 美元/日元涨0.5%至134.43\n    * 澳元/美元基本持平,\n      报0.6705\n    * 新西兰元/美元下跌0.3%至0.6186\n  * 10年期美国国债收益率上升8.2\n    个基点至3.59%。5年和30年期国债收益率差收窄1.4个基点至11.2个基点\n\n     前日人民币市场概况:\n\n  * 离岸人民币兑美元收盘下跌0.14%,报6.8827元\n  * 在岸人民币兑美元北京时\n    间16:30官方收盘价报6.8717元,夜盘最后成交于6.8817元,下跌0.18%\n  * 人\n    民币兑美元中间价报6.8679元,市场预测均值为6.8686元\n  * 近5日中间价较\n    预测均值平均偏强约7点,近20日中间价较预测均值平均偏强约7点\n\n     当日图表\n     要闻:\n\n  * 中国一季度经济数据即将发布 不均衡迹象或继续存在 \n  * 里士满联储行长:\n    希望看到更多显示通胀回落的证据\n  * 纽约联储调查:美国劳动力最低薪资要\n    求达到创纪录的76,000美元\n  * 第一信托银行难觅买主 问题出在那群富人客\n    户身上\n  * 美国3个月期国库券中标收益率超5% 创克林顿时期以来最高值 \n  * 嘉\n    信理财客户存款同比大降 高管自信能挺过暴风雨 \n  * 台湾将从美国采购400\n    枚岸基版“鱼叉”反舰导弹\n\n     重要观点:\n\n  * 美联储降息前景扑朔 高盛和道明各执一词\n  * IMF:中国将成为未来五年全球\n    最大经济增长源 贡献率是美国两倍\n  * 华尔街知名空头Wilson:债券收益率\n    上升或致标普500指数创新

In [100]:
# Display the English news frame for a visual check.
df_e

Unnamed: 0,id,suid,last_update,language,analyst,date,tickers,eqt_codes,topics,headline,body
0,0,ROL7ICDWLU68,2023-04-17 14:12:52.026,ENGLISH,False,2023-04-17,"TSLA, TSLA","TSLA_UW, TSLA_UW","BIZNEWS, MSCIWORLD, NORTHAM, EQTY, DOT, CONSD,...",Tesla Reports Another Fatal US Crash Involving...,\nBy Keith Laing\n (Bloomberg) -- Tesla In...
1,1,RS0U67DWRGG0,2023-04-17 22:00:00.201,ENGLISH,False,2023-04-17,"2330@TT, 2330@TT, 2317@TT, 2317@TT, 2330@TT, 2...","TSM_UN, 2330_TT, 2317_TT, HHPD_LI, TSM_UN, 233...","UNREST, METALKEY, BIZNEWS, CNG, MSCIWORLD, NOR...",Why Taiwan’s Vote Matters From Beijing to Wash...,\nBy Cindy Wang and Samson Ellis\n (Bloomb...
2,2,RT09ZADWRGG0,2023-04-17 12:45:01.518,ENGLISH,False,2023-04-17,"GS, UBSG@SW, UBSG@SW, UBSG@SW, UBSG@SW, BX, GS...","GS_UN, UBSN_SW, UBS_UN, UBS_US, UBSG_VX, BX_US...","CORPFIN, BIZNEWS, BFWLOAEU, EQTY, FIN, BUSINES...",CVC Targets Wealthy Individuals in European Pr...,\nBy Silas Brown\n (Bloomberg) -- CVC Capi...
3,3,RT0TAFDWX2PS,2023-04-17 06:20:01.610,ENGLISH,False,2023-04-17,ASHM@LN,ASHM_LN,"ERNMAIN, BIZNEWS, MSCIWORLD, EQTY, FIN, ERN, A...",Ashmore Clients Pull $1.1 Billion in Seventh Q...,\nBy Nishant Kumar\n (Bloomberg) -- Ashmor...
4,4,RT40NFGFR4SG,2023-04-17 06:30:06.200,ENGLISH,False,2023-04-17,"UCG@IM, UCG@IM, UCG@IM, UCG@IM, UCG@IM, UCG@IM...","UC_IM, UCR_IM, UC_R_IM, UCGAA_IM, UCGAZ_IM, CR...","BELG, METALKEY, GRE, BIZNEWS, MSCIWORLD, ECOST...",Eurozone 3M Yield at 3.50% by End-2Q23 vs 3.60...,\nBy Harumi Ichikura\n (Bloomberg) -- The ...
...,...,...,...,...,...,...,...,...,...,...,...
2363,2363,RTA8T1T0AFB4,2023-04-17 23:22:07.866,ENGLISH,False,2023-04-17,"ANZ@AU, SGP@AU, SGP@AU, SGP@AU, NAB@AU, NAB@AU...","ANZ_AU, SGP_AU, SGPNA_AU, SGPNF_AU, NAB_AU, NA...","CORPFIN, BIZNEWS, MSCIWORLD, FIN, FIRPT, SYNBO...",LAUNCH: Stockland Trust A$ 7Y Bmark at Swaps +...,\nBy Finbarr Flynn\n (Bloomberg) -- STOCKL...
2364,2364,RTA956GFLIIO,2023-04-17 23:27:06.243,ENGLISH,True,2023-04-17,"INCY, INCY","INCY_UQ, INCY_UQ","ANACHANGE, BIZNEWS, ANA, MSCIWORLD, NORTHAM, E...",Incyte Rated New Hold at Baptista Research; PT...,\nBy Bloomberg Automation\n (Bloomberg) --...
2365,2365,RTA9A6GFA9Z4,2023-04-17 23:30:06.133,ENGLISH,True,2023-04-17,"WBA, WBA","WAG_UN, WAG_UN","ANACHANGE, BIZNEWS, ANA, MSCIWORLD, NORTHAM, E...",Walgreens Boots Rated New Hold at Baptista Res...,\nBy Bloomberg Automation\n (Bloomberg) --...
2366,2366,RTAA44GFLIIO,2023-04-17 23:48:04.082,ENGLISH,True,2023-04-17,"ATER, ATER","MWK_US, MWK_US","ANACHANGE, BIZNEWS, ANA, EQTY, BUSINESS, ANAMO...",Aterian Cut to Neutral at BTIG,\nBy Bloomberg Automation\n (Bloomberg) --...


In [14]:
def find_star (df):
    """Flag rows whose headline starts with '*' directly followed by a character
    in the range U+4E00-U+9FFF, a digit or an ASCII letter.

    Parameters:
    df (DataFrame): frame with a 'headline' column.

    Returns:
    Series of bool aligned with df; rows with a missing headline are False.
    """
    # na=False: without it str.contains propagates NaN for missing headlines,
    # and the result can no longer be negated (~mask) or used as a boolean mask.
    return df['headline'].str.contains(r'^\*[\u4e00-\u9fff0-9a-zA-Z].*', na=False)
def get_TF(x, info_list):
    """Return True if at least one string of info_list occurs in str(x).

    x is stringified first, so a missing value is searched as the text 'None'/'nan'.
    """
    text = str(x)
    return any(i in text for i in info_list)

def find_words(df, col_name='body', my_words=None):
    """Flag the rows of df whose `col_name` value contains any of `my_words`.

    Parameters:
    df (DataFrame): frame holding the column to scan.
    col_name (str): name of the column to scan (default 'body').
    my_words (list of str or None): substrings to look for; when None the
        default marker list below is used.

    Returns:
    Series of bool aligned with df.
    """
    if my_words is None:
        my_words = ['原文标题','彭博自动新闻','原文標題']
    # The return must sit outside the `if`: it used to be nested inside it, so a
    # caller supplying its own word list silently got None back.
    return df[col_name].apply(lambda x: get_TF(x, my_words))
#df=df_sim.copy(deep=True)
# mask1=find_star(df_c)
# mask2=find_words(df_c)
# df_c['contains_star']=mask1
# df_c['contains_words']=mask2
# df_c['Only_Ch']= (~mask1) & (~mask2)

In [29]:
def match_headline(date,method):
    """Match each Chinese headline of `date` against the English headlines of the same date.

    Parameters:
    date: date passed as-is to get_bloomberg_news.
    method (str): 'Laser' or 'Translation'; selects how cosine similarities are computed.

    Returns:
    The Chinese DataFrame with a boolean 'match' column plus whatever columns
    assign_match_delay / assign_match_headline_cos write.

    Raises:
    ValueError: if method is neither 'Laser' nor 'Translation'.
    """
    df_c = get_bloomberg_news('Chinese', date)
    df_e = get_bloomberg_news('English', date)
    df_c['match'] = False

    if method == 'Laser':
        laser = Laser()
    elif method == 'Translation':
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        model = TranslationSentenceEmbeddingPipeline(config_['TRANSLATION_MODEL'], config_['SENTENCE_TRANSFORMER'], device)
        df_c, df_e = translation_embedding(df_c, df_e, model, device, batch_size_translate=32, batch_size_embeddings=256)
    else:
        raise ValueError("Invalid method. Must be either 'Laser' or 'Translation'.")

    for row in df_c.itertuples():
        # De-duplicate the comma-separated ticker string of the current row.
        unique_tickers = list(set(row.tickers.split(', ')))
        matches = find_matches_tickers(unique_tickers, row.last_update, df_e)
        if len(matches) == 0:
            continue

        df_c.at[row.Index, 'match'] = True
        assign_match_delay(matches, df_c, row)

        # Cosine ranking is only computed when there are several candidates.
        if len(matches) <= 1:
            continue
        if method == 'Laser':
            c = laser_cosine(row.headline, matches.headline.to_list(), laser)
        else:
            # method was validated above, so this is the 'Translation' case.
            c = translation_cosine(row.embedding, matches.embedding.to_list())
        assign_match_headline_cos(df_c, row, c, matches)

    return df_c

In [45]:
# Dump the match_delay_headline value of every row of df_t, one per line.
with open('/mnt/research-live/user/yzhong/headline_0417.txt', 'w') as out:
    for row in df_t.itertuples():
        print(row.match_delay_headline, file=out)

In [None]:
def Save_bloomberg_news(lang, date):
    '''
    Save Bloomberg news data to a file in parquet format, dropping rows with a null headline or null tickers.

    Parameters:
    lang (str): The language of the news articles. Either 'English' or 'Chinese'.
    date (str): The date of the news articles in the format 'YYYY/MM/DD'.

    Returns:
    None

    Raises:
    ValueError: if lang is neither 'English' nor 'Chinese'.
    '''
    # Pick the loader first so that an unsupported language fails before any
    # folder is created on disk.
    if lang == 'English':
        load_news = get_bloomberg_english_news
    elif lang == 'Chinese':
        load_news = get_bloomberg_chinese_news
    else:
        raise ValueError("Invalid language. Must be either 'English' or 'Chinese'.")

    # Extract year, month, and day from the date string
    year, month, day = date.split('/')

    # Construct the folder path where the data will be saved and create it if needed
    # (exist_ok avoids the check-then-create race of a separate os.path.exists test).
    folder_path = f'{DATA_PATH}{lang}_news_data_bloomberg/{year}/{month}'
    os.makedirs(folder_path, exist_ok=True)

    # Construct the file name and file path where the data will be saved
    file_name = f'bloomberg_news_{lang}_{year}_{month}_{day}.parquet'
    file_path = os.path.join(folder_path, file_name)

    # Load the news, drop rows without headline or tickers, and expose the new
    # 0..n-1 row position as an 'id' column.
    df = (
        load_news(date)
        .dropna(subset=['headline', 'tickers'])
        .reset_index(drop=True)
        .reset_index()
        .rename(columns={'index': 'id'})
    )
    # Save the data to file in parquet format
    df.to_parquet(file_path)

def save_news(date):
    """Save the Bloomberg news of `date` for both languages, English first, then Chinese."""
    for lang in ('English', 'Chinese'):
        Save_bloomberg_news(lang, date)

In [151]:
# For every row of df_c write four lines: headline, translation, and the two
# (delay, similarity) pairs.
with open('/mnt/research-live/user/yzhong/match_similarity.txt', 'w') as out:
    for row in df_c.itertuples():
        print(row.headline, file=out)
        print(row.translation, file=out)
        print(f' Sim: {row.delay} {row.sim}   ', file=out)
        print(f' Sim_cos: {row.delay_cos} {row.sim_cos} ', file=out)

## Process the full dataset

In [17]:
# Load the Chinese news file for 2011-04-05 into `df`.
# Note: this tree uses un-padded month/day in folder and file names (2011/4, ..._2011_4_5).
df= pd.read_parquet('/mnt/research-live/user/yzhong/Chinese_news_data_t/2011/4/bloomberg_news_chinese_2011_4_5.parquet')

In [22]:
# Load the English news file for 2011-04-05 into `df`.
# Note: this tree uses zero-padded month/day in folder and file names (2011/04, ..._2011_04_05).
df= pd.read_parquet('/mnt/research-live/user/yzhong/English_news_data/2011/04/bloomberg_news_english_2011_04_05.parquet')

In [24]:
def assign_match_headline_cos(df_c,row,matrix,matches):

    ''' Assign match headline and cosine similarity values

    Writes the best-scoring English candidate into the 'match_cs_headline',
    'match_cs_cos', 'match_cs_date' and 'match_cs_suid' columns of df_c, at the
    index label of `row`.

    Inputs:
    - df_c: DataFrame containing Chinese news headlines (modified in place)
    - row: current row of the DataFrame being processed (an itertuples row; only row.Index is used)
    - matrix: 2-D array of cosine similarity scores; row 0 holds one score per row of `matches`, in the same order
    - matches: DataFrame containing English news headlines that match the tickers and last_update of the current row

    Outputs: None
    '''
    scores = np.asarray(matrix)[0]
    # Position (not index label) of the highest score among the candidates.
    best_pos = int(np.argmax(scores))
    # `matches` may be a filtered subset that still carries the index labels of
    # the full English frame, so the position must be resolved with .iloc;
    # label-based .at raises KeyError or picks another row in that case.
    df_c.at[row.Index, 'match_cs_headline'] = matches['headline'].iloc[best_pos]
    df_c.at[row.Index, 'match_cs_cos'] = scores[best_pos]
    df_c.at[row.Index, 'match_cs_date'] = matches['last_update'].iloc[best_pos]
    df_c.at[row.Index, 'match_cs_suid'] = matches['suid'].iloc[best_pos]

def match (df_c,df_e):
    ''' Pair every Chinese headline with its closest English headline.

    Candidates are the English rows returned by find_matches_tickers for the
    row's tickers and last_update; among them the best one is chosen by LASER
    cosine similarity.

    Inputs:
    - df_c: DataFrame containing Chinese news headlines (modified in place)
    - df_e: DataFrame containing English news headlines

    Outputs:
    - df_c: the same DataFrame, with a boolean 'match' column and the columns
      written by assign_match_headline_cos
    '''
    df_c['match'] = False
    # One encoder instance is shared by all rows.
    laser = Laser()

    for row in df_c.itertuples():
        # De-duplicate the comma-separated ticker string of the current row.
        unique_tickers = list(set(row.tickers.split(', ')))
        matches = find_matches_tickers(unique_tickers, row.last_update, df_e)
        if len(matches) == 0:
            # No candidate: the row keeps match == False.
            continue

        df_c.at[row.Index, 'match'] = True
        cosine_similarities = laser_cosine(row.headline, matches.headline.to_list(), laser)
        assign_match_headline_cos(df_c, row, cosine_similarities, matches)

    return df_c

In [25]:
# Define the paths to the Chinese and English news data folders
chinese_folder_path = '/mnt/research-live/user/yzhong/Chinese_news_data_t'
english_folder_path = '/mnt/research-live/user/yzhong/English_news_data'
# Make sure the output folder exists before the first to_parquet call.
os.makedirs('/mnt/research-live/user/yzhong/Match_news', exist_ok=True)

# Walk the Chinese tree (<year>/<month>/<file>) and match every daily file
# against the English file of the same day.
for year_folder in os.listdir(chinese_folder_path):
    year_path = os.path.join(chinese_folder_path, year_folder)
    if not os.path.isdir(year_path):
        continue
    for month_folder in os.listdir(year_path):
        month_path = os.path.join(year_path, month_folder)
        if not os.path.isdir(month_path):
            continue
        for file_name in os.listdir(month_path):
            # Only handle parquet files containing the string 'bloomberg_news_chinese'
            if not (file_name.endswith('.parquet') and 'bloomberg_news_chinese' in file_name):
                continue

            # The Chinese tree uses un-padded month/day, the English tree zero-padded ones.
            year = year_folder
            month = month_folder
            month_str = f"{int(month):02d}"
            day = file_name.split('_')[-1].split('.')[0]
            day_str = f"{int(day):02d}"

            english_file = os.path.join(english_folder_path, year, month_str, f'bloomberg_news_english_{year}_{month_str}_{day_str}.parquet')
            if not os.path.exists(english_file):
                # Without this guard the English frame of a previous day (or an
                # undefined df_e) would be matched against this Chinese file.
                print(f'No English news for {year}-{month_str}-{day_str}, skipping: {english_file}')
                continue

            chinese_file = os.path.join(month_path, file_name)
            print(chinese_file)
            df_c = pd.read_parquet(chinese_file)
            # Rebuild 'id' from the row position of this daily file.
            del df_c['id']
            df_c = df_c.reset_index().rename(columns={'index': 'id'})

            print(english_file)
            df_e = pd.read_parquet(english_file)
            del df_e['id']
            df_e = df_e.reset_index().rename(columns={'index': 'id'})

            match_df = match(df_c, df_e)
            # create the file path and name (kept in its own variable so the
            # loop variable file_name is not overwritten)
            output_file = f"/mnt/research-live/user/yzhong/Match_news/bloomberg_news_chinese_matching_{year}_{month_str}_{day_str}.parquet"
            match_df.to_parquet(output_file)



/mnt/research-live/user/yzhong/Chinese_news_data_t/2018/11/bloomberg_news_chinese_2018_11_25.parquet
/mnt/research-live/user/yzhong/English_news_data/2018/11/bloomberg_news_english_2018_11_25.parquet
/mnt/research-live/user/yzhong/Chinese_news_data_t/2018/11/bloomberg_news_chinese_2018_11_18.parquet
/mnt/research-live/user/yzhong/English_news_data/2018/11/bloomberg_news_english_2018_11_18.parquet
/mnt/research-live/user/yzhong/Chinese_news_data_t/2018/11/bloomberg_news_chinese_2018_11_16.parquet
/mnt/research-live/user/yzhong/English_news_data/2018/11/bloomberg_news_english_2018_11_16.parquet
/mnt/research-live/user/yzhong/Chinese_news_data_t/2018/11/bloomberg_news_chinese_2018_11_2.parquet
/mnt/research-live/user/yzhong/English_news_data/2018/11/bloomberg_news_english_2018_11_02.parquet
/mnt/research-live/user/yzhong/Chinese_news_data_t/2018/11/bloomberg_news_chinese_2018_11_6.parquet
/mnt/research-live/user/yzhong/English_news_data/2018/11/bloomberg_news_english_2018_11_06.parquet
/m

## Test with LASER

https://github.com/facebookresearch/LASER

In [None]:
# Rebuild the 'id' column of both frames: drop the stored 'id', then turn the
# current index into a new 'id' column.
# NOTE(review): `del` mutates the existing frames in place; df_c / df_e must
# already be loaded by an earlier cell.
del df_c['id']
df_c=df_c.reset_index().rename(columns={'index': 'id'})
del df_e['id']
df_e=df_e.reset_index().rename(columns={'index': 'id'})

#### Using Laser

In [None]:
from laserembeddings import Laser

# Instantiate the LASER encoder and collect the headlines of both frames as plain lists.
laser = Laser()
chinese_sentences=df_c['headline'].to_list()
english_sentences=df_e['headline'].to_list()

In [None]:
# Embed the headlines of each language with LASER and wrap the results in torch tensors.
chinese_emb = torch.tensor(laser.embed_sentences(chinese_sentences, lang='zh'))
english_emb = torch.tensor(laser.embed_sentences(english_sentences, lang='en'))

#### Using translation embedding

In [None]:
# Alternative to the LASER cell: build the tensors from the 'embedding' column stored in each frame.
# NOTE(review): this assigns the same names (chinese_emb / english_emb) as the
# LASER cell, so whichever of the two cells ran last wins.
chinese_emb=torch.tensor(df_c['embedding'].to_list())
english_emb=torch.tensor(df_e['embedding'].to_list())

#### Calculate cosine similarity

In [None]:
# Cosine similarity between every Chinese and every English embedding
# (one row per Chinese sentence, one column per English sentence).
c = cosinus_similarity(chinese_emb, english_emb)
c = c.numpy()
# Get the indices that would sort each row in descending order
sorted_indices = np.argsort(-c, axis=1)
# Get the indices of the top three values for each row
top_three_indices = sorted_indices[:, :3]

# Columns of the frame holding the three closest neighbours of each sentence
sim_columns = ['id', 'Sim1', 'Sim2', 'Sim3','tickers1','tickers2','tickers3', 'date1', 'date2', 'date3','cos1', 'cos2', 'cos3']
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call.
sim_records = []

# Loop over the rows of the `top_three_indices` array
for i, top_three in enumerate(top_three_indices):
    # Fetch the three English rows in ranking order. reindex works on index
    # labels, so this relies on df_e's index being equal to its 'id' column.
    matches = df_e.loc[df_e['id'].isin(top_three)]
    matches = matches.reindex(index=top_three).reset_index(drop=True)

    # One output row per Chinese sentence, keyed by its position
    new_row = {'id': i}

    # Add headline, date, score and tickers of each neighbour
    for j, match_row in matches.iterrows():
        new_row[f'Sim{j+1}'] = match_row['headline']
        new_row[f'date{j+1}'] = match_row['last_update']
        new_row[f'cos{j+1}'] = c[i][top_three[j]]
        new_row[f'tickers{j+1}'] = match_row['tickers']

    sim_records.append(new_row)

sim_df = pd.DataFrame(sim_records, columns=sim_columns)

In [None]:
# Attach the three nearest English headlines to each Chinese row (join on 'id'),
# persist the result as parquet, then read it back into df_sim.
merged_df = pd.merge(df_c, sim_df, on='id')
merged_df.to_parquet('/mnt/research-live/user/yzhong/Translation_similarity_2012_1_10.parquet')
df_sim=pd.read_parquet("/mnt/research-live/user/yzhong/Translation_similarity_2012_1_10.parquet")

In [None]:
# Human-readable dump: for every merged row write the Chinese headline, its
# tickers and translation, then tickers / score / headline of the three neighbours.
with open('/mnt/research-live/user/yzhong/Translation_similarity_2012_1_10.txt', 'w') as out:
    for row in merged_df.itertuples():
        print(row.headline, file=out)
        print(row.tickers, file=out)
        print(row.translation, file=out)
        print(row.tickers1, file=out)
        print(f' Sim1: {row.cos1}  {row.Sim1}', file=out)
        print(row.tickers2, file=out)
        print(f' Sim2: {row.cos2} {row.Sim2}', file=out)
        print(row.tickers3, file=out)
        print(f' Sim3: {row.cos3} {row.Sim3}', file=out)

In [None]:
def sim_dataframe (similarity_matrix,df_e,df_c):
    """Attach the (up to) three closest English headlines to every Chinese row.

    Parameters:
    similarity_matrix: one list per Chinese sentence, each a ranked list of
        dicts with the keys 'corpus_id' and 'score'.
    df_e (DataFrame): English news with 'id', 'headline' and 'last_update';
        rows are looked up by index label, so the index must equal 'id'.
    df_c (DataFrame): Chinese news with an 'id' column equal to the position of
        the sentence in similarity_matrix.

    Returns:
    DataFrame: df_c merged on 'id' with the columns Sim1-3, date1-3 and cos1-3.
    """
    sim_columns = ['id', 'Sim1', 'Sim2', 'Sim3', 'date1', 'date2', 'date3','cos1', 'cos2', 'cos3']
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    sim_records = []
    for i, dic in enumerate(similarity_matrix):
        # Slicing (instead of indexing 0..2) tolerates fewer than three hits.
        hits = dic[:3]
        top_three = [hit['corpus_id'] for hit in hits]
        top_three_score = [hit['score'] for hit in hits]
        # Get the corresponding rows from `df_e`, in ranking order
        matches = df_e.loc[df_e['id'].isin(top_three)]
        matches = matches.reindex(index=top_three).reset_index(drop=True)

        new_row = {'id': i}
        # Add headline, date and score of each neighbour
        for j, match_row in matches.iterrows():
            new_row[f'Sim{j+1}'] = match_row['headline']
            new_row[f'date{j+1}'] = match_row['last_update']
            new_row[f'cos{j+1}'] = top_three_score[j]
        sim_records.append(new_row)

    sim_df = pd.DataFrame(sim_records, columns=sim_columns)
    # Merge once, after the loop: merging inside it redid the join for every
    # sentence and left the result undefined for an empty similarity_matrix.
    merged_df = pd.merge(df_c, sim_df, on='id')
    return merged_df

In [None]:
# Join df_c_t with the nearest-neighbour table on 'id', persist the result as
# parquet (path relative to the working directory), then read it back into df_sim.
merged_df = pd.merge(df_c_t, sim_df, on='id')
merged_df.to_parquet('translated_similarity_2016_03_01.parquet')
df_sim=pd.read_parquet("translated_similarity_2016_03_01.parquet")

In [None]:
# Human-readable dump: headline, translation, then score and headline of the
# three nearest English neighbours for every merged row.
with open('translated_similarity.txt', 'w') as out:
    for row in merged_df.itertuples():
        print(row.headline, file=out)
        print(row.translation, file=out)
        print(f' Sim1: {row.cos1}  {row.Sim1}', file=out)
        print(f' Sim2: {row.cos2} {row.Sim2}', file=out)
        print(f' Sim3: {row.cos3} {row.Sim3}', file=out)

In [None]:
chinese_emb.shape  # (number of sentences, embedding dimension)

(302, 1024)

Read data

In [None]:
# NOTE(review): sim_dataframe is defined twice in this notebook; this later
# definition is the one in effect after a top-to-bottom run.
def sim_dataframe (similarity_matrix,df_e,df_c):
    """Attach the (up to) three closest English headlines to every Chinese row.

    Parameters:
    similarity_matrix: one list per Chinese sentence, each a ranked list of
        dicts with the keys 'corpus_id' and 'score'.
    df_e (DataFrame): English news with 'id', 'headline' and 'last_update';
        rows are looked up by index label, so the index must equal 'id'.
    df_c (DataFrame): Chinese news with an 'id' column equal to the position of
        the sentence in similarity_matrix.

    Returns:
    DataFrame: df_c merged on 'id' with the columns Sim1-3, date1-3 and cos1-3.
    """
    sim_columns = ['id', 'Sim1', 'Sim2', 'Sim3', 'date1', 'date2', 'date3','cos1', 'cos2', 'cos3']
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    sim_records = []
    for i, dic in enumerate(similarity_matrix):
        # Slicing (instead of indexing 0..2) tolerates fewer than three hits.
        hits = dic[:3]
        top_three = [hit['corpus_id'] for hit in hits]
        top_three_score = [hit['score'] for hit in hits]
        # Get the corresponding rows from `df_e`, in ranking order
        matches = df_e.loc[df_e['id'].isin(top_three)]
        matches = matches.reindex(index=top_three).reset_index(drop=True)

        new_row = {'id': i}
        # Add headline, date and score of each neighbour
        for j, match_row in matches.iterrows():
            new_row[f'Sim{j+1}'] = match_row['headline']
            new_row[f'date{j+1}'] = match_row['last_update']
            new_row[f'cos{j+1}'] = top_three_score[j]
        sim_records.append(new_row)

    sim_df = pd.DataFrame(sim_records, columns=sim_columns)
    # Merge once, after the loop: merging inside it redid the join for every
    # sentence and left the result undefined for an empty similarity_matrix.
    merged_df = pd.merge(df_c, sim_df, on='id')
    return merged_df

In [None]:
def find_match(df1,df2):
    """Find, for each translated Chinese headline, the three most similar English headlines.

    Parameters:
    df1 (DataFrame): Chinese news; needs the columns 'translation' and 'id'.
        Its 'translation' column is overwritten in place with the cleaned text.
    df2 (DataFrame): English news; needs a 'headline' column. Rows with missing
        values are dropped and the index is reset IN PLACE on the caller's frame.

    Returns:
    The DataFrame produced by sim_dataframe(similarity_matrix, df2, df1).

    Relies on names that are not parameters and must exist in the enclosing
    scope: clean_str, compute_length, sentence_embeddings, model, batch_size.
    """
    #df1 chinese df2 english
    #df1.dropna(inplace=True)
    #df1.reset_index(drop=True, inplace=True)
    #df1=df1.reset_index().rename(columns={'index': 'id'})
    #print(df1.columns)

    # Clean the English frame and give it a positional 'id' column.
    # NOTE(review): if df2 already carries an 'id' column, the rename below
    # would produce a second column with the same name - confirm the input schema.
    df2.dropna(inplace=True)
    df2.reset_index(drop=True, inplace=True)
    df2=df2.reset_index().rename(columns={'index': 'id'})

    # Normalise the texts that are going to be embedded.
    df1.loc[:,"translation"] = df1.translation.apply(lambda x: clean_str(x))
    df2.loc[:,"headline"] = df2.headline.apply(lambda x: clean_str(x))
    # NOTE(review): from_dict receives a DataFrame here; Dataset.from_pandas may
    # be the intended constructor - confirm.
    ds_t = Dataset.from_dict(df1)
    ds_e = Dataset.from_dict(df2)
    # Keep only the text column and 'id' in each dataset.
    cols_to_remove = ds_t.column_names
    cols_to_remove.remove("translation")
    cols_to_remove.remove("id")
    ds_t1=ds_t.remove_columns(cols_to_remove)
    cols_to_remove = ds_e.column_names
    cols_to_remove.remove("headline")
    cols_to_remove.remove("id")
    ds_e1=ds_e.remove_columns(cols_to_remove)
    # Sort by length (descending) before embedding, then restore the 'id' order
    # so that position i of the embedding list corresponds to id i.
    ds_t1 = ds_t1.map(lambda x: compute_length(x, text='translation'), batched=True).sort('length', reverse=True)
    ds_t1 = ds_t1.map(lambda x: sentence_embeddings(x ,model, text='translation'), batched=True, batch_size=batch_size).sort('id')
    ds_e1 = ds_e1.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)
    ds_e1 = ds_e1.map(lambda x: sentence_embeddings(x, model,text='headline'), batched=True, batch_size=batch_size).sort('id')
    # Despite their names these two variables hold the 'embeddings' columns.
    translated_input_ids=ds_t1['embeddings']
    originated_input_ids=ds_e1['embeddings']
    emb1=torch.tensor(translated_input_ids)
    emb2=torch.tensor(originated_input_ids)
    # Top-3 English neighbours (emb2 = corpus) for every translated headline (emb1 = queries).
    similarity_matrix=util.semantic_search(emb1, emb2, top_k=3)
    merge_df=sim_dataframe (similarity_matrix,df2,df1)
    return merge_df


In [None]:
# Load the Chinese news of April 2011 from a single monthly file.
# Note: this folder name contains spaces ('Chinese news data'), unlike the
# 'Chinese_news_data_t' tree used elsewhere in this notebook.
df_c=pd.read_parquet('/mnt/research-live/user/yzhong/Chinese news data/2011/bloomberg_news_chinese_2011_4.parquet')
import glob
import pandas as pd
# Load every daily English file of April 2011 and stack them into one frame.
file_list = glob.glob('/mnt/research-live/user/yzhong/English_news_data/2011/04/*.parquet')
dfs = []
for file_path in file_list:
    df = pd.read_parquet(file_path)
    dfs.append(df)
# The original row labels of each daily file are kept (no ignore_index).
df_e = pd.concat(dfs, axis=0)

In [None]:
# Apply clean_str (defined outside this cell) to the text columns that will be
# embedded; both frames are overwritten in place.
df_c.loc[:,"translation"] = df_c.translation.apply(lambda x: clean_str(x))
df_e.loc[:,"headline"] = df_e.headline.apply(lambda x: clean_str(x))

In [None]:
# Drop English rows with missing values, renumber the index from 0 and expose
# it as an 'id' column.
# NOTE(review): the first two statements mutate df_e in place, so this cell is
# not idempotent - re-running it on the result behaves differently.
df_e.dropna(inplace=True)
df_e.reset_index(drop=True, inplace=True)
df_e=df_e.reset_index().rename(columns={'index': 'id'})

In [None]:
# Wrap both frames as `datasets.Dataset` objects and report their sizes.
# NOTE(review): from_dict receives a DataFrame here; Dataset.from_pandas may be
# the intended constructor - confirm.
ds_t = Dataset.from_dict(df_c)
ds_e = Dataset.from_dict(df_e)
ne=len(ds_e['headline'])
nt=len(ds_t['translation'])
print (f'English news: {ne}')
print (f'Chinese news: {nt}')

In [None]:
# Keep only the text column and 'id' in each dataset: start from all column
# names, take out the two to keep, and remove the rest.
cols_to_remove = ds_t.column_names
cols_to_remove.remove("translation")
cols_to_remove.remove("id")
ds_t1=ds_t.remove_columns(cols_to_remove)
# This bare expression is not the last statement of the cell, so it displays nothing.
ds_t1
cols_to_remove = ds_e.column_names
cols_to_remove.remove("headline")
cols_to_remove.remove("id")
ds_e1=ds_e.remove_columns(cols_to_remove)
# Last expression of the cell: displays the trimmed English dataset.
ds_e1

In [None]:
# For each dataset: add a 'length' column, sort by it in descending order, embed
# the text in batches of `batch_size`, then restore the 'id' order.
# compute_length, sentence_embeddings, model and batch_size come from outside this cell.
# presumably the length sort groups similar-sized texts per batch - confirm.
ds_t1 = ds_t1.map(lambda x: compute_length(x, text='translation'), batched=True).sort('length', reverse=True)
ds_t1 = ds_t1.map(lambda x: sentence_embeddings(x ,model, text='translation'), batched=True, batch_size=batch_size).sort('id')
ds_e1 = ds_e1.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)
ds_e1 = ds_e1.map(lambda x: sentence_embeddings(x, model,text='headline'), batched=True, batch_size=batch_size).sort('id')

In [None]:
# Despite their names these two variables hold the 'embeddings' columns.
translated_input_ids=ds_t1['embeddings']
originated_input_ids=ds_e1['embeddings']
emb1=torch.tensor(translated_input_ids)
emb2=torch.tensor(originated_input_ids)
# Top-3 English neighbours (emb2 = corpus) for every translated headline (emb1 = queries).
similarity_matrix=util.semantic_search(emb1, emb2, top_k=3)

In [None]:
# Inspect the raw semantic_search result.
similarity_matrix

In [None]:
# Create an empty dataframe to store the three closest neighbours for each sentences
# Columns of the frame holding the three closest neighbours of each sentence
sim_columns = ['id', 'Sim1', 'Sim2', 'Sim3', 'date1', 'date2', 'date3','cos1', 'cos2', 'cos3']
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call.
sim_records = []
for i, dic in enumerate(similarity_matrix):
    # Slicing (instead of indexing 0..2) tolerates fewer than three hits.
    hits = dic[:3]
    top_three = [hit['corpus_id'] for hit in hits]
    top_three_score = [hit['score'] for hit in hits]
    # Fetch the English rows in ranking order. reindex works on index labels,
    # so this relies on df_e's index being equal to its 'id' column.
    matches = df_e.loc[df_e['id'].isin(top_three)]
    matches = matches.reindex(index=top_three).reset_index(drop=True)
    # One output row per translated sentence, keyed by its position
    new_row = {'id': i}

    # Add headline, date and score of each neighbour
    for j, match_row in matches.iterrows():
        new_row[f'Sim{j+1}'] = match_row['headline']
        new_row[f'date{j+1}'] = match_row['last_update']
        new_row[f'cos{j+1}'] = top_three_score[j]

    sim_records.append(new_row)

sim_df = pd.DataFrame(sim_records, columns=sim_columns)
    

In [None]:
# Display the nearest-neighbour table.
sim_df 

In [None]:
import os
import pandas as pd

# Root folder of the per-day files, laid out as <year>/<month>/<file>.
# NOTE(review): this writes under /mnt/live/... while the input parquet below
# and the cell that later reads Chinese_news_data_t use /mnt/research-live/...
# - confirm which mount is intended.
output_root = "/mnt/live/user/yzhong/Chinese_news_data_t"
os.makedirs(output_root, exist_ok=True)

# Read the data
df = pd.read_parquet("/mnt/research-live/user/yzhong/bloomberg_news_chinese_trickers_translation_emb.parquet")

# Extract year, month and day from the 'date' column (parsed once, not three times)
dates = pd.DatetimeIndex(df['date'])
df['year'] = dates.year
df['month'] = dates.month
df['day'] = dates.day

# One pass over the frame: groupby yields each calendar day exactly once and
# skips rows whose date is missing, instead of re-filtering the full frame for
# every year / month / day combination.
for (year, month, day), day_rows in df.groupby(['year', 'month', 'day'], sort=False):
    # int() keeps names such as '2011' even if the key columns were upcast to float.
    year, month, day = int(year), int(month), int(day)
    month_folder_path = os.path.join(output_root, str(year), str(month))
    os.makedirs(month_folder_path, exist_ok=True)
    # drop() returns a new frame; `del` on a filtered slice of df triggers
    # pandas' SettingWithCopy warning.
    day_df = day_rows.drop(columns=['year', 'month', 'day'])
    filename = f"bloomberg_news_chinese_{year}_{month}_{day}.parquet"
    filepath = os.path.join(month_folder_path, filename)
    day_df.to_parquet(filepath)
