### Import the package

In [1]:
# en python:
import os
os.environ["https_proxy"] = "http://proxy.fr.cfm.fr:6060"

In [2]:
#from nltk.tokenize import sent_tokenize
from datasets import Dataset
import numpy as np
import pandas as pd
import regex as re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BertTokenizer, BertModel

In [26]:
from sentence_transformers import SentenceTransformer

### Prepare the dataset

#### Read dataset

In [3]:
chinese_data = {'headline': ['雷诺CEO认为对中国增长的担忧过度', '吉祥航空股价在上海上涨1.65报21.60元人民币', '制药公司Valeant称正接受美国证券交易委员会调查股价大跌','嘉能可的目标是通过资产出售再筹资40亿50亿美元']}
df_c = pd.DataFrame(chinese_data)
# English dataset
english_data = {'headline': ['*RENAULT CEO SEES CHINA GROWTH CONCERNS OVERSTATED' ,'*GLENCORE TARGETING ADDITIONAL $4-5 BILLION FROM ASSET SALES','*JUNEYAO AIRLINES SHARES RISE 1.65% TO 21.60 YUAN IN SHANGHA', "*VALEANT PHARMA ERASES LOSS, SHARES UP AS MUCH AS 3%",'*SUNRAIN SOLAR SHARES RISE 1.03% TO 6.84 YUAN IN SHANGHAI', '*POLY REAL ESTATE SHARES RISE 1.10% TO 9.18 YUAN IN SHANGHA', "Growth at More Reasonable Price Emerges With China's Stock Slump","*DAVID HERRO SAYS CHINA ECONOMY WILL BE FINE MEDIUM TO LONG-TERM",' Valeant Shares Decline on News of SEC Investigation','Gretchen K Zech, SVP, Globa, Sells 7,500 ARW US 02/29/16']}
df_e = pd.DataFrame(english_data)

#### Data cleaning

Clean the headlilne column so the text line does not contain strange symbols, for Chinese, also needs to remove the unnecessary space. 

Remove space

In [4]:
#remove space
def remove_space(text):
    text=text.replace(' ','')
    return text

In [6]:
# remove extra spaces for enlish : 
def remove_extra(text):
    clean_text = re.sub(r'\s+', ' ', text)
    return clean_text

Chinese punctuation size is different from English, which may cause confusion when doing regex and translation tasks

In [5]:
#Full-width to half-width, the chinese language the 
def full_to_half(sentence):      
    change_sentence=""
    for word in sentence:
        inside_code=ord(word)
        if inside_code==12288:    #Direct conversion of full-width spaces
            inside_code=32
        elif inside_code>=65281 and inside_code<=65374:  #Full-width characters (except spaces) are converted according to the relationship
            inside_code-=65248
        change_sentence+=chr(inside_code)
    return change_sentence

Some useless informations to remove: things in paratheses "(" "[", the * symbol

In [7]:
def remove_special_chars(text):
    special_chars = r'[&*+\-\/<=>?@\^_|~]'
    # replace special characters with an empty string
    cleaned_text = re.sub(special_chars, '', text)
    pattern = r'\(.*?\)' # remove things in paratheses
    cleaned_text = re.sub(pattern, '', cleaned_text)
    return cleaned_text

In [8]:
def Process_clean(df,lg='zh'):
    if lg=='zh': 

        df.loc[:,"headline"] = df.headline.apply(lambda x: full_to_half(x))

        
    df.loc[:,"headline"] = df.headline.apply(lambda x: remove_special_chars(x))

    if lg=='zh':
        df.loc[:,"headline"] = df.headline.apply(lambda x: remove_space(x))
    return df

Apply data cleaning on both Chinese and English dataset

In [9]:
df_c=Process_clean(df_c, lg='zh')
df_e=Process_clean(df_e, lg='en')

In [11]:
df_e

Unnamed: 0,headline
0,RENAULT CEO SEES CHINA GROWTH CONCERNS OVERSTATED
1,GLENCORE TARGETING ADDITIONAL $45 BILLION FROM...
2,JUNEYAO AIRLINES SHARES RISE 1.65% TO 21.60 YU...
3,"VALEANT PHARMA ERASES LOSS, SHARES UP AS MUCH ..."
4,SUNRAIN SOLAR SHARES RISE 1.03% TO 6.84 YUAN I...
5,POLY REAL ESTATE SHARES RISE 1.10% TO 9.18 YUA...
6,Growth at More Reasonable Price Emerges With C...
7,DAVID HERRO SAYS CHINA ECONOMY WILL BE FINE ME...
8,Valeant Shares Decline on News of SEC Investi...
9,"Gretchen K Zech, SVP, Globa, Sells 7,500 ARW U..."


### Translation Model

#### Helper functions

Calculate the length of the sentences so the batches will be patched efficiently

In [12]:
def compute_length(batch, text='headline'):
    """
    compute_length:
        Computes the length of a text (nb. of characters).
        
    Args:
        batch ():
        text (str): Which key gives the text.
    
    Returns:
        The new dataset with the length of the string of text.
    """
    return {
        'length': [len(item) for item in batch[text]]
    }

Translate function

In [13]:
def translate(batch, tokenizer, model, text='headline'):
    """
    translate
    
    Args:
        batch ():
        tokenizer (Transformers.tokenizer):
        model (Transformers.model):
        
    Returns:
        The new dataset with a 'translation' column that have the translated text.
    """
    tokenized_batch = tokenizer(
        batch[text],
        padding="longest",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    with torch.no_grad():
        translation = model.generate(**tokenized_batch)
    return {
        'translation': tokenizer.batch_decode(translation, skip_special_tokens=True)
    }


#### Create the model

In [14]:
# Load pre-trained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
batch_size : int = 1

Prepare dataset

In [15]:
#add a column with index as id
df_c=df_c.reset_index().rename(columns={'index': 'id'})
ds = Dataset.from_dict(df_c)

#### Apply translation model to dataset in batches

In [16]:
ds = ds.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)
ds = ds.map(lambda x: translate(x, tokenizer, model, text='headline'), batched=True, batch_size=batch_size).sort('id')


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]



#### Write results into file

In [17]:
ds.set_format("pandas")
df_c_t = ds[:]

In [21]:
df_c_t

Unnamed: 0,id,headline,length,translation
0,0,雷诺CEO认为对中国增长的担忧过度,17,RenoCeo thinks that China's growth worries are...
1,1,吉祥航空股价在上海上涨1.65报21.60元人民币,25,Hsiang Air stock price rises in Shanghai by 1....
2,2,制药公司Valeant称正接受美国证券交易委员会调查股价大跌,30,"Valeant, the pharmaceutical company, says it's..."
3,3,嘉能可的目标是通过资产出售再筹资40亿50亿美元,24,Gretchen's goal is to raise $4.5 billion more ...


### Similarity Model

#### Cleaning the data

An extra step of data cleaning need to be apply here since now we want to obtain here unlike the one for translation. This step can remove more unnecessary informationis.

In [18]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    #string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r'\(.*?\)', " \'d", string)
     # remove things in paratheses
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()

In [19]:
df_c_t.loc[:,"translation"] = df_c_t.translation.apply(lambda x: clean_str(x))

In [20]:
df_c_t

Unnamed: 0,id,headline,length,translation
0,0,雷诺CEO认为对中国增长的担忧过度,17,renoceo thinks that china 's growth worries ar...
1,1,吉祥航空股价在上海上涨1.65报21.60元人民币,25,hsiang air stock price rises in shanghai by 1....
2,2,制药公司Valeant称正接受美国证券交易委员会调查股价大跌,30,"valeant , the pharmaceutical company , says it..."
3,3,嘉能可的目标是通过资产出售再筹资40亿50亿美元,24,gretchen 's goal is to raise $4.5 billion more...


In [21]:
df_e=df_e.reset_index().rename(columns={'index': 'id'})

In [22]:
df_e.loc[:,"headline"] = df_e.headline.apply(lambda x: clean_str(x))

#### Tokenize the sentences

Prepare the dataset

In [23]:
ds_t = Dataset.from_dict(df_c_t)
ds_e = Dataset.from_dict(df_e)

In [24]:
ne=len(ds_e['headline'])
nt=len(ds_t['translation'])
print (f'English news: {ne}')
print (f'Chinese news: {nt}')

English news: 10
Chinese news: 4


Prepare the model

In [27]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [28]:
# Tokenize the sentences and convert them to input IDs
def sentence_tokenization(batch, text='translation'):
   # inputs = tokenizer(batch[text], padding= True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        embeddings = model.encode(batch[text])
    return {'embeddings': embeddings}

In [44]:
# Tokenize the sentences and convert them to input IDs
def tokenize_and_encode(batch,tokenizer, text='translation'):
    inputs = tokenizer(batch[text], padding= True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        embeddings = outputs.last_hidden_state.mean(dim=1)
        #translated_embeddings.append(embeddings)
    return {'embeddings': embeddings.tolist()}

    #return {'input_ids':inputs['input_ids'].tolist(), 'attention_mask':inputs['attention_mask'].tolist()}
    #return {'input_ids':inputs['input_ids'], 'attention_mask':inputs['attention_mask']}

In [29]:
# Use the map method to apply the mapping function to the dataset in batches
batch_size =1
ds_t = ds_t.map(lambda x: compute_length(x, text='translation'), batched=True).sort('length', reverse=True)
ds_t = ds_t.map(lambda x: sentence_tokenization(x, text='translation'), batched=True, batch_size=batch_size).sort('id')

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [30]:
ds_e = ds_e.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)
ds_e = ds_e.map(lambda x: sentence_tokenization(x, text='headline'), batched=True, batch_size=batch_size).sort('id')

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [47]:
# # Use the map method to apply the mapping function to the dataset in batches
# batch_size =1
# ds_t = ds_t.map(lambda x: compute_length(x, text='translation'), batched=True).sort('length', reverse=True)
# ds_t = ds_t.map(lambda x: tokenize_and_encode(x,tokenizer, text='translation'), batched=True, batch_size=batch_size).sort('id')
# ds_e = ds_e.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)
# ds_e = ds_e.map(lambda x: tokenize_and_encode(x,tokenizer, text='headline'), batched=True, batch_size=batch_size).sort('id')

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Retrive the embeddings into list

In [31]:
translated_input_ids=ds_t['embeddings']
originated_input_ids=ds_e['embeddings']

Turn the embedding into tensors in order to calculate the similarity matrix

In [32]:
emb1=torch.tensor(translated_input_ids)
emb2=torch.tensor(originated_input_ids)

Expand the dimensions of the embeddings so that they can be broadcasted across each other.

In [34]:
import torch.nn.functional as F
def cosinus_similarity (emb1, emb2):
# cosine similarity = normalize the vectors & multiply
    C = F.normalize(emb1) @ F.normalize(emb2).t()
    return C

In [35]:
C=cosinus_similarity (emb1, emb2)

In [None]:
target_sentence = "This is the targeted sentence."

# Select the targeted sentence and its three most similar neighbors
neighbors = df.loc[df['original_sentence'].isin([target_sentence] + list(similar_sentences.keys()))]

# Calculate the sentence embeddings and cosine similarity scores
target_embedding = sentence_tokenization({'translation': target_sentence})['embeddings']
neighbor_embeddings = [sentence_tokenization({'translation': neighbor})['embeddings'] for neighbor in neighbors['original_sentence']]
similarity_scores = [F.cosine_similarity(target_embedding, neighbor_embedding).item() for neighbor_embedding in neighbor_embeddings]

# Sort the neighbors in descending order based on their similarity scores
sorted_neighbors = neighbors.iloc[(-np.array(similarity_scores)).argsort()]

In [37]:
# Get the indices that would sort each row in descending order
sorted_indices = np.argsort(-C, axis=1) # this will sort in descending values
# Get the indices of the top three values for each row
top_three_indices = sorted_indices[:, :3]

In [42]:
# Create an empty dataframe to store the three closest neighbours for each sentences
sim_df = pd.DataFrame(columns=['id', 'Sim1', 'Sim2', 'Sim3','cos1', 'cos2', 'cos3'])

# Loop over the rows of the `top_three_indices` array
for i, top_three in enumerate(top_three_indices):
    # Get the index of the current row
    index = i
    # Get the corresponding rows from `df_e`
    matches = df_e.loc[df_e['id'].isin(top_three)]
    matches = matches.reset_index(drop=True)
    
    # Create a new row to add to `result_df`
    new_row= {'id': index}
    
    # Loop over the matches and add their headlines and dates to the new row
    for j, match_row in matches.iterrows():
        new_row[f'Sim{j+1}'] = match_row['headline']
        #new_row[f'date{j+1}'] = match_row['last_update']
        new_row[f'cos{j+1}'] = C[i][top_three[j]]
    
    # Add the new row to `sim_df`
    sim_df = sim_df.append(new_row, ignore_index=True)

In [43]:
merged_df = pd.merge(df_c_t, sim_df, on='id')

In [44]:
merged_df

Unnamed: 0,id,headline,length,translation,Sim1,Sim2,Sim3,cos1,cos2,cos3
0,0,雷诺CEO认为对中国增长的担忧过度,17,renoceo thinks that china 's growth worries ar...,renault ceo sees china growth concerns overstated,growth at more reasonable price emerges with c...,david herro says china economy will be fine me...,tensor(0.6291),tensor(0.5457),tensor(0.5111)
1,1,吉祥航空股价在上海上涨1.65报21.60元人民币,25,hsiang air stock price rises in shanghai by 1....,juneyao airlines shares rise 1.65% to 21.60 yu...,sunrain solar shares rise 1.03% to 6.84 yuan i...,growth at more reasonable price emerges with c...,tensor(0.5103),tensor(0.5038),tensor(0.4598)
2,2,制药公司Valeant称正接受美国证券交易委员会调查股价大跌,30,"valeant , the pharmaceutical company , says it...","valeant pharma erases loss , shares up as much...",growth at more reasonable price emerges with c...,valeant shares decline on news of sec investig...,tensor(0.6010),tensor(0.5998),tensor(0.3350)
3,3,嘉能可的目标是通过资产出售再筹资40亿50亿美元,24,gretchen 's goal is to raise $4.5 billion more...,glencore targeting additional $45 billion from...,poly real estate shares rise 1.10% to 9.18 yua...,"gretchen k zech , svp , globa , sells 7 , 500 ...",tensor(0.4304),tensor(0.3916),tensor(0.2375)


In [73]:
# Tokenize the sentences and convert them to input IDs
def sentence_tokenization_n(text):
   # inputs = tokenizer(batch[text], padding= True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        embeddings = model.encode(text)
    return torch.tensor(embeddings).unsqueeze(0)

In [71]:
emb_targ=sentence_tokenization_n('i have a dream')

In [74]:
emb_n=sentence_tokenization_n('i have a bad dream')

In [63]:
F.cosine_similarity(emb_targ.unsqueeze(0), emb_n.unsqueeze(0)).item()

0.7803442478179932

In [None]:
cosinus_similarity(target_embedding, neighbor_embedding)

In [85]:

def reorder_neighbors(df, sim1_col='Sim1', sim2_col='Sim2', sim3_col='Sim3', target_col='translation'):
    # Iterate over each row in the dataframe
    for index, row in df.iterrows():
        # Select the target sentence and its three most similar neighbors
        target_sentence = row[target_col]
        sim1 = row[sim1_col]
        sim2 = row[sim2_col]
        sim3 = row[sim3_col]
        
        neighbors = [target_sentence, sim1, sim2, sim3]

        # Calculate the sentence embeddings and cosine similarity scores
        target_embedding = sentence_tokenization_n(target_sentence)
        neighbor_embeddings = [sentence_tokenization_n(neighbor) for neighbor in neighbors]
        similarity_scores = [cosinus_similarity(target_embedding, neighbor_embedding).item() for neighbor_embedding in neighbor_embeddings]

        # Sort the neighbors in descending order based on their similarity scores
        sorted_neighbors = [(neighbor,score) for score, neighbor in sorted(zip(similarity_scores, neighbors), reverse=True)]
        
        # Update the dataframe with the reordered neighbors Similar to loc, in that both provide label-based lookups. Use at if you only need to get or set a single value in a DataFrame or Series.
        df.at[index, sim1_col] = sorted_neighbors[0][1]
        df.at[index, sim2_col] = sorted_neighbors[1][1]
        df.at[index, sim3_col] = sorted_neighbors[2][1]
        
    return df

In [86]:
reorder_neighbors(merged_df)

Unnamed: 0,id,headline,length,translation,Sim1,Sim2,Sim3
0,0,雷诺CEO认为对中国增长的担忧过度,17,renoceo thinks that china 's growth worries ar...,1,1,0.629124
1,1,吉祥航空股价在上海上涨1.65报21.60元人民币,25,hsiang air stock price rises in shanghai by 1....,1,1,0.51029
2,2,制药公司Valeant称正接受美国证券交易委员会调查股价大跌,30,"valeant , the pharmaceutical company , says it...",1,1,0.60104
3,3,嘉能可的目标是通过资产出售再筹资40亿50亿美元,24,gretchen 's goal is to raise $4.5 billion more...,1,1,0.43038


In [53]:
merged_df

Unnamed: 0,id,headline,length,translation,Sim1,Sim2,Sim3
0,0,雷诺CEO认为对中国增长的担忧过度,17,renoceo thinks that china 's growth worries ar...,renault ceo sees china growth concerns overstated,growth at more reasonable price emerges with c...,david herro says china economy will be fine me...
1,1,吉祥航空股价在上海上涨165报2160元人民币,23,hsiang air 's share price rose by 165 yuan in ...,juneyao airlines shares rise 1.65% to 21.60 yu...,sunrain solar shares rise 1.03% to 6.84 yuan i...,poly real estate shares rise 1.10% to 9.18 yua...
2,2,制药公司Valeant称正接受美国证券交易委员会调查股价大跌,30,"valeant , the pharmaceutical company , says it...","valeant pharma erases loss , shares up as much...",growth at more reasonable price emerges with c...,valeant shares decline on news of sec investig...
3,3,嘉能可的目标是通过资产出售再筹资40亿50亿美元,24,gretchen 's goal is to raise $4.5 billion more...,glencore targeting additional $45 billion from...,poly real estate shares rise 1.10% to 9.18 yua...,"gretchen k zech , svp , globa , sells 7 , 500 ..."


In [54]:
with open('test_sentence_embedding.txt', 'w') as f:
    for row in merged_df.itertuples():
        f.write(str(row.headline) + '\n')
        f.write(str(row.translation) + '\n')
        f.write(f' Sim1:  {row.Sim1}' + '\n')
        f.write(f' Sim2:  {row.Sim2}' + '\n')
        f.write(f' Sim3:  {row.Sim3}' + '\n')

In [57]:
import torch.nn.functional as F

C = F.cosine_similarity(emb1.unsqueeze(1), emb2, dim=-1)

In [53]:
# Compute the cosine similarity between the sentence embeddings
similarity_matrix = torch.nn.functional.cosine_similarity(emb1,emb2, dim=1)

#### Write the similarity matrix into numpy file

In [205]:
# Load the .npy file
cosine_similarities = np.load("cosine_similarities.npy")

# Load the .npz file
with np.load("cosine_similarities.npz") as data:
    cosine_similarities = data["arr_0"]

In [206]:
cosine_similarities

array([[-0.46736145,  0.6427436 ,  0.7512598 , ..., -0.5311961 ,
         0.6429858 , -0.24412113],
       [ 0.41588545,  0.6427436 ,  0.7512597 , ...,  0.47972023,
         0.6429858 ,  0.19264519],
       [ 0.41588545,  0.6427436 ,  0.7512598 , ...,  0.47972012,
         0.6429857 ,  0.19264519],
       ...,
       [ 0.41588545,  0.6427436 ,  0.7512598 , ...,  0.47972012,
         0.6429858 , -0.24412113],
       [ 0.41588545,  0.6427436 ,  0.7512598 , ...,  0.47972012,
         0.6429858 ,  0.19264519],
       [ 0.41588545,  0.6427436 ,  0.7512598 , ...,  0.47972012,
         0.6429857 ,  0.19264519]], dtype=float32)

In [40]:
# Example usage
news_titles = ["South Korea's exports fell by 122 in February.",
               "USD/JPY Bought by Leveraged Accounts: Trader; Nikkei Rebounds" ,
               "This  is  some       text    with   extra   spaces.  ",
               "PACIFIC BASIN EXTENDS GAINS, UP 16.7%, MOST SINCE DEC. 2008",
               'PACIFIC BASIN EXTENDS GAINS, UP 16.7%, MOST SINCE DEC. 2008',
               'MORE: Taiwan Sells NT$30b of 10-Yr Bonds at Record-Low 0.83%']
test= pd.DataFrame({"headline": news_titles})

In [42]:
ds_test = Dataset.from_dict(test)

Create the model

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example input sentences
translated_sentences = ['This is sentence 1', 'This is sentence 2', ...]  # 400 translated sentences
original_sentences = ['This is sentence 1', 'This is sentence 2', ...]  # 600 original sentences

# Tokenize the sentences and convert them to input IDs
def tokenize_and_encode(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    return inputs.input_ids, inputs.attention_mask

translated_input_ids, translated_attention_mask = tokenize_and_encode(translated_sentences)
original_input_ids, original_attention_mask = tokenize_and_encode(original_sentences)

# Process the sentences in batches
batch_size = 32
translated_embeddings = []
original_embeddings = []
for i in range(0, len(translated_sentences), batch_size):
    input_ids = translated_input_ids[i:i+batch_size]
    attention_mask = translated_attention_mask[i:i+batch_size]
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        translated_embeddings.append(embeddings)

for i in range(0, len(original_sentences), batch_size):
    input_ids = original_input_ids[i:i+batch_size]
    attention_mask = original_attention_mask[i:i+batch_size]
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        original_embeddings.append(embeddings)

# Concatenate the sentence embeddings
translated_embeddings = torch.cat(translated_embeddings, dim=0)
original_embeddings = torch.cat(original_embeddings, dim=0)

# Compute the cosine similarity between the sentence embeddings
similarity_matrix = torch.nn.functional.cosine_similarity(translated_embeddings, original_embeddings)


In [338]:
# Example usage
news_titles = ["南韓2月份出口同比下降12.2%",
               "日本东证指数收复失地上涨0.1% 日经225指数涨0.2%",
               "花旗外汇客户调查:英国公投结果料为留在欧盟",
               "凯基证券:上证综指中期底部料在2,600点"]
test = pd.DataFrame({"headline": news_titles})


In [341]:
test=test.reset_index().rename(columns={'index': 'id'})

In [342]:
#prepare the dataset 
ds = Dataset.from_dict(test)

In [343]:
ds = ds.map(lambda x: compute_length(x, text='headline'), batched=True).sort('length', reverse=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [344]:
ds = ds.map(lambda x: translate(x, tokenizer, model, text='headline'), batched=True, batch_size=batch_size).sort('id')

  0%|          | 0/1 [00:00<?, ?ba/s]

In [345]:
ds.set_format("pandas")

In [346]:
df_test = ds[:]
df_test

Unnamed: 0,id,headline,length,translation
0,0,南韓2月份出口同比下降122,14,South Korea's exports fell by 122 in February.
1,1,日本东证指数收复失地上涨01 日经225指数涨02,25,Japan's Eastern Evidence Index recovered the l...
2,2,花旗外汇客户调查:英国公投结果料为留在欧盟,21,Citigroup Foreign Exchange Client Survey: Brit...
3,3,"凯基证券:上证综指中期底部料在2,600点",21,Caki Securities: The upper certificate refers ...


In [327]:
# Example usage
news_titles = [" 功夫债观察:中国海外,宏洋财务IV(开曼)有限公司领涨",
               "* 日本东证, 指数收复失地上涨0.1% 日经225指数涨0.2%（12）" ,
               "This  is  some       text    with   extra   spaces.  ",
               "在美上市中概股:MultiMetaVerse Holdings Ltd下跌73%",
               ' +1.6% Y/Y: PCA (1)',
               'No. 02-8543 (E.D. Pa. Feb. 29, 2016), Court Opinion']
test= pd.DataFrame({"headline": news_titles})

In [39]:
# Example usage
news_titles = ["南韓2月份出口同比下降12.2%",
               "* 日本东证指数收复失地上涨0.1% 日经225指数涨0.2%",
               "花旗外汇客户调查:英国公投结果料为留在欧盟",
               "凯基证券:上证综指中期底部料在2,600点"]
df = pd.DataFrame({"text": news_titles})
ds = Dataset.from_dict(df)
translated_df = translate(ds, tokenizer, model, text='text')
print(translated_df)

{'translation': ["South Korea's exports fell 12.2 per cent in February.", ':: A 0.1 per cent increase in the recovery of lost ground and a 0.2 per cent increase in the 225-day-long index of Japan', 'Citigroup Foreign Exchange Client Survey: British referendum results expected to remain in the EU', 'Caki Securities: The upper certificate refers to medium-term bottom content at 2,600 p.m.']}


In [36]:
# Example usage
news_titles = ["南韓2月份出口同比下降12.2%",
               "日本东证指数收复失地上涨0.1% 日经225指数涨0.2%",
               "花旗外汇客户调查:英国公投结果料为留在欧盟",
               "凯基证券:上证综指中期底部料在2,600点"]
df = pd.DataFrame({"headline": news_titles})