This notebook tests a text-vectorization similarity method based on bert-base-chinese. It consists of two main parts, both using real data: short-text-to-long-text matching and short-text-to-short-text matching.

In [None]:
import json
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer

In [None]:
from google.colab import drive

# Mount Google Drive so the project files are reachable under /content/drive
drive.mount('/content/drive')

# Read the news workbook from the project's Task1_data folder into a DataFrame
data = pd.read_excel(
    '/content/drive/MyDrive/DSAA5002PROJECT/Task1_data/Be_filtered.xlsx'
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Bind a second name to the loaded DataFrame; this is an alias of `data`, not a copy
news_data = data

# Short-text to long-text matching
The dataset used in this section is what remained after I filtered the data using brute-force matching only. It has been deposited in the Data folder.

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained Chinese BERT tokenizer and encoder, then place the encoder on `device`
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

In [None]:
import re

In [None]:
# Load the A-share listing. The encoding is given explicitly so that reading the
# Chinese names does not depend on the platform's default encoding.
with open('/content/drive/MyDrive/DSAA5002PROJECT/dsaa5002project/dsaa5002_project/A_share_list.json', 'r', encoding='utf-8') as f:
    stock_list = json.load(f)

# Display names exactly as they appear in the listing
raw_stock_names = [stock['name'] for stock in stock_list]
stock_variants = set(raw_stock_names)

# Regex-escaped copies, kept under the original variable name for pattern use.
# They must NOT be fed to the tokenizer: re.escape inserts backslashes in front
# of characters such as '*' and ' ', which would change both the embedding and
# the name reported for a match.
stock_names = [re.escape(name) for name in raw_stock_names]


# Pre-compute one mean-pooled BERT vector per raw stock name, keyed by that name.
# Inference only, so gradient tracking is switched off to avoid building (and
# holding in memory) an autograd graph for every name.
stock_name_embeddings = {}
with torch.no_grad():
    for stock_name in raw_stock_names:
        inputs = tokenizer(stock_name, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # Average over the token axis -> array of shape (1, hidden_size)
        stock_name_embeddings[stock_name] = outputs.last_hidden_state.mean(1).cpu().numpy()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def match_stock_name(row, encode_method='mean', title_threshold=0., content_threshold=0.8):
    """Return the stock names whose embeddings are similar to a news row.

    The title and the body are each encoded with the module-level ``tokenizer``
    and ``model``, pooled over the token axis, and compared by cosine
    similarity against every vector in ``stock_name_embeddings``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``'Title'``
            and ``'NewsContent'``. Values that are not ``str`` are treated as
            empty text.
        encode_method: ``'mean'`` to average the token vectors, ``'max'`` to
            take the element-wise maximum over tokens.
        title_threshold: A stock matches the title when its similarity is
            strictly greater than this value.
        content_threshold: Same, for the news body.

    Returns:
        A comma-separated string of matched stock names, title matches first,
        each name listed once; ``''`` when nothing matches.

    Raises:
        ValueError: If ``encode_method`` is neither ``'mean'`` nor ``'max'``.
    """
    # NOTE(review): the title default of 0. is the original hard-coded value.
    # It is far looser than the 0.8 used for content and may be a truncated
    # "0.8" -- confirm the intended cutoff before relying on title matches.

    # Validate up front so a bad argument fails before any model forward pass.
    if encode_method not in ('mean', 'max'):
        raise ValueError(f"Unsupported encode method: {encode_method}")

    title = row['Title'] if isinstance(row['Title'], str) else ""
    content = row['NewsContent'] if isinstance(row['NewsContent'], str) else ""

    # Blank text carries no company mention, so it is skipped rather than
    # embedded and compared.
    candidates = [
        (text, threshold)
        for text, threshold in ((title, title_threshold), (content, content_threshold))
        if text.strip()
    ]
    if not candidates or not stock_name_embeddings:
        return ''

    # Stack the per-name vectors (each stored with a leading axis of size 1) so
    # that one cosine_similarity call scores all stocks at once.
    names = list(stock_name_embeddings)
    stock_matrix = np.vstack([stock_name_embeddings[name] for name in names])

    # A dict keeps first-seen order and drops names matched by both title and content.
    matched = {}
    for text, threshold in candidates:
        # Inputs longer than 512 tokens are truncated by the tokenizer.
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state

        pooled = hidden.mean(1) if encode_method == 'mean' else hidden.max(1).values
        similarities = cosine_similarity(pooled.cpu().numpy(), stock_matrix)[0]

        for name, score in zip(names, similarities):
            if score > threshold:
                matched.setdefault(name, None)

    return ','.join(matched)


In [None]:
# Take the first 10 rows as an independent copy: a new column is assigned to
# this frame afterwards, and assigning into a bare slice of `data` triggers
# pandas' SettingWithCopyWarning.
news_data = data[:10].copy()

In [None]:
# Run the matcher on every row (axis=1) with mean pooling and store the
# comma-separated result in a new column
news_data['Explicit_Company'] = news_data.apply(match_stock_name, axis=1, encode_method='mean')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_data['Explicit_Company'] = news_data.apply(lambda row: match_stock_name(row, encode_method='mean'), axis=1)


In [None]:
# Keep only the rows whose match string is non-empty
news_data = news_data.loc[news_data['Explicit_Company'].apply(lambda names: len(names) > 0)]

In [None]:
# Display the filtered frame as the cell output
news_data

Unnamed: 0.1,Unnamed: 0,NewsID,Title,NewsContent,NewsSource,Explicit_Company


In [None]:
# Show the news body of the row labelled 7
news_data.loc[7, 'NewsContent']

'\u3000\u3000本报讯 易方达平稳增长基金今日发布大比例分红公告，每10份基金份额分配7.8元，权益登记日、除息日为12月 5日，红利发放日为12 月6日。该基金成为目前国内分红总量最多的开放式基金。  \u3000\u3000易方达同时公告将从12月6日起开展该基金的费率优惠、限量促销活动。（周文亮）'

# Part 2: short-text to short-text matching
This code segment computes the similarity between a name variant (ORG) and the corresponding stock name.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# The pair under test: a full stock name and a shortened variant of it
stock_name_1, stock_name_2 = "建设银行", "建行"

# Tokenize each name separately, then move the encoded tensors to the model's device
inputs_1 = tokenizer(stock_name_1, return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs_1 = inputs_1.to(device)
inputs_2 = tokenizer(stock_name_2, return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs_2 = inputs_2.to(device)

# Inference only: run both forward passes without gradient tracking
with torch.no_grad():
    outputs_1, outputs_2 = model(**inputs_1), model(**inputs_2)

# Represent each name by the hidden state at sequence position 0.
# Note that this is first-position pooling, not an average over the tokens.
hidden_1 = outputs_1.last_hidden_state
hidden_2 = outputs_2.last_hidden_state
embedding_1 = hidden_1[:, 0, :].cpu().numpy()
embedding_2 = hidden_2[:, 0, :].cpu().numpy()

# Cosine similarity between the two vectors (a 1x1 matrix)
similarity = cosine_similarity(embedding_1, embedding_2)

print(f"The cosine similarity between them is: {similarity[0][0]}")

The cosine similarity between them is: 0.8775085806846619
