In [12]:
import getpass
import os

import pandas as pd
from py2neo import Graph

# Load the Task 1 result workbook and the A-share list JSON (its 'name' and
# 'code' columns are used below to look up stock codes).
TASK1_RESULT_PATH = 'H:/5002project/Task1-answer/Task1.xlsx'
A_SHARE_LIST_PATH = 'E:/HKUST(GZ)/DSAA5002/project/dsaa5002project/dsaa5002_project/A_share_list.json'

new_df = pd.read_excel(TASK1_RESULT_PATH)
with open(A_SHARE_LIST_PATH, mode='r', encoding='utf-8') as json_handle:
    a_share_list = pd.read_json(json_handle)

In [15]:
# Take a full-range slice of the loaded frame and report how many rows it holds.
news_df = new_df[:]
total_rows = len(news_df)
print(f'The total number of data rows is {total_rows}.')
# 1. Extract the contents of the Explicit_Company column and split it into lists
def split_companies(explicit_company):
    """Split a comma-separated company string into a list of company names.

    Parameters
    ----------
    explicit_company : str or missing value
        Content of one ``Explicit_Company`` cell, e.g. ``"A,B"``. Anything
        that is not a string (``None``, ``NaN``, ...) is treated as "no
        companies".

    Returns
    -------
    list of str
        The names in their original order, with surrounding whitespace
        removed and empty entries dropped.
    """
    # A missing cell is not a string and has no ``split``; return an empty
    # list instead of raising AttributeError in the middle of the row loop.
    if not isinstance(explicit_company, str):
        return []
    trimmed = (part.strip() for part in explicit_company.split(','))
    # Drop blanks produced by inputs such as "", "A,,B" or a trailing comma.
    return [name for name in trimmed if name]

# 2. Match company name to stock code from A_share_list.json and process code
def match_codes(company_names, a_share_list):
    """Look up the numeric stock code of each company name.

    Parameters
    ----------
    company_names : iterable of str
        Company names to look up.
    a_share_list : pandas.DataFrame
        Reference table with a ``name`` column and a ``code`` column.

    Returns
    -------
    list of str
        For every name found in ``a_share_list`` (first matching row wins),
        the digits of its code, in input order. Names without a match, and
        codes that contain no digit at all, are skipped.
    """
    codes = []
    for name in company_names:
        matched = a_share_list[a_share_list['name'] == name]
        if matched.empty:
            continue
        # The code may have been parsed as a number; go through str() so that
        # iterating over its characters cannot raise TypeError.
        raw_code = str(matched['code'].iloc[0])
        digits = ''.join(ch for ch in raw_code if ch.isdigit())
        # An empty code must not be returned: used in a ``CONTAINS`` filter it
        # would match every company in the graph.
        if digits:
            codes.append(digits)
    return codes

# 3. Match the processed code with the code in hidy.nodes.company.csv to get the corresponding ID
def get_company_ids(codes, graph):
    """Return the ids of all Company nodes whose code contains a given code.

    Parameters
    ----------
    codes : iterable of str
        Stock codes to search for. Empty or ``None`` entries are ignored.
    graph : object
        Graph connection exposing ``run(query, parameters)`` whose result has
        a ``data()`` method returning a list of dicts (e.g. ``py2neo.Graph``).

    Returns
    -------
    list
        The ``c.id`` value of every matching node, in query order; one code
        can contribute several ids.
    """
    # The code is passed as a query parameter rather than formatted into the
    # query text, so it can never be interpreted as Cypher.
    query = "MATCH (c:Company) WHERE c.code CONTAINS $code RETURN c.id"
    ids = []
    for code in codes:
        code_text = '' if code is None else str(code)
        # ``CONTAINS ''`` is true for every string and would return all nodes.
        if not code_text:
            continue
        result = graph.run(query, {"code": code_text}).data()
        ids.extend(record['c.id'] for record in result)
    return ids

# 4. Use the obtained IDs to fetch relationships from the Neo4j Knowledge Graph
def get_relationships(company_ids, graph):
    """Fetch every Company-to-Company relationship of the given companies.

    Parameters
    ----------
    company_ids : iterable
        ``id`` property values of the Company nodes to start from.
    graph : object
        Graph connection exposing ``run(query, parameters)`` whose result has
        a ``data()`` method returning a list of dicts (e.g. ``py2neo.Graph``).

    Returns
    -------
    list of dict
        One ``{'company_name': ..., 'relationship_type': ...}`` record per
        relationship found, concatenated in the order of ``company_ids``.
        The pattern is undirected, so both incoming and outgoing
        relationships are returned.
    """
    # The id is bound as a parameter: a quote or backslash inside an id can no
    # longer break (or inject into) the query.
    query = """
    MATCH (start:Company)-[r]-(end:Company)
    WHERE start.id = $id
    RETURN end.name AS company_name, type(r) AS relationship_type
    """
    relationships = []
    for company_id in company_ids:
        relationships.extend(graph.run(query, {"id": company_id}).data())
    return relationships

# 5. Categorize company names based on the relationship type and the value of the label column in flit_news.xlsx
def classify_companies(relationships, label):
    """Split related companies into positively and negatively affected ones.

    Companies linked by COOPERATE / INVEST / SAME_INDUSTRY / SUPPLY follow the
    news label (label 1 -> positive, label 0 -> negative); companies linked by
    COMPETE / DISPUTE get the opposite classification. Any other relationship
    type, or any other label value, contributes nothing.

    Parameters
    ----------
    relationships : iterable of dict
        Records with ``company_name`` and ``relationship_type`` keys.
    label : int
        News label of the row, compared against ``1`` and ``0``.

    Returns
    -------
    tuple of (list, list)
        ``(positive_companies, negative_companies)``, each without duplicates
        and in order of first appearance, so repeated runs give the same
        output. A company reached through both kinds of relationship can
        appear in both lists.
    """
    same_direction = {'COOPERATE', 'INVEST', 'SAME_INDUSTRY', 'SUPPLY'}
    opposite_direction = {'COMPETE', 'DISPUTE'}

    # dicts are used as insertion-ordered sets: duplicates collapse while the
    # first-seen order is kept (a plain set() has no stable order for strings).
    positive_companies = {}
    negative_companies = {}
    for relationship in relationships:
        company_name = relationship['company_name']
        # A node without a name cannot be reported and would break the later
        # ','.join of the result.
        if not company_name:
            continue

        relationship_type = relationship['relationship_type']
        if relationship_type in same_direction:
            follows_label = True
        elif relationship_type in opposite_direction:
            follows_label = False
        else:
            continue

        if label == 1:
            is_positive = follows_label
        elif label == 0:
            is_positive = not follows_label
        else:
            continue

        target = positive_companies if is_positive else negative_companies
        target[company_name] = None
    return list(positive_companies), list(negative_companies)

# 6. Build the output record for a single news row
def process_row(row, a_share_list, graph):
    """Run the whole lookup pipeline for one news row.

    The row's explicit company names are resolved to stock codes, then to
    graph ids, then to related companies, which are finally classified
    according to the row's label.

    Parameters
    ----------
    row : mapping (e.g. pandas.Series)
        Must provide ``NewsID``, ``NewsContent``, ``Explicit_Company`` and
        ``label``.
    a_share_list : pandas.DataFrame
        Name/code reference table handed to ``match_codes``.
    graph : object
        Graph connection handed to the query helpers.

    Returns
    -------
    pandas.Series
        The four input fields plus ``Implicit_Positive_Company`` and
        ``Implicit_Negative_Company`` as comma-joined strings.
    """
    stock_codes = match_codes(split_companies(row['Explicit_Company']), a_share_list)
    related = get_relationships(get_company_ids(stock_codes, graph), graph)
    positives, negatives = classify_companies(related, row['label'])

    record = {}
    for column in ('NewsID', 'NewsContent', 'Explicit_Company', 'label'):
        record[column] = row[column]
    record['Implicit_Positive_Company'] = ','.join(positives)
    record['Implicit_Negative_Company'] = ','.join(negatives)
    return pd.Series(record)

# Connect to the Neo4j Database
# Connection settings are read from the environment so that no credential is
# stored in the notebook; if NEO4J_PASSWORD is not set, it is asked for
# interactively. URI and user default to the local development instance.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
graph = Graph(neo4j_uri, user=neo4j_user, password=neo4j_password)

# Use the apply function to process each line
results_df = news_df.apply(process_row, axis=1, args=(a_share_list, graph))

# Output the final DataFrame
results_df.head()

The total number of data rows is 527186.


Unnamed: 0,NewsID,NewsContent,Explicit_Company,label,Implicit_Positive_Company,Implicit_Negative_Company
0,1,本报记者 田雨 李京华 中国建设银行股份有限公司原董事长张恩照受贿案３日一审宣...,建设银行,0,"捷捷微电,任子行","佳都科技,怡亚通,泰达股份,特发信息,证通电子,农业银行,中国银行,移为通信,来伊份,航天信..."
1,2,中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生日前在信用卡中心揭牌仪式上...,农业银行,1,"招商蛇口,中国神华,建设银行,中国银行,中装建设,中国长城,金地集团,水晶光电,新北洋,招商...",ST云维
2,3,在新基金快速发行以及申购资金回流的情况下，市场总体上呈现资金流动性过剩格局，考虑到现阶段...,"中国国航,外运发展",1,"中国石化,寒武纪,春秋航空,中国外运,南方航空,中国电建,兴业证券,农业银行,中国交建,吉祥...",
3,4,胜利股份（000407）公司子公司填海造地2800亩，以青岛的地价估算，静态价值在10亿...,胜利股份,1,"特锐德,新疆浩源",
4,5,全景网11月30日讯 外围股市造好，带动港股今早造好，恒指高开后反覆上升，最高升252点...,"中国银行,建设银行,工商银行,中国太保,交通银行,中国人寿,招商银行",1,"万科A,佳都科技,通威股份,华能国际,泰达股份,特发信息,新华网,中国平安,招商银行,深圳能...","山东黄金,中国银行,数码科技,泰禾集团,中国平安,广州发展,招商银行,华发股份,ST中新,立..."


In [16]:
# Save the Task 2 table. The ``encoding`` keyword is not passed: it was
# deprecated in pandas 1.5 and removed in pandas 2.0, where it raises
# TypeError.
results_df.to_excel('H:/5002project/Task2-answer/Task2.xlsx', index=False)

  force_unicode(url))
  force_unicode(url))


In [19]:
# Display the result frame; the notebook renders only an abbreviated preview
# of it (rows between the first and last few are elided).
results_df

Unnamed: 0,NewsID,NewsContent,Explicit_Company,label,Implicit_Positive_Company,Implicit_Negative_Company
0,1,本报记者 田雨 李京华 中国建设银行股份有限公司原董事长张恩照受贿案３日一审宣...,建设银行,0,"捷捷微电,任子行","佳都科技,怡亚通,泰达股份,特发信息,证通电子,农业银行,中国银行,移为通信,来伊份,航天信..."
1,2,中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生日前在信用卡中心揭牌仪式上...,农业银行,1,"招商蛇口,中国神华,建设银行,中国银行,中装建设,中国长城,金地集团,水晶光电,新北洋,招商...",ST云维
2,3,在新基金快速发行以及申购资金回流的情况下，市场总体上呈现资金流动性过剩格局，考虑到现阶段...,"中国国航,外运发展",1,"中国石化,寒武纪,春秋航空,中国外运,南方航空,中国电建,兴业证券,农业银行,中国交建,吉祥...",
3,4,胜利股份（000407）公司子公司填海造地2800亩，以青岛的地价估算，静态价值在10亿...,胜利股份,1,"特锐德,新疆浩源",
4,5,全景网11月30日讯 外围股市造好，带动港股今早造好，恒指高开后反覆上升，最高升252点...,"中国银行,建设银行,工商银行,中国太保,交通银行,中国人寿,招商银行",1,"万科A,佳都科技,通威股份,华能国际,泰达股份,特发信息,新华网,中国平安,招商银行,深圳能...","山东黄金,中国银行,数码科技,泰禾集团,中国平安,广州发展,招商银行,华发股份,ST中新,立..."
...,...,...,...,...,...,...
527181,1037031,每经AI快讯，有投资者在投资者互动平台提问：请问公司目前有没有电解槽产能，规划情况能否详细介...,亿华通,0,,"百奥泰,中国船舶,飞龙股份,东风汽车,仕佳光子,东旭光电,宝泰隆,福田汽车"
527182,1037032,依米康（SZ 300249，收盘价：10.38元）发布公告称，2023年10月12日，依米康...,"中泰证券,依米康",1,"温氏股份,国金证券,中国银行,兴业证券,东方证券,史丹利,金融街,东吴证券,中国平安,龙磁科...","西水股份,华谊嘉信"
527183,1037033,天风证券10月13日发布研报称，给予中核科技（000777.SZ，最新价：13.03元）买入...,"中核科技,天风证券",1,"迈瑞医疗,兴业证券,长城证券,闻泰科技,凯撒文化,国信证券,东吴证券,格力电器,招商银行,比...","吉翔股份,三特索道,中源家居"
527184,1037034,有投资者提问：抗癌药CPT获批后，公司是否应该按照股权协议继续收购沙东股权，适应症为MM的C...,海特生物,1,"药明康德,海尔生物",
