## Read Files

In [41]:
import pandas as pd

# 讀取訓練數據
train_df = pd.read_csv("train.csv")

# 讀取測試數據
test_df = pd.read_csv("test.csv")

## Create Graph

In [42]:
import networkx as nx

G = nx.from_pandas_edgelist(
    train_df[train_df["label"] == 1], "node1", target="node2", create_using=nx.DiGraph()
)

print(G)

def count_successors(G, node):
    if G.has_node(node):
        return len(list(G.successors(node)))
    return 0


def count_predecessors(G, node):
    if G.has_node(node):
        return len(list(G.predecessors(node)))
    return 0


# Apply the metrics to your dataframe
train_df["successor_count"] = train_df["node1"].apply(lambda x: count_successors(G, x))
train_df["predecessor_count"] = train_df["node2"].apply(
    lambda x: count_predecessors(G, x)
)

train_df

DiGraph with 10625 nodes and 12765 edges


Unnamed: 0,node1,node2,label,successor_count,predecessor_count
0,29237,16563,1,1,1
1,32869,24548,0,0,0
2,48837,17831,1,2,1
3,31387,44509,0,0,0
4,15102,10271,1,2,1
...,...,...,...,...,...
31995,51510,44952,1,1,1
31996,30089,48199,0,0,0
31997,1418,13815,0,0,0
31998,5451,24600,1,9,9


In [43]:
# Apply the metrics to the test dataframe
test_df["successor_count"] = test_df["node1"].apply(lambda x: count_successors(G, x))
test_df["predecessor_count"] = test_df["node2"].apply(
    lambda x: count_predecessors(G, x)
)

test_df = test_df.drop(columns=["idx"], axis=1)
test_df

Unnamed: 0,node1,node2,successor_count,predecessor_count
0,5416,45023,18,8
1,6681,40749,0,0
2,44162,40953,0,1
3,51387,51233,4,13
4,6498,24093,2,1
...,...,...,...,...
7995,15213,5972,0,0
7996,30870,8448,0,0
7997,5409,49118,0,0
7998,14298,45426,0,0


## XGBoost

In [44]:
from xgboost import XGBClassifier

# 建立 XGBClassifier 模型
model = XGBClassifier(
    n_estimators=10,
    max_depth=3,
    n_jobs=-1,
)

# 使用訓練資料訓練模型
model.fit(train_df.drop(["label"], axis=1), train_df["label"])
# 使用訓練資料預測分類
predictions = model.predict(test_df)

## Create Result CSV

In [45]:
import os

file_path = "submission.csv"

# 檢查文件是否存在
if os.path.exists(file_path):
    os.remove(file_path)
    
# 創建包含 idx 和 predictions 的 DataFrame
df = pd.DataFrame({"idx": range(len(predictions)), "ans": predictions})

# 寫入 CSV 文件
df.to_csv("submission.csv", index=False)