尝试提取全部的数据

In [None]:
import os #to access files
import pandas as pd #to work with dataframes
import numpy as np #just a tradition
from sklearn.model_selection import StratifiedKFold #for cross-validation
from sklearn.metrics import roc_auc_score #this is we are trying to increase
import matplotlib.pyplot as plt #we will plot something at the end
import seaborn as sns #same reason
import lightgbm as lgb #the model we gonna use

import json
import csv
import os

# Make sure the output directory exists before any CSV file is written.
PATH_TO_DATA = '../data/'
output_dir = os.path.join(PATH_TO_DATA, 'data_extract_new')
os.makedirs(output_dir, exist_ok=True)  # create the directory if it does not exist

# Progress banner for step 1 (main table extraction).
print('1.一级标签提取——main table')
def extract_non_nested_and_count_nested_keys(json_object):
    """
    Build a flat record from the top-level attributes of a JSON object.

    Attributes whose value is neither a dict nor a list are copied unchanged
    under their own key.  A dict or list attribute is replaced by its length,
    stored under ``"<key>_number"``.
    """
    flat_record = {}
    for name, content in json_object.items():
        if isinstance(content, (dict, list)):
            # Nested container: keep only the number of entries it holds.
            flat_record[f"{name}_number"] = len(content)
            continue
        # Plain value: copy it as-is.
        flat_record[name] = content
    return flat_record

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """
    Convert a JSONL file into the "main table" CSV.

    Each non-blank input line is parsed as one JSON object and flattened with
    ``extract_non_nested_and_count_nested_keys``; the ``objectives_number``,
    ``players_number`` and ``targets_number`` columns are then dropped.

    The CSV header is taken from the first record.  ``csv.DictWriter`` fills
    columns that are missing from a later record with an empty string and
    raises ``ValueError`` if a later record carries a key that is not in the
    header.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    columns_to_drop = ('objectives_number', 'players_number', 'targets_number')

    # Decode the input explicitly as UTF-8: without ``encoding`` Python uses
    # the locale's preferred encoding, which is not UTF-8 on many Windows setups.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = None
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue

            # Keep scalar top-level attributes and the sizes of nested ones.
            processed_data = extract_non_nested_and_count_nested_keys(json.loads(line))

            # Drop the count columns that are not wanted in the main table.
            for col_to_remove in columns_to_drop:
                processed_data.pop(col_to_remove, None)

            # Create the writer lazily so the header comes from the first record.
            if csv_writer is None:
                csv_writer = csv.DictWriter(csv_file, fieldnames=list(processed_data))
                csv_writer.writeheader()
            csv_writer.writerow(processed_data)

# Usage: build the main table with the jsonl_to_csv defined directly above
# (the name jsonl_to_csv is redefined by each later section of this cell).
PATH_TO_DATA = '../data/'
# NOTE(review): absolute, machine-specific input path that is not derived
# from PATH_TO_DATA — confirm/adjust before running on another machine.
jsonl_file_path = r"E:\同济本科作业相关\DataMining\Lab_Dota\data\train_matches.jsonl"
csv_file_path = os.path.join(PATH_TO_DATA, 'data_extract_new/main_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)
print('1.一级标签提取——main table提取成功')

import json
import csv
import os
# Progress banner for step 2 (objectives table extraction).
print('2.提取objectives表格')
def find_max_objectives(jsonl_file_path):
    """
    Return the largest number of ``objectives`` entries found in any record.

    The file is read line by line; every non-blank line is parsed as one JSON
    object.  Records without an ``objectives`` key, or with ``objectives``
    set to ``null``, count as having zero objectives.  The maximum is printed
    before it is returned.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to scan.

    Returns:
        The maximum ``objectives`` length over all records, or 0 for an
        empty file.
    """
    max_objectives = 0
    # Decode explicitly as UTF-8 rather than with the locale default.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            # ``or []`` also covers an explicit ``"objectives": null``.
            objectives = json.loads(line).get("objectives") or []
            max_objectives = max(max_objectives, len(objectives))
    print(max_objectives)
    return max_objectives

def extract_objectives(json_object, max_objectives, keys_to_include=("type", "key", "slot")):
    """
    Expand the ``objectives`` list of one record into fixed-width columns.

    For every slot ``i`` in ``1..max_objectives`` and every key in
    ``keys_to_include`` a column ``"objective-<i>-<key>"`` is produced.  The
    value is taken from the i-th objective when it exists and has that key;
    otherwise it is an empty string, so every record yields the same columns.

    Args:
        json_object: One parsed match record (a dict).
        max_objectives: Number of objective slots to emit.
        keys_to_include: Keys copied from each objective, in column order.
            Defaults to ``("type", "key", "slot")``.

    Returns:
        A dict mapping column names to values, ordered by slot and then key.
    """
    # ``or []`` treats both a missing key and an explicit null as "no objectives".
    objectives = json_object.get("objectives") or []
    data = {}
    for i in range(1, max_objectives + 1):
        # Slots beyond the record's own objectives are padded from an empty dict.
        objective = objectives[i - 1] if i <= len(objectives) else {}
        for key in keys_to_include:
            data[f"objective-{i}-{key}"] = objective.get(key, "")
    return data

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """
    Convert a JSONL file into the wide "objectives" CSV.

    The input is read twice: ``find_max_objectives`` first determines how
    many objective slots are needed, then every non-blank line is parsed and
    expanded with ``extract_objectives`` so that all rows share the same
    columns.  The header is taken from the first expanded record.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    # First pass: find the largest number of objectives in any record.
    max_objectives = find_max_objectives(jsonl_file_path)

    # Second pass: write one row per record.  The input is decoded explicitly
    # as UTF-8 rather than with the locale default.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = None
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            processed_data = extract_objectives(json.loads(line), max_objectives=max_objectives)
            # Create the writer lazily so the header comes from the first record.
            if csv_writer is None:
                csv_writer = csv.DictWriter(csv_file, fieldnames=list(processed_data))
                csv_writer.writeheader()
            csv_writer.writerow(processed_data)

# Usage: build the objectives table with the jsonl_to_csv defined directly above.
PATH_TO_DATA = '../data/'
# NOTE(review): absolute, machine-specific input path that is not derived
# from PATH_TO_DATA — confirm/adjust before running on another machine.
jsonl_file_path = r"E:\同济本科作业相关\DataMining\Lab_Dota\data\train_matches.jsonl"
csv_file_path = os.path.join(PATH_TO_DATA, 'data_extract_new/objective_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)
print('2.提取objectives表格提取成功')

import json
import csv
import os
# Progress banner for step 3 (targets table extraction).
print('3.提取targets表格')
def extract_radiant_win(json_object):
    """
    Pull the ``radiant_win`` target out of one parsed record.

    Returns a single-key dict ``{"radiant_win": value}``.  The value is an
    empty string when the record has no ``targets`` entry or when that entry
    has no ``radiant_win`` key.
    """
    target_section = json_object.get("targets", {})
    # Only the radiant_win field is kept; everything else is ignored.
    return {"radiant_win": target_section.get("radiant_win", "")}

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """
    Convert a JSONL file into a one-column ``radiant_win`` CSV.

    Every non-blank input line is parsed as one JSON object and reduced with
    ``extract_radiant_win``; one CSV row is written per record.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    # Decode the input explicitly as UTF-8 rather than with the locale default.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = None
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            processed_data = extract_radiant_win(json.loads(line))
            # Create the writer lazily so the header comes from the first record.
            if csv_writer is None:
                csv_writer = csv.DictWriter(csv_file, fieldnames=list(processed_data))
                csv_writer.writeheader()
            csv_writer.writerow(processed_data)

# Usage: build the targets table with the jsonl_to_csv defined directly above.
PATH_TO_DATA = '../data/'
# NOTE(review): absolute, machine-specific input path that is not derived
# from PATH_TO_DATA — confirm/adjust before running on another machine.
jsonl_file_path = r"E:\同济本科作业相关\DataMining\Lab_Dota\data\train_matches.jsonl"
csv_file_path = os.path.join(PATH_TO_DATA, 'data_extract_new/target_table_radiantwin.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)
print('3.targets表格提取成功')

import json
import csv
import os
# Progress banner for step 4 (teamfights table extraction).
print('4.提取teamfights表格')
def extract_teamfights(json_object, max_players):
    """
    Flatten the ``teamfights`` list of one record into a single dict.

    For the i-th teamfight (1-based) the ``end``, ``start``, ``deaths`` and
    ``last_death`` fields become ``"teamfights-<i>-<field>"``.  For the j-th
    player of that teamfight (1-based) the ``xp_delta``, ``damage``,
    ``gold_delta``, ``healing`` and ``buybacks`` fields become
    ``"teamfights-<i>-player-<j>-<field>"``.  A missing field yields an empty
    string; all other teamfight and player fields are ignored.

    ``max_players`` is accepted for the caller's sake but is not used here.
    """
    fight_fields = ("end", "start", "deaths", "last_death")
    player_fields = ("xp_delta", "damage", "gold_delta", "healing", "buybacks")

    row = {}
    for fight_no, fight in enumerate(json_object.get("teamfights", []), start=1):
        fight_prefix = f"teamfights-{fight_no}"

        # Top-level fields of the teamfight itself.
        for field in fight_fields:
            row[f"{fight_prefix}-{field}"] = fight.get(field, "")

        # Selected per-player fields of this teamfight.
        for player_no, player in enumerate(fight.get("players", []), start=1):
            player_prefix = f"{fight_prefix}-player-{player_no}"
            for field in player_fields:
                row[f"{player_prefix}-{field}"] = player.get(field, "")

    return row

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """
    Convert a JSONL file into the wide "teamfights" CSV.

    The input is read twice.  The first pass expands every record with
    ``extract_teamfights`` to collect the union of all column names; the
    second pass writes one row per record.  ``csv.DictWriter`` leaves columns
    that a record does not have empty.

    Columns are ordered with a plain string sort, so for example
    ``teamfights-10-...`` sorts before ``teamfights-2-...``.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    all_fieldnames = set()
    max_players = 0

    # First pass: track the running maximum player count and gather every
    # column name.  The input is decoded explicitly as UTF-8 rather than with
    # the locale default.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            json_object = json.loads(line)
            teamfights = json_object.get("teamfights", [])
            max_players = max(
                max_players,
                max((len(tf.get("players", [])) for tf in teamfights), default=0),
            )
            processed_data = extract_teamfights(json_object, max_players)
            all_fieldnames.update(processed_data)

    # Sort the column names so the header order is deterministic.
    all_fieldnames = sorted(all_fieldnames)

    # Second pass: write the rows against the complete header.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=all_fieldnames)
        csv_writer.writeheader()
        for line in jsonl_file:
            line = line.strip()
            if not line:
                continue
            processed_data = extract_teamfights(json.loads(line), max_players)
            csv_writer.writerow(processed_data)

# Usage: build the teamfights table with the jsonl_to_csv defined directly above.
PATH_TO_DATA = '../data/'
# NOTE(review): absolute, machine-specific input path that is not derived
# from PATH_TO_DATA — confirm/adjust before running on another machine.
jsonl_file_path = r"E:\同济本科作业相关\DataMining\Lab_Dota\data\train_matches.jsonl"
csv_file_path = os.path.join(PATH_TO_DATA, 'data_extract_new/teamfights_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)
print('4.teamfights表格提取成功')

import json
import csv
import os
# Progress banner for step 5 (players table extraction).
print('5.提取players表格')
def flatten_dict(data, prefix=""):
    """
    Expand the dict-valued entries of ``data`` by exactly one level.

    Each key is first qualified as ``"<prefix>-<key>"`` (or left as ``key``
    when ``prefix`` is empty).  A dict value contributes one entry per inner
    key, named ``"<qualified>-<inner key>"``; inner values are stored as they
    are, even if they are dicts themselves.  Any other value is stored under
    the qualified key unchanged.
    """
    flattened = {}
    for name, content in data.items():
        qualified = name if not prefix else f"{prefix}-{name}"
        if not isinstance(content, dict):
            flattened[qualified] = content
            continue
        # One level only: inner values are not inspected further.
        flattened.update(
            (f"{qualified}-{inner_name}", inner_value)
            for inner_name, inner_value in content.items()
        )
    return flattened

def extract_players(json_object):
    """
    Flatten the ``players`` list of one record into a single dict.

    For the i-th player (1-based) only a fixed set of fields is kept, and
    only when present on that player.  ``max_hero_hit`` is reduced to its
    ``value`` entry when it is a dict.  The kept fields are then passed to
    ``flatten_dict`` with the prefix ``"players-<i>"`` and merged into the
    result.
    """
    # Fields retained for every player; all other player fields are dropped.
    required_fields = (
        "sen_placed", "sen_left_log", "kills", "obs_left_log",
        "max_hero_hit", "obs_log", "max_mana", "creeps_stacked",
        "xp_reasons", "randomed", "towers_killed", "health",
        "rune_pickups", "level", "stuns", "deaths", "gold",
        "nearby_creep_death_count", "denies", "observers_placed",
        "sen_log", "hero_id", "max_health",
    )

    row = {}
    for position, player in enumerate(json_object.get("players", []), start=1):
        kept = {}
        for field in required_fields:
            if field not in player:
                continue
            value = player[field]
            # max_hero_hit: keep just its "value" entry when it is a dict.
            if field == "max_hero_hit" and isinstance(value, dict):
                value = value.get("value", "")
            kept[field] = value

        # Expand the remaining dict-valued fields under this player's prefix.
        row.update(flatten_dict(kept, prefix=f"players-{position}"))

    return row

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """
    Convert a JSONL file into the wide "players" CSV.

    The input is read twice.  The first pass expands every record with
    ``extract_players`` to collect the union of all column names; the second
    pass writes one row per record.  ``csv.DictWriter`` leaves columns that a
    record does not have empty.

    Columns are ordered with a plain string sort, so for example
    ``players-10-...`` sorts before ``players-2-...``.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    all_fieldnames = set()

    # First pass: gather every column name.  The input is decoded explicitly
    # as UTF-8 rather than with the locale default.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            all_fieldnames.update(extract_players(json.loads(line)))

    # Sort the column names so the header order is deterministic.
    all_fieldnames = sorted(all_fieldnames)

    # Second pass: write the rows against the complete header.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=all_fieldnames)
        csv_writer.writeheader()
        for line in jsonl_file:
            line = line.strip()
            if not line:
                continue
            csv_writer.writerow(extract_players(json.loads(line)))

# Usage: build the players table with the jsonl_to_csv defined directly above.
PATH_TO_DATA = '../data/'
# NOTE(review): absolute, machine-specific input path that is not derived
# from PATH_TO_DATA — confirm/adjust before running on another machine.
jsonl_file_path = r"E:\同济本科作业相关\DataMining\Lab_Dota\data\train_matches.jsonl"
csv_file_path = os.path.join(PATH_TO_DATA, 'data_extract_new/players_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)
print('5.players表格提取成功')

  import pandas.util.testing as tm


1.一级标签提取——main table
1.一级标签提取——main table提取成功
2.提取objectives表格
43
2.提取objectives表格提取成功
3.提取targets表格
3.targets表格提取成功
4.提取teamfights表格
4.teamfights表格提取成功
5.提取players表格
5.players表格提取成功


: 

迭代：重新提取 objectives 表，这次需要在每个 objective 中保留 player_slot 字段。

In [1]:
import os #to access files
import pandas as pd #to work with dataframes
import numpy as np #just a tradition
from sklearn.model_selection import StratifiedKFold #for cross-validation
from sklearn.metrics import roc_auc_score #this is we are trying to increase
import matplotlib.pyplot as plt #we will plot something at the end
import seaborn as sns #same reason
import lightgbm as lgb #the model we gonna use

import json
import csv
import os

# Make sure the output directory exists before any CSV file is written.
PATH_TO_DATA = '../data/'
output_dir = os.path.join(PATH_TO_DATA, 'data_extract_new')
os.makedirs(output_dir, exist_ok=True)  # create the directory if it does not exist

import json
import csv
import os
# Progress banner for the objectives table extraction.
print('2.提取objectives表格')
def find_max_objectives(jsonl_file_path):
    """
    Return the largest number of ``objectives`` entries found in any record.

    The file is read line by line; every non-blank line is parsed as one JSON
    object.  Records without an ``objectives`` key, or with ``objectives``
    set to ``null``, count as having zero objectives.  The maximum is printed
    before it is returned.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to scan.

    Returns:
        The maximum ``objectives`` length over all records, or 0 for an
        empty file.
    """
    max_objectives = 0
    # Decode explicitly as UTF-8 rather than with the locale default.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            # ``or []`` also covers an explicit ``"objectives": null``.
            objectives = json.loads(line).get("objectives") or []
            max_objectives = max(max_objectives, len(objectives))
    print(max_objectives)
    return max_objectives

def extract_objectives(json_object, max_objectives,
                       keys_to_include=("type", "player_slot", "key", "slot")):
    """
    Expand the ``objectives`` list of one record into fixed-width columns.

    For every slot ``i`` in ``1..max_objectives`` and every key in
    ``keys_to_include`` a column ``"objective-<i>-<key>"`` is produced.  The
    value is taken from the i-th objective when it exists and has that key;
    otherwise it is an empty string, so every record yields the same columns.

    Args:
        json_object: One parsed match record (a dict).
        max_objectives: Number of objective slots to emit.
        keys_to_include: Keys copied from each objective, in column order.
            Defaults to ``("type", "player_slot", "key", "slot")``.

    Returns:
        A dict mapping column names to values, ordered by slot and then key.
    """
    # ``or []`` treats both a missing key and an explicit null as "no objectives".
    objectives = json_object.get("objectives") or []
    data = {}
    for i in range(1, max_objectives + 1):
        # Slots beyond the record's own objectives are padded from an empty dict.
        objective = objectives[i - 1] if i <= len(objectives) else {}
        for key in keys_to_include:
            data[f"objective-{i}-{key}"] = objective.get(key, "")
    return data

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """
    Convert a JSONL file into the wide "objectives" CSV.

    The input is read twice: ``find_max_objectives`` first determines how
    many objective slots are needed, then every non-blank line is parsed and
    expanded with ``extract_objectives`` so that all rows share the same
    columns.  The header is taken from the first expanded record.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    # First pass: find the largest number of objectives in any record.
    max_objectives = find_max_objectives(jsonl_file_path)

    # Second pass: write one row per record.  The input is decoded explicitly
    # as UTF-8 rather than with the locale default.
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = None
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            processed_data = extract_objectives(json.loads(line), max_objectives=max_objectives)
            # Create the writer lazily so the header comes from the first record.
            if csv_writer is None:
                csv_writer = csv.DictWriter(csv_file, fieldnames=list(processed_data))
                csv_writer.writeheader()
            csv_writer.writerow(processed_data)

# Usage: rebuild the objectives table with the jsonl_to_csv defined directly above.
PATH_TO_DATA = '../data/'
# NOTE(review): absolute, machine-specific input path that is not derived
# from PATH_TO_DATA — confirm/adjust before running on another machine.
jsonl_file_path = r"E:\同济本科作业相关\DataMining\Lab_Dota\data\train_matches.jsonl"
# Same output file name as the first cell's objectives table.
csv_file_path = os.path.join(PATH_TO_DATA, 'data_extract_new/objective_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)
print('2.提取objectives表格提取成功')


  import pandas.util.testing as tm


2.提取objectives表格
43
2.提取objectives表格提取成功
