本笔记本用于处理[所有数据CSV文件](../../data/processed/all_steam_and_game_data_before_clean.csv)中的各种格式问题。

### 读取文件

In [7]:
import pandas as pd
import re
import requests
import json
import ast

In [2]:
# Load the pre-cleaning dataset into a DataFrame.
# NOTE(review): `steamid` and `primaryclanid` are later listed as float64; if
# they hold 17-digit 64-bit IDs, float64 cannot represent them exactly --
# consider reading them with a string dtype. TODO confirm against the CSV.
raw_df = pd.read_csv(
    filepath_or_buffer='../../data/processed/all_steam_and_game_data_before_clean.csv',
)

### 清理日期类

In [3]:
# Reinterpret the stored numbers as seconds since the Unix epoch and replace
# the column with the resulting timestamps, then display it.
raw_df['timecreated'] = raw_df['timecreated'].pipe(pd.to_datetime, unit='s')
raw_df['timecreated']

0       2003-09-12 06:39:05
1       2003-09-12 08:15:26
2       2003-09-12 09:25:29
3       2003-09-12 11:36:22
4       2003-09-12 12:42:39
                ...        
20437   2014-09-28 15:36:30
20438   2014-03-18 17:42:21
20439   2019-07-31 20:57:10
20440   2013-09-09 12:24:43
20441   2010-06-13 15:26:40
Name: timecreated, Length: 20442, dtype: datetime64[ns]

### 整理与价格有关的列

In [4]:
# Keep every column whose name matches the pattern 'price', then preview them.
price_columns = list(filter(lambda name: re.search(r'price', name), raw_df.columns))
raw_df[price_columns]

Unnamed: 0,price_overview1,price_overview2,price_overview3,price_overview4,price_overview5,price_overview_2weeks_game_1,price_overview_2weeks_game_2,price_overview_2weeks_game_3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,{},"{'currency': 'USD', 'initial': 999, 'final': 9...","{'currency': 'CAD', 'initial': 5349, 'final': ...","{'currency': 'USD', 'initial': 2999, 'final': ...","{'currency': 'USD', 'initial': 2999, 'final': ...",{},{},"{'currency': 'SGD', 'initial': 5400, 'final': ..."
4,,,,,,,,
...,...,...,...,...,...,...,...,...
20437,,,,,,,,
20438,,,,,,{},{},
20439,,,,,,,,
20440,,,,,,"{'currency': 'EUR', 'initial': 6499, 'final': ...","{'currency': 'KRW', 'initial': 4480000, 'final...",{}


In [6]:
# Show the Python type of every price cell. Series.map is applied column by
# column because DataFrame.applymap is deprecated in recent pandas releases.
raw_df[price_columns].apply(lambda column: column.map(lambda cell: type(cell)))

  raw_df[price_columns].applymap(type)


Unnamed: 0,price_overview1,price_overview2,price_overview3,price_overview4,price_overview5,price_overview_2weeks_game_1,price_overview_2weeks_game_2,price_overview_2weeks_game_3
0,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>
1,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>
2,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>
3,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>
4,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>
...,...,...,...,...,...,...,...,...
20437,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>
20438,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'str'>,<class 'str'>,<class 'float'>
20439,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>
20440,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'float'>,<class 'str'>,<class 'str'>,<class 'str'>


In [8]:
def get_price(cell):
    """Extract the formatted price and currency code from a serialized price dict.

    Args:
        cell: A raw cell value. Only strings containing a Python-literal dict
            are parsed; any other value is treated as "no price".

    Returns:
        A ``(final_formatted, currency)`` tuple when the parsed dict has both
        keys, otherwise ``None``.
    """
    # Missing cells arrive as non-string values (e.g. float NaN).
    if not isinstance(cell, str):
        return None
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for malformed input are caught,
        # so KeyboardInterrupt / SystemExit still propagate.
        return None
    if not isinstance(parsed, dict):
        return None
    try:
        return parsed['final_formatted'], parsed['currency']
    except KeyError:
        # An empty dict ('{}') or one lacking either key carries no usable price.
        return None

# Collect every column whose name matches the pattern 'price'.
price_columns = list(filter(lambda name: re.search(r'price', name), raw_df.columns))

# Replace each raw price cell with whatever get_price returns for it.
for price_col in price_columns:
    raw_df[price_col] = raw_df[price_col].map(get_price)

# Display the extracted values.
raw_df[price_columns]

Unnamed: 0,price_overview1,price_overview2,price_overview3,price_overview4,price_overview5,price_overview_2weeks_game_1,price_overview_2weeks_game_2,price_overview_2weeks_game_3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,"($9.99, USD)","(CDN$53.49, CAD)","($29.99, USD)","($29.99, USD)",,,"(S$54.00, SGD)"
4,,,,,,,,
...,...,...,...,...,...,...,...,...
20437,,,,,,,,
20438,,,,,,,,
20439,,,,,,,,
20440,,,,,,"(64,99€, EUR)","(₩44,800, KRW)",


In [9]:
# Spot check: first element of the tuple at row position 3, column position 1.
raw_df[price_columns].iat[3, 1][0]  # '$9.99'

'$9.99'

### 调用工具包获得最新的汇率数据

调用api获得数据

In [115]:
def get_rates(base='USD', timeout=10):
    """Fetch the latest exchange rates quoted against ``base``.

    Args:
        base: Currency code inserted into the request URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``rates`` entry of the JSON response.

    Raises:
        Exception: If the response status code is not 200. Connection errors
            and timeouts raised by ``requests`` are also ``Exception``
            subclasses, so callers catching ``Exception`` keep working.
    """
    url = f"https://api.exchangerate-api.com/v4/latest/{base}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data: {response.status_code}")
    return response.json()['rates']

def save_rates_to_json(rates, filename="../../data/external/currency_rates.json"):
    """Write the exchange-rate data to a local JSON file.

    Args:
        rates: JSON-serializable object to store.
        filename: Destination path; the file is opened in write mode with
            UTF-8 encoding.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        # Non-ASCII characters are written as-is, indented by four spaces.
        json.dump(rates, out_file, indent=4, ensure_ascii=False)

# Fetch the USD-based rates, print them, and cache them in the local JSON file.
try:
    usd_rates = get_rates('USD')
    print("USD Currency Rates:")
    for currency, rate in usd_rates.items():
        print(f"{currency}: {rate}")
    save_rates_to_json(usd_rates)  # persist the rates to the local file
    print("Rates have been saved to currency_rates.json.")
except Exception as e:
    # Best effort: keep the notebook running, but report the failure so that a
    # stale or missing currency_rates.json is not used without anyone noticing.
    print(f"Could not refresh exchange rates: {e}")


USD Currency Rates:
USD: 1
AED: 3.67
AFN: 71.88
ALL: 92.74
AMD: 387.97
ANG: 1.79
AOA: 861.61
ARS: 864.75
AUD: 1.51
AWG: 1.79
AZN: 1.7
BAM: 1.81
BBD: 2
BDT: 117.39
BGN: 1.81
BHD: 0.376
BIF: 2866.32
BMD: 1
BND: 1.35
BOB: 6.92
BRL: 5.16
BSD: 1
BTN: 83.37
BWP: 13.56
BYN: 3.25
BZD: 2
CAD: 1.37
CDF: 2776.69
CHF: 0.913
CLP: 897.18
CNY: 7.26
COP: 3839
CRC: 520.19
CUP: 24
CVE: 101.93
CZK: 22.83
DJF: 177.72
DKK: 6.9
DOP: 58.96
DZD: 134.6
EGP: 47.38
ERN: 15
ETB: 57.5
EUR: 0.924
FJD: 2.23
FKP: 0.786
FOK: 6.9
GBP: 0.786
GEL: 2.79
GGP: 0.786
GHS: 14.83
GIP: 0.786
GMD: 66.73
GNF: 8577.23
GTQ: 7.77
GYD: 209.27
HKD: 7.81
HNL: 24.72
HRK: 6.97
HTG: 132.8
HUF: 358.05
IDR: 16192.77
ILS: 3.7
IMP: 0.786
INR: 83.37
IQD: 1310.48
IRR: 42095.95
ISK: 137.41
JEP: 0.786
JMD: 155.59
JOD: 0.709
JPY: 157.49
KES: 132.51
KGS: 88.01
KHR: 4088.62
KID: 1.51
KMF: 454.79
KRW: 1368.05
KWD: 0.307
KYD: 0.833
KZT: 443.35
LAK: 21569.26
LBP: 89500
LKR: 301.43
LRD: 193.8
LSL: 18.41
LYD: 4.84
MAD: 9.95
MDL: 17.7
MGA: 4439.17
MKD: 56

读取汇率数据列表

In [116]:
# Read the cached exchange-rate table back from disk and display it.
with open('../../data/external/currency_rates.json', mode='r', encoding='utf-8') as rates_file:
    exchange_rates = json.loads(rates_file.read())
exchange_rates

{'USD': 1,
 'AED': 3.67,
 'AFN': 71.88,
 'ALL': 92.74,
 'AMD': 387.97,
 'ANG': 1.79,
 'AOA': 861.61,
 'ARS': 864.75,
 'AUD': 1.51,
 'AWG': 1.79,
 'AZN': 1.7,
 'BAM': 1.81,
 'BBD': 2,
 'BDT': 117.39,
 'BGN': 1.81,
 'BHD': 0.376,
 'BIF': 2866.32,
 'BMD': 1,
 'BND': 1.35,
 'BOB': 6.92,
 'BRL': 5.16,
 'BSD': 1,
 'BTN': 83.37,
 'BWP': 13.56,
 'BYN': 3.25,
 'BZD': 2,
 'CAD': 1.37,
 'CDF': 2776.69,
 'CHF': 0.913,
 'CLP': 897.18,
 'CNY': 7.26,
 'COP': 3839,
 'CRC': 520.19,
 'CUP': 24,
 'CVE': 101.93,
 'CZK': 22.83,
 'DJF': 177.72,
 'DKK': 6.9,
 'DOP': 58.96,
 'DZD': 134.6,
 'EGP': 47.38,
 'ERN': 15,
 'ETB': 57.5,
 'EUR': 0.924,
 'FJD': 2.23,
 'FKP': 0.786,
 'FOK': 6.9,
 'GBP': 0.786,
 'GEL': 2.79,
 'GGP': 0.786,
 'GHS': 14.83,
 'GIP': 0.786,
 'GMD': 66.73,
 'GNF': 8577.23,
 'GTQ': 7.77,
 'GYD': 209.27,
 'HKD': 7.81,
 'HNL': 24.72,
 'HRK': 6.97,
 'HTG': 132.8,
 'HUF': 358.05,
 'IDR': 16192.77,
 'ILS': 3.7,
 'IMP': 0.786,
 'INR': 83.37,
 'IQD': 1310.48,
 'IRR': 42095.95,
 'ISK': 137.41,
 'JEP': 

### 根据字典将price数据全更换为USD

首先检查都有哪些货币类型

In [117]:
price_columns = [col for col in raw_df.columns if re.search(r'price', col)]
# Take the second tuple element (the currency code) of every price cell and
# list the distinct codes. Series.map is applied column by column because
# DataFrame.applymap is deprecated; the explicit dropna keeps the result
# independent of how stack() treats missing values.
currency_type_list = (
    raw_df[price_columns]
    .apply(lambda column: column.map(lambda x: x[1] if x and len(x) > 1 else None))
    .stack()
    .dropna()
    .reset_index(drop=True)
    .drop_duplicates()
    .tolist()
)
currency_type_list

  currency_type_list = raw_df[price_columns].applymap(lambda x:x[1] if x and len(x)>1 else None).stack().reset_index(drop=True).drop_duplicates().tolist()


['USD',
 'CAD',
 'SGD',
 'INR',
 'THB',
 'KRW',
 'HKD',
 'PHP',
 'SAR',
 'CNY',
 'EUR',
 'GBP',
 'ILS',
 'RUB']

In [118]:
'''根据list的值，转换涉及金额的列'''

import re
import pandas as pd

def _parse_price_amount(text):
    """Return the numeric amount found in a formatted price string, or None if it has no digits."""
    token_match = re.search(r'\d[\d.,]*', text)
    if not token_match:
        return None
    # Drop trailing punctuation such as the final '.' of an abbreviation.
    token = token_match.group(0).rstrip('.,')
    last_dot, last_comma = token.rfind('.'), token.rfind(',')
    last_sep = max(last_dot, last_comma)
    if last_sep == -1:
        return float(token)
    fraction = token[last_sep + 1:]
    whole = re.sub(r'[.,]', '', token[:last_sep])
    uses_both_separators = last_dot != -1 and last_comma != -1
    separator_repeats = token.count(token[last_sep]) > 1
    # A single kind of separator that repeats, or that is followed by exactly
    # three digits, is a thousands separator ('44,800', '1,234,567').
    if not uses_both_separators and (separator_repeats or len(fraction) == 3):
        return float(whole + fraction)
    # Otherwise the last separator is the decimal mark ('9.99', '64,99', '1,299.00').
    return float(f"{whole}.{fraction}")


def convert_to_usd(price_tuple, rates=None):
    """Convert a ``(formatted_price, currency_code)`` pair into a USD amount.

    Args:
        price_tuple: Pair whose first element is the formatted price string
            and whose second element is the currency code. Anything that is
            not such a pair (None, NaN, ...) yields None.
        rates: Optional mapping from currency code to the amount of that
            currency per one USD. Defaults to the module-level
            ``exchange_rates`` table.

    Returns:
        The price divided by the currency's rate, or None when the input is
        missing, contains no number, or the currency has no usable rate.
    """
    if not isinstance(price_tuple, (tuple, list)) or len(price_tuple) < 2:
        return None
    price, currency = price_tuple[0], price_tuple[1]
    if not isinstance(price, str) or not isinstance(currency, str):
        return None

    # Parse the whole number, including decimals; the currency symbol is
    # ignored because the currency code is supplied separately.
    amount = _parse_price_amount(price)
    if amount is None:
        return None

    if rates is None:
        rates = exchange_rates
    rate = rates.get(currency)
    if not rate:
        # Unknown currency (or a zero rate): no conversion is possible.
        return None
    return amount / rate


### 整理数据类型

先将价格列换算为美元，然后清理重复行以及 steamid 为空的行

In [119]:
# Convert every price cell with convert_to_usd. Series.map is applied column
# by column because DataFrame.applymap is deprecated in recent pandas releases.
raw_df[price_columns] = raw_df[price_columns].apply(lambda column: column.map(convert_to_usd))
# Remove exact duplicate rows, then rows that have no steamid.
raw_df = raw_df.drop_duplicates()
raw_df = raw_df.dropna(subset=['steamid'])

  raw_df[price_columns] = raw_df[price_columns].applymap(convert_to_usd)


转换数据类型

In [120]:
# Inspect the dtype of every column before converting anything.
dict(raw_df.dtypes.items())

{'steamid': dtype('float64'),
 'communityvisibilitystate': dtype('float64'),
 'profilestate': dtype('float64'),
 'personaname': dtype('O'),
 'profileurl': dtype('O'),
 'avatar': dtype('O'),
 'avatarmedium': dtype('O'),
 'avatarfull': dtype('O'),
 'avatarhash': dtype('O'),
 'personastate': dtype('float64'),
 'primaryclanid': dtype('float64'),
 'timecreated': dtype('<M8[ns]'),
 'personastateflags': dtype('float64'),
 'loccountrycode': dtype('O'),
 'game_count': dtype('float64'),
 'appid_game1': dtype('float64'),
 'name_game1': dtype('O'),
 'playtime_forever_game1': dtype('float64'),
 'appid_game2': dtype('float64'),
 'name_game2': dtype('O'),
 'playtime_forever_game2': dtype('float64'),
 'appid_game3': dtype('float64'),
 'name_game3': dtype('O'),
 'playtime_forever_game3': dtype('float64'),
 'appid_game4': dtype('float64'),
 'name_game4': dtype('O'),
 'playtime_forever_game4': dtype('float64'),
 'appid_game5': dtype('float64'),
 'name_game5': dtype('O'),
 'playtime_forever_game5': dtype(

转换需要调整的数据类型：数值列的空值填充为 0，文本（object）列的空值填充为 'Unknown'；后续分析时注意过滤值为 0 的数据

In [121]:
# Normalize column dtypes: fill missing values, then store whole-number float
# columns as int64 and object columns as str.
for col in raw_df.columns:
    if raw_df[col].dtype == 'float64':
        filled = raw_df[col].fillna(0)
        # Cast to int64 only when it is lossless; columns holding fractional
        # values (e.g. the converted USD prices) stay float so nothing is truncated.
        if filled.mod(1).eq(0).all():
            raw_df[col] = filled.astype('int64')
        else:
            raw_df[col] = filled
    elif raw_df[col].dtype == 'object':
        raw_df[col] = raw_df[col].fillna('Unknown').astype('str')

### 整理列排序

In [122]:
'''便捷调整工具1'''
# Print one ready-to-paste line of quoted column names for each suffix 1-5.
for i in (1, 2, 3, 4, 5):
    game_columns_line = f"'appid_game{i}', 'name_game{i}', 'playtime_forever_game{i}', 'price_overview{i}', 'genres{i}', 'developers{i}', 'publishers{i}', 'categories{i}', 'release_date{i}', 'metacritic{i}',"
    print(game_columns_line)



'appid_game1', 'name_game1', 'playtime_forever_game1', 'price_overview1', 'genres1', 'developers1', 'publishers1', 'categories1', 'release_date1', 'metacritic1',
'appid_game2', 'name_game2', 'playtime_forever_game2', 'price_overview2', 'genres2', 'developers2', 'publishers2', 'categories2', 'release_date2', 'metacritic2',
'appid_game3', 'name_game3', 'playtime_forever_game3', 'price_overview3', 'genres3', 'developers3', 'publishers3', 'categories3', 'release_date3', 'metacritic3',
'appid_game4', 'name_game4', 'playtime_forever_game4', 'price_overview4', 'genres4', 'developers4', 'publishers4', 'categories4', 'release_date4', 'metacritic4',
'appid_game5', 'name_game5', 'playtime_forever_game5', 'price_overview5', 'genres5', 'developers5', 'publishers5', 'categories5', 'release_date5', 'metacritic5',


In [123]:
'''便捷调整工具2'''
# Print one ready-to-paste line of quoted "2weeks" column names for each suffix 1-3.
for i in (1, 2, 3):
    two_weeks_columns_line = f"'appid_game{i}_2weeks', 'name_game{i}_2weeks', 'playtime_2weeks_game{i}', 'playtime_forever_game{i}_2weeks', 'steam_appid_2weeks_game_{i}', 'price_overview_2weeks_game_{i}', 'genres_2weeks_game_{i}', 'developers_2weeks_game_{i}', 'publishers_2weeks_game_{i}', 'categories_2weeks_game_{i}', 'release_date_2weeks_game_{i}', 'metacritic_2weeks_game_{i}',"
    print(two_weeks_columns_line)

'appid_game1_2weeks', 'name_game1_2weeks', 'playtime_2weeks_game1', 'playtime_forever_game1_2weeks', 'steam_appid_2weeks_game_1', 'price_overview_2weeks_game_1', 'genres_2weeks_game_1', 'developers_2weeks_game_1', 'publishers_2weeks_game_1', 'categories_2weeks_game_1', 'release_date_2weeks_game_1', 'metacritic_2weeks_game_1',
'appid_game2_2weeks', 'name_game2_2weeks', 'playtime_2weeks_game2', 'playtime_forever_game2_2weeks', 'steam_appid_2weeks_game_2', 'price_overview_2weeks_game_2', 'genres_2weeks_game_2', 'developers_2weeks_game_2', 'publishers_2weeks_game_2', 'categories_2weeks_game_2', 'release_date_2weeks_game_2', 'metacritic_2weeks_game_2',
'appid_game3_2weeks', 'name_game3_2weeks', 'playtime_2weeks_game3', 'playtime_forever_game3_2weeks', 'steam_appid_2weeks_game_3', 'price_overview_2weeks_game_3', 'genres_2weeks_game_3', 'developers_2weeks_game_3', 'publishers_2weeks_game_3', 'categories_2weeks_game_3', 'release_date_2weeks_game_3', 'metacritic_2weeks_game_3',


In [124]:
# Select the columns in their final order: profile fields first, then the
# game1-game5 column groups, then the three "2weeks" column groups.
cleaned_df = raw_df.loc[
    :,
    # Profile fields.
    [
        'steamid', 'communityvisibilitystate', 'profilestate', 'personaname',
        'profileurl', 'avatar', 'avatarmedium', 'avatarfull', 'avatarhash',
        'personastate', 'primaryclanid', 'timecreated', 'personastateflags',
        'loccountrycode', 'game_count',
    ]
    # game1-game5 groups.
    + ['appid_game1', 'name_game1', 'playtime_forever_game1', 'price_overview1', 'genres1', 'developers1', 'publishers1', 'categories1', 'release_date1', 'metacritic1']
    + ['appid_game2', 'name_game2', 'playtime_forever_game2', 'price_overview2', 'genres2', 'developers2', 'publishers2', 'categories2', 'release_date2', 'metacritic2']
    + ['appid_game3', 'name_game3', 'playtime_forever_game3', 'price_overview3', 'genres3', 'developers3', 'publishers3', 'categories3', 'release_date3', 'metacritic3']
    + ['appid_game4', 'name_game4', 'playtime_forever_game4', 'price_overview4', 'genres4', 'developers4', 'publishers4', 'categories4', 'release_date4', 'metacritic4']
    + ['appid_game5', 'name_game5', 'playtime_forever_game5', 'price_overview5', 'genres5', 'developers5', 'publishers5', 'categories5', 'release_date5', 'metacritic5']
    # "2weeks" groups 1-3.
    + ['appid_game1_2weeks', 'name_game1_2weeks', 'playtime_2weeks_game1', 'playtime_forever_game1_2weeks', 'steam_appid_2weeks_game_1', 'price_overview_2weeks_game_1', 'genres_2weeks_game_1', 'developers_2weeks_game_1', 'publishers_2weeks_game_1', 'categories_2weeks_game_1', 'release_date_2weeks_game_1', 'metacritic_2weeks_game_1']
    + ['appid_game2_2weeks', 'name_game2_2weeks', 'playtime_2weeks_game2', 'playtime_forever_game2_2weeks', 'steam_appid_2weeks_game_2', 'price_overview_2weeks_game_2', 'genres_2weeks_game_2', 'developers_2weeks_game_2', 'publishers_2weeks_game_2', 'categories_2weeks_game_2', 'release_date_2weeks_game_2', 'metacritic_2weeks_game_2']
    + ['appid_game3_2weeks', 'name_game3_2weeks', 'playtime_2weeks_game3', 'playtime_forever_game3_2weeks', 'steam_appid_2weeks_game_3', 'price_overview_2weeks_game_3', 'genres_2weeks_game_3', 'developers_2weeks_game_3', 'publishers_2weeks_game_3', 'categories_2weeks_game_3', 'release_date_2weeks_game_3', 'metacritic_2weeks_game_3']
]

In [125]:
# Write the cleaned frame to the processed-data folder.
# NOTE(review): `index=False` is not passed, so the row index is written as the
# first CSV column -- confirm that downstream readers expect it.
cleaned_df.to_csv(
    path_or_buf='../../data/processed/all_steam_and_game_data_after_cleaned.csv',
)