本日记本用于处理[所有数据CSV文件](../../data/processed/all_steam_and_game_data_before_clean.csv)中各种格式问题

### 读取文件

In [95]:
import pandas as pd
import re
import requests
import json

In [96]:
raw_df = pd.read_csv('../../data/processed/all_steam_and_game_data_before_clean.csv')

### 清理日期类

In [97]:
raw_df['timecreated'] = pd.to_datetime(raw_df['timecreated'],unit='s')
raw_df['timecreated']

0       2003-09-12 06:39:05
1       2003-09-12 08:15:26
2       2003-09-12 09:25:29
3       2003-09-12 11:36:22
4       2003-09-12 12:42:39
                ...        
20437   2014-09-28 15:36:30
20438   2014-03-18 17:42:21
20439   2019-07-31 20:57:10
20440   2013-09-09 12:24:43
20441   2010-06-13 15:26:40
Name: timecreated, Length: 20442, dtype: datetime64[ns]

### 整理与价格有关

In [98]:
price_columns = [col for col in raw_df.columns if re.search(r'price', col)]
raw_df[price_columns]

Unnamed: 0,price_overview1,price_overview2,price_overview3,price_overview4,price_overview5,price_overview_2weeks_game_1,price_overview_2weeks_game_2,price_overview_2weeks_game_3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,{},"{'currency': 'USD', 'initial': 999, 'final': 9...","{'currency': 'CAD', 'initial': 5349, 'final': ...","{'currency': 'USD', 'initial': 2999, 'final': ...","{'currency': 'USD', 'initial': 2999, 'final': ...",{},{},"{'currency': 'SGD', 'initial': 5400, 'final': ..."
4,,,,,,,,
...,...,...,...,...,...,...,...,...
20437,,,,,,,,
20438,,,,,,{},{},
20439,,,,,,,,
20440,,,,,,"{'currency': 'EUR', 'initial': 6499, 'final': ...","{'currency': 'KRW', 'initial': 4480000, 'final...",{}


In [99]:
# 假设 raw_df 是你的 DataFrame

def get_price(cell):
    if isinstance(cell, str):  # 检查是否是字符串类型
        try:
            cell_dict = eval(cell)  # 将字符串转换为字典
            if 'final_formatted' in cell_dict:
                return cell_dict['final_formatted'] ,cell_dict['currency']
        except:
            return None  # 如果转换失败或不包含关键字，返回 None
    return None  # 对于非字符串类型，返回 None

# 获取包含 'price' 的列名
price_columns = [col for col in raw_df.columns if re.search(r'price', col)]

# 提取价格数据
for col in price_columns:
    raw_df[col] = raw_df[col].apply(get_price)

# 显示提取后的结果
raw_df[price_columns]

Unnamed: 0,price_overview1,price_overview2,price_overview3,price_overview4,price_overview5,price_overview_2weeks_game_1,price_overview_2weeks_game_2,price_overview_2weeks_game_3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,"($9.99, USD)","(CDN$53.49, CAD)","($29.99, USD)","($29.99, USD)",,,"(S$54.00, SGD)"
4,,,,,,,,
...,...,...,...,...,...,...,...,...
20437,,,,,,,,
20438,,,,,,,,
20439,,,,,,,,
20440,,,,,,"(64,99€, EUR)","(₩44,800, KRW)",


In [100]:
raw_df[price_columns].iloc[3,1][0] # '$9.99'

'$9.99'

### 调用工具包获得最新的汇率数据

调用api获得数据

In [101]:
def get_rates(base='USD'):
    """ 获取指定基础货币对于其他货币的汇率 """
    url = f"https://api.exchangerate-api.com/v4/latest/{base}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        rates = data['rates']
        return rates
    else:
        raise Exception(f"Failed to retrieve data: {response.status_code}")

def save_rates_to_json(rates, filename="../../data/external/currency_rates.json"):
    """ 将汇率数据保存到本地JSON文件 """
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(rates, file, ensure_ascii=False, indent=4)

# 获取并打印美元的汇率，同时保存到本地文件
try:
    usd_rates = get_rates('USD')
    print("USD Currency Rates:")
    for currency, rate in usd_rates.items():
        print(f"{currency}: {rate}")
    save_rates_to_json(usd_rates)  # 保存至本地文件
    print("Rates have been saved to currency_rates.json.")
except Exception as e:
    # print(e)
    pass


USD Currency Rates:
USD: 1
AED: 3.67
AFN: 71.8
ALL: 92.39
AMD: 387.53
ANG: 1.79
AOA: 862.09
ARS: 864.75
AUD: 1.5
AWG: 1.79
AZN: 1.7
BAM: 1.8
BBD: 2
BDT: 117.37
BGN: 1.8
BHD: 0.376
BIF: 2859.83
BMD: 1
BND: 1.35
BOB: 6.92
BRL: 5.16
BSD: 1
BTN: 83.19
BWP: 13.58
BYN: 3.27
BZD: 2
CAD: 1.36
CDF: 2756.92
CHF: 0.912
CLP: 901.59
CNY: 7.25
COP: 3875.05
CRC: 516.08
CUP: 24
CVE: 101.49
CZK: 22.7
DJF: 177.72
DKK: 6.87
DOP: 58.87
DZD: 134.54
EGP: 47.51
ERN: 15
ETB: 57.49
EUR: 0.92
FJD: 2.23
FKP: 0.783
FOK: 6.87
GBP: 0.783
GEL: 2.77
GGP: 0.783
GHS: 14.98
GIP: 0.783
GMD: 64.9
GNF: 8564.2
GTQ: 7.76
GYD: 209.32
HKD: 7.81
HNL: 24.69
HRK: 6.93
HTG: 132.65
HUF: 353.65
IDR: 16076.56
ILS: 3.68
IMP: 0.783
INR: 83.19
IQD: 1310.47
IRR: 42046.82
ISK: 137.28
JEP: 0.783
JMD: 155.78
JOD: 0.709
JPY: 157.04
KES: 132.44
KGS: 88.01
KHR: 4088.51
KID: 1.5
KMF: 452.81
KRW: 1361.09
KWD: 0.307
KYD: 0.833
KZT: 441.79
LAK: 21681.02
LBP: 89500
LKR: 301.44
LRD: 193.46
LSL: 18.29
LYD: 4.85
MAD: 9.94
MDL: 17.72
MGA: 4436.18
MKD: 

读取汇率数据列表

In [102]:
with open('../../data/external/currency_rates.json','r',encoding='utf-8') as file:
    exchange_rates = json.load(file)
exchange_rates

{'USD': 1,
 'AED': 3.67,
 'AFN': 71.8,
 'ALL': 92.39,
 'AMD': 387.53,
 'ANG': 1.79,
 'AOA': 862.09,
 'ARS': 864.75,
 'AUD': 1.5,
 'AWG': 1.79,
 'AZN': 1.7,
 'BAM': 1.8,
 'BBD': 2,
 'BDT': 117.37,
 'BGN': 1.8,
 'BHD': 0.376,
 'BIF': 2859.83,
 'BMD': 1,
 'BND': 1.35,
 'BOB': 6.92,
 'BRL': 5.16,
 'BSD': 1,
 'BTN': 83.19,
 'BWP': 13.58,
 'BYN': 3.27,
 'BZD': 2,
 'CAD': 1.36,
 'CDF': 2756.92,
 'CHF': 0.912,
 'CLP': 901.59,
 'CNY': 7.25,
 'COP': 3875.05,
 'CRC': 516.08,
 'CUP': 24,
 'CVE': 101.49,
 'CZK': 22.7,
 'DJF': 177.72,
 'DKK': 6.87,
 'DOP': 58.87,
 'DZD': 134.54,
 'EGP': 47.51,
 'ERN': 15,
 'ETB': 57.49,
 'EUR': 0.92,
 'FJD': 2.23,
 'FKP': 0.783,
 'FOK': 6.87,
 'GBP': 0.783,
 'GEL': 2.77,
 'GGP': 0.783,
 'GHS': 14.98,
 'GIP': 0.783,
 'GMD': 64.9,
 'GNF': 8564.2,
 'GTQ': 7.76,
 'GYD': 209.32,
 'HKD': 7.81,
 'HNL': 24.69,
 'HRK': 6.93,
 'HTG': 132.65,
 'HUF': 353.65,
 'IDR': 16076.56,
 'ILS': 3.68,
 'IMP': 0.783,
 'INR': 83.19,
 'IQD': 1310.47,
 'IRR': 42046.82,
 'ISK': 137.28,
 'JEP':

### 根据字典将price数据全更换为USD

首先检查都有哪些货币类型

In [103]:
price_columns = [col for col in raw_df.columns if re.search(r'price', col)]
currency_type_list = raw_df[price_columns].applymap(lambda x:x[1] if x and len(x)>1 else None).stack().reset_index(drop=True).drop_duplicates().tolist()
currency_type_list

  currency_type_list = raw_df[price_columns].applymap(lambda x:x[1] if x and len(x)>1 else None).stack().reset_index(drop=True).drop_duplicates().tolist()


['USD',
 'CAD',
 'SGD',
 'INR',
 'THB',
 'KRW',
 'HKD',
 'PHP',
 'SAR',
 'CNY',
 'EUR',
 'GBP',
 'ILS',
 'RUB']

In [104]:
'''根据list的值，转换涉及金额的列'''

import re
import pandas as pd

def convert_to_usd(price_tuple):
    if pd.isna(price_tuple):
        return None
    try:
        # 分别提取价格和货币单位
        price = price_tuple[0]
        currency = price_tuple[1]

        # 仅使用正则表达式来提取数值，因为货币单位已经明确提供
        matches = re.search(r'([\$€£₹฿₩₱₪₽¥]?)(\d{1,3}(?:,\d{3})*|\d+)(\.?\d*)', price)
        if not matches:
            return None
        value_str = matches.group(2).replace(',', '')  # 去除数字中的逗号
        value = float(value_str)

        # 转换为 USD
        if currency in exchange_rates:
            return value / exchange_rates[currency]
    except Exception as e:
        print(f"Error converting {price_tuple}: {e}")
        return None


### 整理数据类型

清理id为空的行，清理重复行

In [105]:
raw_df[price_columns] = raw_df[price_columns].applymap(convert_to_usd)
raw_df = raw_df.drop_duplicates()
raw_df = raw_df.dropna(subset='steamid')

  raw_df[price_columns] = raw_df[price_columns].applymap(convert_to_usd)


转换数据类型

In [106]:
# 首先观察所有列的数据类型
raw_df.dtypes.to_dict()

{'steamid': dtype('float64'),
 'communityvisibilitystate': dtype('float64'),
 'profilestate': dtype('float64'),
 'personaname': dtype('O'),
 'profileurl': dtype('O'),
 'avatar': dtype('O'),
 'avatarmedium': dtype('O'),
 'avatarfull': dtype('O'),
 'avatarhash': dtype('O'),
 'personastate': dtype('float64'),
 'primaryclanid': dtype('float64'),
 'timecreated': dtype('<M8[ns]'),
 'personastateflags': dtype('float64'),
 'loccountrycode': dtype('O'),
 'game_count': dtype('float64'),
 'appid_game1': dtype('float64'),
 'name_game1': dtype('O'),
 'playtime_forever_game1': dtype('float64'),
 'appid_game2': dtype('float64'),
 'name_game2': dtype('O'),
 'playtime_forever_game2': dtype('float64'),
 'appid_game3': dtype('float64'),
 'name_game3': dtype('O'),
 'playtime_forever_game3': dtype('float64'),
 'appid_game4': dtype('float64'),
 'name_game4': dtype('O'),
 'playtime_forever_game4': dtype('float64'),
 'appid_game5': dtype('float64'),
 'name_game5': dtype('O'),
 'playtime_forever_game5': dtype(

转换需要调整的数据类型,此处将所有空值转换为0，之后注意filter为0的数据

In [107]:
for i in range(len(raw_df.columns)):
    col = raw_df.columns[i]
    if raw_df[col].dtype == 'float64':
        raw_df[col] = raw_df[col].fillna(0).astype('int64')
    elif raw_df[col].dtype == 'object':        
        raw_df[col] = raw_df[col].fillna('Unknown').astype('str')
    else:
        pass

### 整理列排序

In [108]:
cleaned_df = raw_df[[
       'steamid', 'communityvisibilitystate', 'profilestate', 'personaname',
       'profileurl', 'avatar', 'avatarmedium', 'avatarfull', 'avatarhash',
       'personastate', 'primaryclanid', 'timecreated', 'personastateflags',
       'loccountrycode', 'game_count', 

       'appid_game1', 'name_game1','playtime_forever_game1', 'genres1', 'price_overview1',
       'appid_game2', 'name_game2','playtime_forever_game2', 'genres2','price_overview2',
       'appid_game3', 'name_game3','playtime_forever_game3', 'genres3', 'price_overview3',
       'appid_game4', 'name_game4','playtime_forever_game4', 'genres4', 'price_overview4',
       'appid_game5', 'name_game5','playtime_forever_game5', 'genres5', 'price_overview5',

       'appid_game1_2weeks', 'name_game1_2weeks','playtime_2weeks_game1', 'playtime_forever_game1_2weeks', 'genres_2weeks_game_1','price_overview_2weeks_game_1',
       'appid_game2_2weeks', 'name_game2_2weeks','playtime_2weeks_game2_2weeks', 'playtime_forever_game2_2weeks','genres_2weeks_game_2', 'price_overview_2weeks_game_2',
       'appid_game3_2weeks', 'name_game3_2weeks','playtime_2weeks_game3_2weeks', 'playtime_forever_game3_2weeks', 'genres_2weeks_game_3',
       'price_overview_2weeks_game_3'                
       ]]

In [109]:
cleaned_df.to_csv('../../data/processed/all_steam_and_game_data_after_cleaned.csv')