In [339]:
import pandas as pd
import numpy as np

# 1. Load the raw Chicago Marathon results. The path is relative, so the
#    notebook must be run from the directory that contains ./data.
chicago_df = pd.read_csv("./data/chicago_data.csv")

In [340]:
# 2. Rename the split-time columns so they use the same names as the Boston
#    dataset. The mapping's values are also the list of columns that the
#    timedelta-conversion step iterates over.
split_mapping = dict([
    ('5km.time', '5K'),
    ('10km.time', '10K'),
    ('15km.time', '15K'),
    ('20km.time', '20K'),
    ('half.time', 'Half'),
    ('25km.time', '25K'),
    ('30km.time', '30K'),
    ('35km.time', '35K'),
    ('40km.time', '40K'),
    ('finish.time', 'Official Time'),
])
chicago_df = chicago_df.rename(columns=split_mapping)

In [341]:
# 3. Parse the string-formatted split/finish times into pandas timedeltas.
#    errors='coerce' turns values that cannot be parsed into NaT.
time_cols = list(split_mapping.values())
chicago_df[time_cols] = chicago_df[time_cols].apply(pd.to_timedelta, errors='coerce')

In [342]:
# 4. Add the official finish time expressed in hours (float) as a new column.
chicago_df["Official Time Hour"] = (
    chicago_df["Official Time"].dt.total_seconds().div(3600)
)

In [343]:
# 5. 연령대 그룹화 (20대, 30대, ...) - 보스턴 데이터와 통일된 형식

def get_age_group(val):
    """Return the lower bound of an age-class label, floored at 20.

    The first two characters of ``str(val)`` are parsed as an integer
    (e.g. ``'20-24'`` -> 20, ``'35-39'`` -> 35). Parsed values below 20
    are raised to 20.

    Args:
        val: Age-class label such as ``'20-24'``.

    Returns:
        The parsed lower bound (at least 20), or ``np.nan`` when the first
        two characters are not an integer (e.g. ``'MT53'`` or a missing value).
    """
    try:
        lower_bound = int(str(val)[:2])
    except ValueError:
        # Only a failed integer parse means "unknown age class"; any other
        # error (or an interrupt) is allowed to propagate.
        return np.nan
    return max(lower_bound, 20)

# Derive the "Age group" column element-wise from each age_class label.
chicago_df["Age group"] = chicago_df["age_class"].apply(get_age_group)

In [344]:
# 6. Normalize gender labels: man/woman -> M/F.
#    Series.map with a dict leaves any value that is not a key as NaN.
chicago_df["Gender"] = chicago_df["gender"].map({"man": "M", "woman": "F"})

In [345]:
# 7. Pace at each checkpoint in minutes per km
#    (cumulative split time in seconds / (distance in km * 60)).
#    The pace columns are written directly under their final names, so
#    re-running this cell simply overwrites them instead of producing
#    duplicate column names through a rename. This cell deliberately does not
#    reuse the name `split_mapping`, which the timedelta-conversion step
#    iterates over.
dist_cols = ["5K", "10K", "15K", "20K", "Half", "25K", "30K", "35K", "40K"]
dists = [5, 10, 15, 20, 21.0975, 25, 30, 35, 40]
pace_cols = ["5p", "10p", "15p", "20p", "Halfp", "25p", "30p", "35p", "40p"]

for col, dist, pace_col in zip(dist_cols, dists, pace_cols):
    chicago_df[pace_col] = chicago_df[col].dt.total_seconds() / (dist * 60)

In [346]:
# chicago_df.info()


In [347]:
# 8. Remove every column whose name ends with '.time_of_day'.
cols_to_drop = list(
    filter(lambda name: name.endswith(".time_of_day"), chicago_df.columns)
)
chicago_df = chicago_df.drop(columns=cols_to_drop)

def age_group_by_upper(val):
    """Return the upper bound of a hyphenated age-class label.

    Args:
        val: Label of the form ``'<low>-<high>'``, e.g. ``'20-24'``.

    Returns:
        The text after the first hyphen parsed as an int (``'20-24'`` -> 24),
        or ``np.nan`` when the label has no hyphen or that part is not an
        integer (this includes missing values).
    """
    parts = str(val).split('-')
    try:
        return int(parts[1])
    except (IndexError, ValueError):
        # IndexError: no hyphen in the label; ValueError: non-numeric upper part.
        return np.nan

# Drop rows whose age_class is missing or equal to "MT53", then replace each
# remaining label with its upper bound (e.g. '20-24' -> 24).
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
valid_age_class = chicago_df["age_class"].notna() & (chicago_df["age_class"] != "MT53")
chicago_df = chicago_df[valid_age_class].copy()
chicago_df['age_class'] = chicago_df['age_class'].map(age_group_by_upper)



In [348]:
# Inspect column dtypes and non-null counts after the age_class cleanup.
chicago_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26950 entries, 0 to 26999
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   name                26950 non-null  object         
 1   gender              26950 non-null  object         
 2   country             26950 non-null  object         
 3   bib                 0 non-null      float64        
 4   age_class           26861 non-null  float64        
 5   finish_time         26950 non-null  object         
 6   place_gender        26950 non-null  int64          
 7   place_overall       26950 non-null  int64          
 8   details_url         26950 non-null  object         
 9   city_state          26927 non-null  object         
 10  start.time          26950 non-null  object         
 11  5K                  26805 non-null  timedelta64[ns]
 12  10K                 26912 non-null  timedelta64[ns]
 13  15K                 26916 non-null  

In [349]:
# Rename features. All renames are applied at once, so the M/F column
# "Gender" becomes "M/F" while "place_gender" takes over the name "Gender".
chicago_df = chicago_df.rename(
    columns={
        "bib": "Bib",
        "name": "Name",
        "Gender": "M/F",
        "place_gender": "Gender",
        "place_overall": "Overall",
    }
)


In [350]:
# Bucket an age value into a coarse age group labelled by the bucket's lower
# edge: <30 -> 20, 30-39 -> 30, 40-59 -> 40, 60-79 -> 60, 80-99 -> 80, >=100 -> 100.
def group_calculator(val):
    """Map an age value to its age-group label.

    Args:
        val: Age as a number or a numeric string.

    Returns:
        One of 20, 30, 40, 60, 80, 100, or ``np.nan`` when ``val`` is missing
        (NaN/None) or cannot be converted to a float.
    """
    try:
        age = float(val)
    except (TypeError, ValueError):
        return np.nan

    if np.isnan(age):
        # NaN fails every "<" comparison below and would otherwise be
        # reported as the 100 group.
        return np.nan

    if age < 30:
        return 20
    if age < 40:
        return 30
    if age < 60:
        return 40
    if age < 80:
        return 60
    if age < 100:
        return 80
    return 100

    
# Build the 'Age group' column (20, 30, 40, 60, 80, 100) from age_class.
chicago_df['Age group'] = chicago_df['age_class'].apply(group_calculator)

In [351]:

# Drop runners without an official finish time.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
chicago_df = chicago_df[chicago_df["Official Time"].notna()].copy()

chicago_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 26928 entries, 0 to 26999
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   Name                26928 non-null  object         
 1   gender              26928 non-null  object         
 2   country             26928 non-null  object         
 3   Bib                 0 non-null      float64        
 4   age_class           26861 non-null  float64        
 5   finish_time         26928 non-null  object         
 6   Gender              26928 non-null  int64          
 7   Overall             26928 non-null  int64          
 8   details_url         26928 non-null  object         
 9   city_state          26927 non-null  object         
 10  start.time          26928 non-null  object         
 11  5K                  26805 non-null  timedelta64[ns]
 12  10K                 26912 non-null  timedelta64[ns]
 13  15K                 26916 non-null  

In [352]:
# Add the 'Pace' column: average pace over the whole race.

# Express the official finish time in minutes as a float.
chicago_df["Official Time (min)"] = (
    pd.to_timedelta(chicago_df["Official Time"]).dt.total_seconds().div(60)
)

# Divide by the full marathon distance (42.195 km) to get minutes per km.
chicago_df["Pace"] = chicago_df["Official Time (min)"].div(42.195)

def format_pace(pace_float):
    """Format a pace given in minutes (float) as an ``M:SS`` string.

    The value is rounded to whole seconds first and only then split into
    minutes and seconds, so a value such as 5.9999 becomes ``'6:00'`` rather
    than the invalid ``'5:60'``.

    Args:
        pace_float: Pace in minutes, e.g. ``5.5`` for five and a half minutes.

    Returns:
        str: The pace as ``'<minutes>:<seconds>'`` with two-digit seconds.

    Raises:
        ValueError: If ``pace_float`` is NaN (it cannot be rounded to an int).
    """
    total_seconds = int(round(pace_float * 60))
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes}:{seconds:02d}"

# Replace the numeric 'Pace' values with their formatted string form.
chicago_df["Pace"] = chicago_df["Pace"].map(format_pace)



In [353]:
# Drop the source/helper columns that are no longer needed, then inspect the
# resulting frame.
chicago_df = chicago_df.drop(
    labels=[
        'gender',
        'age_class',
        'Official Time (min)',
        'country',
        'details_url',
        'start.time',
    ],
    axis="columns",
)


chicago_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26928 entries, 0 to 26999
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   Name                26928 non-null  object         
 1   Bib                 0 non-null      float64        
 2   finish_time         26928 non-null  object         
 3   Gender              26928 non-null  int64          
 4   Overall             26928 non-null  int64          
 5   city_state          26927 non-null  object         
 6   5K                  26805 non-null  timedelta64[ns]
 7   10K                 26912 non-null  timedelta64[ns]
 8   15K                 26916 non-null  timedelta64[ns]
 9   20K                 26918 non-null  timedelta64[ns]
 10  Half                26920 non-null  timedelta64[ns]
 11  25K                 26921 non-null  timedelta64[ns]
 12  30K                 26922 non-null  timedelta64[ns]
 13  35K                 26923 non-null  

In [354]:
# 목표 그룹 함수 정의
def time_group(t):
    """Label a finish time given in hours with its target group.

    Args:
        t: Finish time in hours.

    Returns:
        str: "Sub-3" when t < 3.0, "Sub-4" when t < 4.0, "Sub-5" when
        t < 5.0, and "5h+" for everything else.
    """
    thresholds = ((3.0, "Sub-3"), (4.0, "Sub-4"), (5.0, "Sub-5"))
    for upper_limit, label in thresholds:
        if t < upper_limit:
            return label
    return "5h+"

# Create the target-group column from the finish time in hours.
chicago_df['target_group'] = chicago_df['Official Time Hour'].map(time_group)

In [355]:
import numpy as np

# Total number of rows
num_rows = len(chicago_df)

# Assign every runner a unique synthetic bib number: a random permutation of
# 1..num_rows. The generator is seeded so the saved file is identical across
# re-runs of the notebook.
bib_rng = np.random.default_rng(42)
chicago_df["Bib"] = bib_rng.permutation(np.arange(1, num_rows + 1))


In [356]:
# Save the fully preprocessed frame without the row index.
# NOTE(review): the recorded run of this cell ended in
# "PermissionError: [Errno 13]" — presumably the target file was open in
# another program or ./data was not writable; confirm before re-running.
chicago_df.to_csv(path_or_buf="./data/chicago_data_processed.csv", index=False)
print("[✓] chicago_data_processed.csv 저장 완료")

PermissionError: [Errno 13] Permission denied: './data/chicago_data_processed.csv'