### 라이브러리 불러오기

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, gc

In [45]:
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',250)

### 공휴일 데이터 불러오기

In [46]:
holiday = pd.read_csv('./subway/holiday.csv', encoding='cp949')
holiday['날짜1'] = pd.to_datetime(holiday['날짜1'])
holiday


Unnamed: 0,날짜1,휴일명
0,1949-01-01,신정
1,1949-01-02,신정
2,1949-01-03,신정
3,1949-03-01,3·1절
4,1949-04-05,식목일
...,...,...
3711,2200-09-23,추석
3712,2200-09-24,추석
3713,2200-10-03,개천절
3714,2200-10-09,한글날


### 지하철 데이터 불러오기 
- 불러온 데이터에서 역 호선을 1~8 으로 매핑

In [47]:
# './subway/' 폴더 내의 '* 20* *.csv' 파일들을 모두 찾아서 리스트로 저장
all_subway = glob.glob(os.path.join('./subway/', '* 20* *.csv'))

# 호선 번호와 호선 이름을 매핑한 딕셔너리 생성
hosun = {f'{i}호선': i for i in range(1, 9)}
print(hosun)

# all_subway 리스트의 각 파일에 대해 반복문 실행
for idx, f in enumerate(all_subway):
    # 파일을 읽어서 DataFrame으로 저장
    subway = pd.read_csv(f, encoding='cp949', low_memory=False)
    
    # '00~01' 열의 결측값을 0으로 대체
    subway['00~01'].fillna(0, inplace=True)
    
    # 5번째 열부터 끝까지 각 열에 대해 데이터 타입 변환 작업 수행
    for c in subway.columns[5:]:
        try:
            # ',' 문자 제거 후 문자열을 숫자로 변환
            subway[c] = subway[c].str.replace(',', '')
            subway[c] = pd.to_numeric(subway[c], errors='ignore')
            subway[c] = subway[c].astype(np.float64)
        except Exception as e:
            # 변환 중 에러가 발생하면 그대로 유지
            subway[c] = pd.to_numeric(subway[c], errors='ignore')
            subway[c] = subway[c].astype(np.float64)
    
    # 첫 번째 파일인 경우 all_sub에 subway를 할당하고, 그 이후 파일은 이어붙임
    all_sub = subway if idx == 0 else pd.concat([all_sub, subway], axis=0)


{'1호선': 1, '2호선': 2, '3호선': 3, '4호선': 4, '5호선': 5, '6호선': 6, '7호선': 7, '8호선': 8}


In [48]:
# '역번호' 열의 데이터 타입을 숫자로 변환하되, 변환할 수 없는 값은 NaN으로 처리
all_sub['역번호'] = pd.to_numeric(all_sub['역번호'], errors='coerce')

# '호선' 열의 결측값을 0으로 대체
all_sub['호선'].fillna(0, inplace=True)


In [49]:
# '역번호' 열의 결측값을 뒤의 다음 유효한 값으로 채움 (뒤로 채우기)
all_sub['역번호'].fillna(method='bfill', inplace=True)

# '역번호' 열의 데이터 타입을 64비트 정수형으로 변환
all_sub['역번호'] = all_sub['역번호'].astype(np.int64)


In [50]:
all_sub[all_sub['역번호'].isna()]

Unnamed: 0,날짜,호선,역번호,역명,구분,05~06,06~07,07~08,08~09,09~10,10~11,11~12,12~13,13~14,14~15,15~16,16~17,17~18,18~19,19~20,20~21,21~22,22~23,23~24,00~01


In [51]:
# '역번호' 열의 고유한 값들을 가져와서 반복문 수행
st = {st_num: list(set(all_sub[all_sub['역번호'] == st_num]['역명']))[0].strip()
      for st_num in all_sub['역번호'].unique()}

# 결과로 생성된 딕셔너리 출력
st


{150: '서울역(150)',
 151: '시청',
 152: '종각(152)',
 153: '종로3가',
 154: '종로5가(154)',
 155: '동대문',
 156: '신설동',
 157: '제기동(157)',
 158: '청량리(지하)(158)',
 159: '동묘앞',
 201: '시청',
 202: '을지로입구(202)',
 203: '을지로3가(203)',
 204: '을지로4가(204)',
 205: '동대문역사문화공원(205)',
 206: '신당',
 207: '상왕십리',
 208: '왕십리(208)',
 209: '한양대(209)',
 210: '뚝섬',
 211: '성수(211)',
 212: '건대입구(212)',
 213: '구의(213)',
 214: '강변(동서울터미널)',
 215: '잠실나루',
 216: '잠실(송파구청)',
 217: '잠실새내',
 218: '종합운동장(218)',
 219: '삼성(219)',
 220: '선릉(220)',
 221: '역삼',
 222: '강남(222)',
 223: '교대(223)',
 224: '서초(224)',
 225: '방배(225)',
 226: '사당(226)',
 227: '낙성대(강감찬)',
 228: '서울대입구(228)',
 229: '봉천',
 230: '신림(230)',
 231: '신대방',
 232: '구로디지털단지',
 233: '대림(233)',
 234: '신도림(234)',
 235: '문래(235)',
 236: '영등포구청',
 237: '당산',
 238: '합정',
 239: '홍대입구(239)',
 240: '신촌(240)',
 241: '이대',
 242: '아현(242)',
 243: '충정로(경기대입구)',
 244: '용답',
 245: '신답(245)',
 246: '신설동(246)',
 247: '도림천',
 248: '양천구청(248)',
 249: '신정네거리(249)',
 250: '용두(동대문구청)',
 309: '지축(

In [52]:
# 빈 딕셔너리 생성
st_line_dict = {}

# '역번호' 열의 고유한 값들을 가져와서 반복문 수행
for st_num in all_sub['역번호'].unique():
    # '역번호'가 특정 값인 행들을 필터링하여 '호선' 열의 고유한 값들을 가져옴
    st_line = list(set(all_sub[all_sub['역번호'] == st_num]['호선']))
    
    # '호선' 값 중에서 문자열이 아니고 0이 아닌 값들을 필터링하여 첫 번째 값을 선택
    st_line = [l for l in st_line if (isinstance(l, str) is False) and (l != 0)][0]
    
    # '역번호'를 키(key)로, 해당 '역번호'에 대응하는 '호선'을 값(value)으로 딕셔너리에 추가
    st_line_dict[st_num] = st_line


In [53]:
# '역번호' 열의 값을 기준으로 st_line_dict 딕셔너리를 사용하여 '호선' 열의 값을 매핑
all_sub['호선'] = all_sub['역번호'].map(st_line_dict)


In [54]:
all_sub

Unnamed: 0,날짜,호선,역번호,역명,구분,05~06,06~07,07~08,08~09,09~10,10~11,11~12,12~13,13~14,14~15,15~16,16~17,17~18,18~19,19~20,20~21,21~22,22~23,23~24,00~01
0,2018-01-01,1,150,서울역,승차,373.0,318.0,365.0,785.0,1047.0,1576.0,2510.0,3233.0,3145.0,2443.0,2980.0,3476.0,3891.0,3227.0,2945.0,2382.0,3070.0,1750.0,781.0,96.0
1,2018-01-01,1,150,서울역,하차,205.0,1040.0,872.0,984.0,1650.0,1743.0,2175.0,2991.0,2877.0,2743.0,2687.0,2885.0,2845.0,2337.0,2131.0,1669.0,1404.0,868.0,477.0,147.0
2,2018-01-01,1,151,시청,승차,87.0,105.0,124.0,197.0,291.0,499.0,722.0,612.0,580.0,821.0,907.0,1027.0,1102.0,1278.0,1163.0,1032.0,975.0,553.0,214.0,9.0
3,2018-01-01,1,151,시청,하차,47.0,294.0,497.0,1017.0,673.0,657.0,820.0,958.0,1009.0,877.0,830.0,781.0,702.0,552.0,388.0,308.0,236.0,160.0,100.0,39.0
4,2018-01-01,1,152,종각,승차,604.0,399.0,191.0,250.0,370.0,439.0,705.0,980.0,1153.0,1392.0,1710.0,2127.0,2172.0,2171.0,1873.0,1935.0,2084.0,1458.0,580.0,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202275,2020-12-31,8,2825,신흥,하차,31.0,95.0,78.0,211.0,145.0,165.0,166.0,164.0,221.0,286.0,341.0,341.0,381.0,428.0,292.0,229.0,219.0,85.0,42.0,0.0
202276,2020-12-31,8,2826,수진,승차,112.0,203.0,475.0,641.0,311.0,196.0,205.0,198.0,238.0,205.0,210.0,236.0,226.0,223.0,121.0,90.0,109.0,35.0,19.0,0.0
202277,2020-12-31,8,2826,수진,하차,15.0,70.0,92.0,227.0,190.0,166.0,155.0,149.0,273.0,238.0,296.0,309.0,300.0,372.0,296.0,232.0,202.0,127.0,45.0,0.0
202278,2020-12-31,8,2827,모란,승차,49.0,88.0,246.0,314.0,176.0,147.0,161.0,157.0,192.0,245.0,282.0,234.0,261.0,282.0,140.0,86.0,112.0,45.0,6.0,0.0


In [55]:
# 2008~2023_data_ver1
all_sub.to_csv('./2008~2023_data_ver1.csv', encoding='cp949', index=False)

In [56]:
def pivot_df(df_in, val_name):
    # 변환할 시간대 열들을 리스트로 정의
    time_columns = ['05~06', '06~07', '07~08', '08~09', '09~10', '10~11', '11~12', '12~13', '13~14',
                    '14~15', '15~16', '16~17', '17~18', '18~19', '19~20', '20~21', '21~22', '22~23', '23~24', '00~01']

    # melt 함수를 사용하여 데이터프레임을 변환
    df_out = pd.melt(df_in, id_vars=['날짜', '호선', '역번호', '역명', '구분'],
                     value_vars=time_columns, var_name='시간', value_name=val_name)

    # 열 이름 영어로 변경
    df_out.columns = ['Date', 'Line_num', 'Station_num', 'Station', 'Division', 'Time', val_name]

    # 'Date'와 'Time' 열을 기준으로 정렬
    df_out.sort_values(['Date', 'Time'], inplace=True)

    # 인덱스 재설정
    df_out.reset_index(drop=True, inplace=True)

    return df_out


def merge_holiday(left_df, right_df, left='Date', right='날짜1', how ='left'):
    
    df_new = pd.merge(left_df, right_df, left_on=left, right_on=right, how=how)
    
    # 공휴일 컬럼 만들기 공휴일일때 1 아닐때 0
    df_new['holiday'] = df_new['휴일명'].apply(lambda x: 1 if pd.notnull(x) else 0)

    # 필요없는 컬럼 삭제 
    df_new.drop(['날짜1','휴일명'], axis=1, inplace=True)
    
    df_new['Date'] = pd.to_datetime(df_new['Date'])

    # 월요일 : 1, 화요일 : 2, ..., 일요일 : 7
    df_new['weekday'] = df_new['Date'].dt.weekday + 1
    df_new['weekday'] = df_new['weekday'].replace(7,0)

    # 토요일, 일요일, 공휴일 일때 휴일 1로 지정 
    df_new['holiday'] = np.where((df_new['holiday'] == 1) | (df_new['weekday'].isin([0, 6])), 1, 0)
    df_new['holiday'].value_counts()
    return df_new

In [57]:
# pivot_df 함수를 사용하여 all_sub 데이터프레임을 변환하여 time_mel_df에 저장
time_mel_df = pivot_df(df_in=all_sub, val_name='flow')

# 변환된 데이터프레임인 time_mel_df 출력
time_mel_df


Unnamed: 0,Date,Line_num,Station_num,Station,Division,Time,flow
0,2008-01-01,1,150,서울역(150),승차,00~01,264.0
1,2008-01-01,1,150,서울역(150),하차,00~01,558.0
2,2008-01-01,1,151,시청(151),승차,00~01,974.0
3,2008-01-01,1,151,시청(151),하차,00~01,185.0
4,2008-01-01,1,152,종각(152),승차,00~01,1559.0
...,...,...,...,...,...,...,...
60904095,2023-04-30,8,2826,수진,하차,23~24,117.0
60904096,2023-04-30,8,2827,모란,승차,23~24,70.0
60904097,2023-04-30,8,2827,모란,하차,23~24,54.0
60904098,2023-04-30,8,2828,남위례,승차,23~24,62.0


In [58]:
# Division 열의 문자열에서 좌우 공백을 제거(strip)
time_mel_df.Division = time_mel_df.Division.str.strip()

In [59]:
# 'Date' 열의 데이터 타입을 datetime으로 변환
time_mel_df['Date'] = pd.to_datetime(time_mel_df['Date'])


In [60]:
# merge_holiday 함수를 호출하여 time_mel_df와 holiday 데이터프레임을 병합하여 merged_df에 저장
merged_df = merge_holiday(left_df=time_mel_df, right_df=holiday)


In [62]:
# merged_df에서 'Division'과 'flow' 열을 선택하여 pivot 함수를 사용하여 변환
flow_df = merged_df[['Division', 'flow']].pivot(columns='Division', values='flow')

In [64]:
# 'Division'과 'flow' 열을 제외한 열들을 선택하여 info_df에 저장
info_df = merged_df[merged_df.columns.difference(['Division', 'flow'])]

# info_df 출력
info_df


Unnamed: 0,Date,Line_num,Station,Station_num,Time,holiday,weekday
0,2008-01-01,1,서울역(150),150,00~01,1,2
1,2008-01-01,1,서울역(150),150,00~01,1,2
2,2008-01-01,1,시청(151),151,00~01,1,2
3,2008-01-01,1,시청(151),151,00~01,1,2
4,2008-01-01,1,종각(152),152,00~01,1,2
...,...,...,...,...,...,...,...
60904095,2023-04-30,8,수진,2826,23~24,1,0
60904096,2023-04-30,8,모란,2827,23~24,1,0
60904097,2023-04-30,8,모란,2827,23~24,1,0
60904098,2023-04-30,8,남위례,2828,23~24,1,0


In [65]:
# flow_df의 첫 번째 열에 해당하는 값들을 선택하여 NaN이 아닌 행들로 이루어진 get_on_v 데이터프레임 생성
get_on_v = pd.DataFrame(flow_df.iloc[:, 0]).dropna()

# flow_df의 두 번째 열에 해당하는 값들을 선택하여 NaN이 아닌 행들로 이루어진 get_off_v 데이터프레임 생성
get_off_v = pd.DataFrame(flow_df.iloc[:, 1]).dropna()



In [66]:
# get_on_v와 get_off_v를 reset_index를 사용하여 인덱스 재설정 후, 열 방향(axis=1)으로 병합하여 rec_flow 데이터프레임 생성
rec_flow = pd.concat([get_on_v.reset_index(drop=True), get_off_v.reset_index(drop=True)], axis=1)

# rec_flow의 열 이름을 'geton'과 'getoff'로 변경
rec_flow.columns = ['geton', 'getoff']

# 'geton'과 'getoff' 열을 더하여 'get_all' 열 생성
rec_flow['get_all'] = rec_flow['geton'] + rec_flow['getoff']

# info_df에서 get_on_v의 인덱스에 해당하는 행들을 선택하여 인덱스 재설정 후, rec_flow와 열 방향으로 병합하여 recon_df 데이터프레임 생성
recon_df = pd.concat([info_df.loc[get_on_v.index, :].reset_index(drop=True), rec_flow], axis=1)

# recon_df 출력
recon_df


Unnamed: 0,Date,Line_num,Station,Station_num,Time,holiday,weekday,geton,getoff,get_all
0,2008-01-01,1,서울역(150),150,00~01,1,2,264.0,558.0,822.0
1,2008-01-01,1,시청(151),151,00~01,1,2,974.0,185.0,1159.0
2,2008-01-01,1,종각(152),152,00~01,1,2,1559.0,210.0,1769.0
3,2008-01-01,1,종로3가(153),153,00~01,1,2,1499.0,384.0,1883.0
4,2008-01-01,1,종로5가(154),154,00~01,1,2,140.0,42.0,182.0
...,...,...,...,...,...,...,...,...,...,...
30452055,2023-04-30,8,단대오거리,2824,23~24,1,0,32.0,,
30452056,2023-04-30,8,신흥,2825,23~24,1,0,27.0,,
30452057,2023-04-30,8,수진,2826,23~24,1,0,31.0,,
30452058,2023-04-30,8,모란,2827,23~24,1,0,70.0,,


In [67]:
recon_df.isna().sum()

Date            0
Line_num        0
Station         0
Station_num     0
Time            0
holiday         0
weekday         0
geton           0
getoff         20
get_all        20
dtype: int64

In [68]:
recon_df.dropna(axis=0, inplace=True)

In [69]:

time_map = {
    '05~06': '05:00',
    '06~07': '06:00',
    '07~08': '07:00',
    '08~09': '08:00',
    '09~10': '09:00',
    '10~11': '10:00',
    '11~12': '11:00',
    '12~13': '12:00',
    '13~14': '13:00',
    '14~15': '14:00',
    '15~16': '15:00',
    '16~17': '16:00',
    '17~18': '17:00',
    '18~19': '18:00',
    '19~20': '19:00',
    '20~21': '20:00',
    '21~22': '21:00',
    '22~23': '22:00',
    '23~24': '23:00',
    '00~01': '00:00'
}

In [70]:
# 'Time' 열의 값을 time_map을 사용하여 매핑(mapping)
recon_df['Time'] = recon_df['Time'].map(time_map)

In [71]:
recon_df

Unnamed: 0,Date,Line_num,Station,Station_num,Time,holiday,weekday,geton,getoff,get_all
0,2008-01-01,1,서울역(150),150,00:00,1,2,264.0,558.0,822.0
1,2008-01-01,1,시청(151),151,00:00,1,2,974.0,185.0,1159.0
2,2008-01-01,1,종각(152),152,00:00,1,2,1559.0,210.0,1769.0
3,2008-01-01,1,종로3가(153),153,00:00,1,2,1499.0,384.0,1883.0
4,2008-01-01,1,종로5가(154),154,00:00,1,2,140.0,42.0,182.0
...,...,...,...,...,...,...,...,...,...,...
30452035,2023-04-30,7,대림(구로구청),2746,23:00,1,0,121.0,281.0,402.0
30452036,2023-04-30,7,남구로,2747,23:00,1,0,81.0,114.0,195.0
30452037,2023-04-30,7,가산디지털단지,2748,23:00,1,0,105.0,117.0,222.0
30452038,2023-04-30,7,철산,2749,23:00,1,0,194.0,54.0,248.0


In [72]:
weather = pd.read_csv('./weather/2008~2023_weather.csv', encoding='cp949')
weather.shape

(134370, 8)

In [73]:
# weather 데이터프레임의 '강수량(mm)'와 '적설(cm)' 열의 NaN 값을 0으로 대체하여 채우기
weather[['강수량(mm)', '적설(cm)']] = weather[['강수량(mm)', '적설(cm)']].fillna(0)

# 변경된 weather 데이터프레임 출력
weather


Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),적설(cm)
0,108,서울,2008-01-01 00:00,-7.0,0.0,3.5,52.0,0.0
1,108,서울,2008-01-01 01:00,-7.3,0.0,4.9,52.0,0.0
2,108,서울,2008-01-01 02:00,-7.5,0.0,4.1,52.0,0.0
3,108,서울,2008-01-01 03:00,-7.8,0.0,3.9,52.0,0.0
4,108,서울,2008-01-01 04:00,-7.9,0.0,4.0,53.0,0.0
...,...,...,...,...,...,...,...,...
134365,108,서울,2023-04-30 19:00,14.3,0.0,3.0,74.0,0.0
134366,108,서울,2023-04-30 20:00,13.7,0.0,3.0,80.0,0.0
134367,108,서울,2023-04-30 21:00,13.3,0.1,3.2,83.0,0.0
134368,108,서울,2023-04-30 22:00,12.8,0.0,1.8,81.0,0.0


In [74]:
# '일시' 열의 데이터 타입을 datetime으로 변환
weather['일시'] = pd.to_datetime(weather['일시'])

# '일시' 열에서 날짜를 추출하여 'Date' 열 생성
weather['Date'] = weather['일시'].dt.strftime('%Y-%m-%d')

# '일시' 열에서 시간을 추출하여 'Time' 열 생성
weather['Time'] = weather['일시'].dt.strftime('%H:%M')

# 필요없는 열들을 삭제하고 열 이름 변경
weather = weather.drop(columns=['지점', '지점명', '일시'])
weather.columns = ['Temp', 'Rainfall_amt', 'Wind_speed', 'Humidity', 'Snow_amt', 'Date', 'Time']

# 변경된 weather 데이터프레임 출력
weather


Unnamed: 0,Temp,Rainfall_amt,Wind_speed,Humidity,Snow_amt,Date,Time
0,-7.0,0.0,3.5,52.0,0.0,2008-01-01,00:00
1,-7.3,0.0,4.9,52.0,0.0,2008-01-01,01:00
2,-7.5,0.0,4.1,52.0,0.0,2008-01-01,02:00
3,-7.8,0.0,3.9,52.0,0.0,2008-01-01,03:00
4,-7.9,0.0,4.0,53.0,0.0,2008-01-01,04:00
...,...,...,...,...,...,...,...
134365,14.3,0.0,3.0,74.0,0.0,2023-04-30,19:00
134366,13.7,0.0,3.0,80.0,0.0,2023-04-30,20:00
134367,13.3,0.1,3.2,83.0,0.0,2023-04-30,21:00
134368,12.8,0.0,1.8,81.0,0.0,2023-04-30,22:00


In [75]:
# 날짜별로 평균을 계산하여 w_mean 데이터프레임 생성
w_mean = weather.groupby(['Date']).mean()

# 열 이름에 '_mean'을 추가하여 열 이름 변경
w_mean.columns = [f'{c}_mean' for c in w_mean.columns]

# 날짜별로 강수량과 적설량을 합산하여 w_sum 데이터프레임 생성
w_sum = weather.groupby(['Date'])[['Rainfall_amt', 'Snow_amt']].sum()

# 열 이름에 '_sum'을 추가하여 열 이름 변경
w_sum.columns = [f'{c}_sum' for c in w_sum.columns]

# 날짜별로 최고 기온을 계산하여 w_max 데이터프레임 생성
w_max = weather.groupby(['Date'])[['Temp']].max()

# 열 이름에 '_max'를 추가하여 열 이름 변경
w_max.columns = [f'{c}_max' for c in w_max.columns]

# 날짜별로 최저 기온을 계산하여 w_min 데이터프레임 생성
w_min = weather.groupby(['Date'])[['Temp']].min()

# 열 이름에 '_min'을 추가하여 열 이름 변경
w_min.columns = [f'{c}_min' for c in w_min.columns]

# w_mean, w_sum, w_max, w_min을 열 방향으로 병합하여 w_stats 데이터프레임 생성
w_stats = pd.concat([w_mean, w_sum, w_max, w_min], axis=1)

# 최고 기온과 최저 기온의 차이를 계산하여 'Temp_diff' 열 생성
w_stats['Temp_diff'] = np.abs(w_max.values - w_min.values)

# w_stats를 데이터프레임으로 변환하고 인덱스를 재설정
w_stats = pd.DataFrame(w_stats).reset_index(drop=False)

# w_stats 출력
display(w_stats)


  w_mean = weather.groupby(['Date']).mean()


Unnamed: 0,Date,Temp_mean,Rainfall_amt_mean,Wind_speed_mean,Humidity_mean,Snow_amt_mean,Rainfall_amt_sum,Snow_amt_sum,Temp_max,Temp_min,Temp_diff
0,2008-01-01,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9
1,2008-01-02,-3.720833,0.000000,2.216667,49.708333,0.0,0.0,0.0,0.4,-7.1,7.5
2,2008-01-03,0.866667,0.000000,2.416667,51.500000,0.0,0.0,0.0,6.4,-3.0,9.4
3,2008-01-04,-0.379167,0.000000,2.116667,55.583333,0.0,0.0,0.0,3.5,-4.0,7.5
4,2008-01-05,1.504167,0.000000,1.466667,60.500000,0.0,0.0,0.0,5.6,-2.0,7.6
...,...,...,...,...,...,...,...,...,...,...,...
5594,2023-04-26,10.225000,0.000000,3.233333,71.041667,0.0,0.0,0.0,13.2,8.0,5.2
5595,2023-04-27,12.504167,0.000000,1.983333,64.041667,0.0,0.0,0.0,19.8,5.2,14.6
5596,2023-04-28,15.175000,0.012500,2.166667,59.958333,0.0,0.3,0.0,20.9,9.1,11.8
5597,2023-04-29,13.670833,1.112500,2.725000,90.000000,0.0,26.7,0.0,15.7,9.9,5.8


In [76]:
# weather와 w_stats를 'Date' 열을 기준으로 left 조인하여 weather 데이터프레임에 추가
weather = pd.merge(left=weather, right=w_stats, how='left', on='Date')

# 변경된 weather 데이터프레임 출력
weather


Unnamed: 0,Temp,Rainfall_amt,Wind_speed,Humidity,Snow_amt,Date,Time,Temp_mean,Rainfall_amt_mean,Wind_speed_mean,Humidity_mean,Snow_amt_mean,Rainfall_amt_sum,Snow_amt_sum,Temp_max,Temp_min,Temp_diff
0,-7.0,0.0,3.5,52.0,0.0,2008-01-01,00:00,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9
1,-7.3,0.0,4.9,52.0,0.0,2008-01-01,01:00,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9
2,-7.5,0.0,4.1,52.0,0.0,2008-01-01,02:00,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9
3,-7.8,0.0,3.9,52.0,0.0,2008-01-01,03:00,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9
4,-7.9,0.0,4.0,53.0,0.0,2008-01-01,04:00,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134365,14.3,0.0,3.0,74.0,0.0,2023-04-30,19:00,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6
134366,13.7,0.0,3.0,80.0,0.0,2023-04-30,20:00,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6
134367,13.3,0.1,3.2,83.0,0.0,2023-04-30,21:00,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6
134368,12.8,0.0,1.8,81.0,0.0,2023-04-30,22:00,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6


In [77]:
def merge_weather(sub_df, weather_df):
    """
    지하철 데이터와 날씨 데이터를 병합하는 함수입니다.
    
    Args:
        sub_df (DataFrame): 지하철 데이터프레임.
        weather_df (DataFrame): 날씨 데이터프레임.
        
    Returns:
        DataFrame: 병합된 지하철-날씨 데이터프레임.
    """
    # 날짜 열을 datetime 형식으로 변환
    sub_df['Date'] = pd.to_datetime(sub_df['Date'])
    weather_df['Date'] = pd.to_datetime(weather_df['Date'])
    
    # 지하철과 날씨 데이터를 날짜와 시간을 기준으로 left 조인하여 병합
    subway_weather_df = pd.merge(sub_df, weather_df, on=['Date', 'Time'], how='left')
    
    # 날짜와 시간을 기준으로 정렬
    subway_weather_df.sort_values(['Date', 'Time'], inplace=True)
    subway_weather_df.reset_index(drop=True, inplace=True)
    
    # 시간, 연도, 월, 일을 추출하여 열로 추가
    subway_weather_df['hour'] = pd.to_datetime(subway_weather_df['Time']).dt.hour
    subway_weather_df["year"] = subway_weather_df["Date"].dt.year
    subway_weather_df["month"] = subway_weather_df["Date"].dt.month
    subway_weather_df["day"] = subway_weather_df["Date"].dt.day
    
    return subway_weather_df



In [78]:
# recon_df와 weather를 이용하여 merge_weather 함수를 호출하여 dfnew_all에 저장
dfnew_all = merge_weather(sub_df=recon_df, weather_df=weather)


In [79]:
dfnew_all

Unnamed: 0,Date,Line_num,Station,Station_num,Time,holiday,weekday,geton,getoff,get_all,Temp,Rainfall_amt,Wind_speed,Humidity,Snow_amt,Temp_mean,Rainfall_amt_mean,Wind_speed_mean,Humidity_mean,Snow_amt_mean,Rainfall_amt_sum,Snow_amt_sum,Temp_max,Temp_min,Temp_diff,hour,year,month,day
0,2008-01-01,1,서울역(150),150,00:00,1,2,264.0,558.0,822.0,-7.0,0.0,3.5,52.0,0.0,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9,0,2008,1,1
1,2008-01-01,1,시청(151),151,00:00,1,2,974.0,185.0,1159.0,-7.0,0.0,3.5,52.0,0.0,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9,0,2008,1,1
2,2008-01-01,1,종각(152),152,00:00,1,2,1559.0,210.0,1769.0,-7.0,0.0,3.5,52.0,0.0,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9,0,2008,1,1
3,2008-01-01,1,종로3가(153),153,00:00,1,2,1499.0,384.0,1883.0,-7.0,0.0,3.5,52.0,0.0,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9,0,2008,1,1
4,2008-01-01,1,종로5가(154),154,00:00,1,2,140.0,42.0,182.0,-7.0,0.0,3.5,52.0,0.0,-6.337500,0.000000,3.766667,47.541667,0.0,0.0,0.0,-3.4,-8.3,4.9,0,2008,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30452035,2023-04-30,7,대림(구로구청),2746,23:00,1,0,121.0,281.0,402.0,12.1,0.0,2.7,86.0,0.0,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6,23,2023,4,30
30452036,2023-04-30,7,남구로,2747,23:00,1,0,81.0,114.0,195.0,12.1,0.0,2.7,86.0,0.0,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6,23,2023,4,30
30452037,2023-04-30,7,가산디지털단지,2748,23:00,1,0,105.0,117.0,222.0,12.1,0.0,2.7,86.0,0.0,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6,23,2023,4,30
30452038,2023-04-30,7,철산,2749,23:00,1,0,194.0,54.0,248.0,12.1,0.0,2.7,86.0,0.0,12.479167,0.004167,3.233333,72.208333,0.0,0.1,0.0,17.5,7.9,9.6,23,2023,4,30


In [80]:
dfnew_all.isnull().sum()

Date                    0
Line_num                0
Station                 0
Station_num             0
Time                    0
holiday                 0
weekday                 0
geton                   0
getoff                  0
get_all                 0
Temp                 1366
Rainfall_amt         1366
Wind_speed           1366
Humidity             1366
Snow_amt             1366
Temp_mean            1366
Rainfall_amt_mean    1366
Wind_speed_mean      1366
Humidity_mean        1366
Snow_amt_mean        1366
Rainfall_amt_sum     1366
Snow_amt_sum         1366
Temp_max             1366
Temp_min             1366
Temp_diff            1366
hour                    0
year                    0
month                   0
day                     0
dtype: int64

In [97]:
dfnew_all = dfnew_all.drop(dfnew_all[dfnew_all['get_all'] == 0].index)


In [101]:
dfnew_all.dropna(axis=0, inplace=True)

In [104]:
dfnew_all.to_csv('./2008~2023_data.csv',encoding='cp949', index=False)

In [105]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path, encoding='cp949')
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [106]:
csv_to_parquet('./2008~2023_data.csv', '2008~2023_data')

2008~2023_data Done.


: 