In [1]:
import os
import pandas as pd 

In [2]:
data_dir = 'data'
file_list = os.listdir(data_dir)
file_list

['2023년도 데이터.xlsx',
 '2023년.csv',
 '2022년도 데이터.xlsx',
 '2022년.csv',
 '2021년도 데이터.xlsx',
 '2021년.csv',
 'total_df.pkl']

In [3]:
ob_list = [file for file in file_list if file.endswith('xlsx')]
ob_list

['2023년도 데이터.xlsx', '2022년도 데이터.xlsx', '2021년도 데이터.xlsx']

In [4]:
we_list = [file for file in file_list if file.endswith('csv')]
we_list

['2023년.csv', '2022년.csv', '2021년.csv']

### 측정소 파일 통합

In [5]:
%%time
ob_df_list = []
for file in ob_list:
    df = pd.read_excel(os.path.join(data_dir, file))
    ob_df_list.append(df)

CPU times: total: 1min 21s
Wall time: 1min 21s


In [6]:
ob_df = pd.concat(ob_df_list)
ob_df.reset_index(drop = True, inplace = True)

In [7]:
ob_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595516 entries, 0 to 595515
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   측정소코드   595516 non-null  int64  
 1   측정소명    595516 non-null  object 
 2   일시      595516 non-null  object 
 3   PM2.5   583563 non-null  float64
 4   풍향      594970 non-null  float64
 5   풍속      594860 non-null  float64
 6   온도      594371 non-null  float64
 7   습도      594368 non-null  float64
dtypes: float64(5), int64(1), object(2)
memory usage: 36.3+ MB


In [8]:
ob_df.head()

Unnamed: 0,측정소코드,측정소명,일시,PM2.5,풍향,풍속,온도,습도
0,111261,강남구,2023/01/01 00:00:00,37.0,99.0,1.0,1.6,62.0
1,111261,강남구,2023/01/01 01:00:00,53.0,276.0,0.8,1.1,66.0
2,111261,강남구,2023/01/01 02:00:00,53.0,322.0,1.1,2.3,63.0
3,111261,강남구,2023/01/01 03:00:00,56.0,281.0,1.3,2.4,65.0
4,111261,강남구,2023/01/01 04:00:00,59.0,299.0,1.8,1.9,63.0


### 기상청 파일 통합

In [9]:
%%time
we_df_list = []
for file in we_list:
    df = pd.read_csv(os.path.join(data_dir, file), encoding='cp949')
    we_df_list.append(df)
we_df = pd.concat(we_df_list)
we_df.reset_index(drop = True, inplace = True)

CPU times: total: 438 ms
Wall time: 668 ms


In [10]:
we_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 679576 entries, 0 to 679575
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   지점       679576 non-null  int64  
 1   지점명      679576 non-null  object 
 2   일시       679576 non-null  object 
 3   강수량(mm)  679576 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 20.7+ MB


In [11]:
we_df.head()

Unnamed: 0,지점,지점명,일시,강수량(mm)
0,116,관악(레),2023-01-01 01:00,0.0
1,116,관악(레),2023-01-01 02:00,0.0
2,116,관악(레),2023-01-01 03:00,0.0
3,116,관악(레),2023-01-01 04:00,0.0
4,116,관악(레),2023-01-01 05:00,0.0


### 측정소 및 기상청 데이터 컬럼 변경

In [12]:
pd.unique(ob_df.측정소명)

array(['강남구', '강동구', '강북구', '강서구', '관악구', '광진구', '구로구', '금천구', '노원구',
       '도봉구', '동대문구', '동작구', '마포구', '서대문구', '서초구', '성동구', '성북구', '송파구',
       '양천구', '영등포구', '용산구', '은평구', '종로구', '중구', '중랑구'], dtype=object)

In [13]:
pd.unique(we_df.지점명)

array(['관악(레)', '강남', '서초', '강동', '송파', '강서', '양천', '도봉', '노원', '동대문',
       '중랑', '기상청', '마포', '서대문', '광진', '성북', '용산', '은평', '금천', '한강', '중구',
       '성동', '구로', '강북', '남현', '관악', '영등포 ', '현충원'], dtype=object)

In [14]:
ob_df.측정소명= ob_df.측정소명.map(lambda st: '중구' if st == '중구' else st[:len(st)-1])
pd.unique(ob_df.측정소명)

array(['강남', '강동', '강북', '강서', '관악', '광진', '구로', '금천', '노원', '도봉', '동대문',
       '동작', '마포', '서대문', '서초', '성동', '성북', '송파', '양천', '영등포', '용산', '은평',
       '종로', '중구', '중랑'], dtype=object)

In [15]:
# we_df.지점명 = we_df.지점명.map(lambda st: '관악' if st == '관악(레)' else st.strip())
# pd.unique(we_df.지점명)

In [16]:
pd.unique(ob_df[~ob_df.측정소명.isin(we_df.지점명)]['측정소명'])

array(['동작', '영등포', '종로'], dtype=object)

In [17]:
pd.unique(we_df[~we_df.지점명.isin(ob_df.측정소명)]['지점명'])

array(['관악(레)', '기상청', '한강', '남현', '영등포 ', '현충원'], dtype=object)

In [18]:
ob_df.일시 = ob_df.일시.map(lambda st: st.replace('/','-'))
ob_df.head()

Unnamed: 0,측정소코드,측정소명,일시,PM2.5,풍향,풍속,온도,습도
0,111261,강남,2023-01-01 00:00:00,37.0,99.0,1.0,1.6,62.0
1,111261,강남,2023-01-01 01:00:00,53.0,276.0,0.8,1.1,66.0
2,111261,강남,2023-01-01 02:00:00,53.0,322.0,1.1,2.3,63.0
3,111261,강남,2023-01-01 03:00:00,56.0,281.0,1.3,2.4,65.0
4,111261,강남,2023-01-01 04:00:00,59.0,299.0,1.8,1.9,63.0


In [19]:
we_df.head()

Unnamed: 0,지점,지점명,일시,강수량(mm)
0,116,관악(레),2023-01-01 01:00,0.0
1,116,관악(레),2023-01-01 02:00,0.0
2,116,관악(레),2023-01-01 03:00,0.0
3,116,관악(레),2023-01-01 04:00,0.0
4,116,관악(레),2023-01-01 05:00,0.0


In [20]:
we_df.일시 = we_df.일시.map(lambda st: st+':00')
we_df.head()

Unnamed: 0,지점,지점명,일시,강수량(mm)
0,116,관악(레),2023-01-01 01:00:00,0.0
1,116,관악(레),2023-01-01 02:00:00,0.0
2,116,관악(레),2023-01-01 03:00:00,0.0
3,116,관악(레),2023-01-01 04:00:00,0.0
4,116,관악(레),2023-01-01 05:00:00,0.0


### 측정소 및 기상청 데이터 병합

In [21]:
merge_df = ob_df.merge(we_df, left_on = ['일시', '측정소명'], right_on = ['일시', '지점명'])
merge_df.head()

Unnamed: 0,측정소코드,측정소명,일시,PM2.5,풍향,풍속,온도,습도,지점,지점명,강수량(mm)
0,111261,강남,2023-01-01 01:00:00,53.0,276.0,0.8,1.1,66.0,400,강남,0.0
1,111261,강남,2023-01-01 02:00:00,53.0,322.0,1.1,2.3,63.0,400,강남,0.0
2,111261,강남,2023-01-01 03:00:00,56.0,281.0,1.3,2.4,65.0,400,강남,0.0
3,111261,강남,2023-01-01 04:00:00,59.0,299.0,1.8,1.9,63.0,400,강남,0.0
4,111261,강남,2023-01-01 05:00:00,47.0,296.0,2.1,1.1,50.0,400,강남,0.0


In [22]:
merge_df = merge_df.loc[:, ['일시', '측정소명', '풍향', '풍속', '온도', '습도', '강수량(mm)', 'PM2.5']]
merge_df[:3]

Unnamed: 0,일시,측정소명,풍향,풍속,온도,습도,강수량(mm),PM2.5
0,2023-01-01 01:00:00,강남,276.0,0.8,1.1,66.0,0.0,53.0
1,2023-01-01 02:00:00,강남,322.0,1.1,2.3,63.0,0.0,53.0
2,2023-01-01 03:00:00,강남,281.0,1.3,2.4,65.0,0.0,56.0


In [23]:
merge_df.rename(columns={'강수량(mm)':'강수량', 'PM2.5':'PM'}, inplace = True)
merge_df

Unnamed: 0,일시,측정소명,풍향,풍속,온도,습도,강수량,PM
0,2023-01-01 01:00:00,강남,276.0,0.8,1.1,66.0,0.0,53.0
1,2023-01-01 02:00:00,강남,322.0,1.1,2.3,63.0,0.0,53.0
2,2023-01-01 03:00:00,강남,281.0,1.3,2.4,65.0,0.0,56.0
3,2023-01-01 04:00:00,강남,299.0,1.8,1.9,63.0,0.0,59.0
4,2023-01-01 05:00:00,강남,296.0,2.1,1.1,50.0,0.0,47.0
...,...,...,...,...,...,...,...,...
516883,2021-12-31 19:00:00,중랑,299.0,1.5,-5.5,27.0,0.0,7.0
516884,2021-12-31 20:00:00,중랑,339.0,1.6,-5.9,27.0,0.0,8.0
516885,2021-12-31 21:00:00,중랑,347.0,1.8,-6.5,29.0,0.0,7.0
516886,2021-12-31 22:00:00,중랑,12.0,1.7,-7.0,29.0,0.0,7.0


### 측정소별 결측값 확인

In [24]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 516888 entries, 0 to 516887
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   일시      516888 non-null  object 
 1   측정소명    516888 non-null  object 
 2   풍향      516394 non-null  float64
 3   풍속      516312 non-null  float64
 4   온도      515887 non-null  float64
 5   습도      515864 non-null  float64
 6   강수량     516888 non-null  float64
 7   PM      505994 non-null  float64
dtypes: float64(6), object(2)
memory usage: 35.5+ MB


In [25]:
d_range = pd.date_range('2021-01-01', '2023-09-30')
d_range

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10',
               ...
               '2023-09-21', '2023-09-22', '2023-09-23', '2023-09-24',
               '2023-09-25', '2023-09-26', '2023-09-27', '2023-09-28',
               '2023-09-29', '2023-09-30'],
              dtype='datetime64[ns]', length=1003, freq='D')

In [26]:
from datetime import datetime, timedelta

dt_range = []
for d in d_range:
    dt = d
    for i in range(0, 24):
        dt_range.append(str(dt))
        dt = dt + timedelta(hours = 1)

In [27]:
dt_range

['2021-01-01 00:00:00',
 '2021-01-01 01:00:00',
 '2021-01-01 02:00:00',
 '2021-01-01 03:00:00',
 '2021-01-01 04:00:00',
 '2021-01-01 05:00:00',
 '2021-01-01 06:00:00',
 '2021-01-01 07:00:00',
 '2021-01-01 08:00:00',
 '2021-01-01 09:00:00',
 '2021-01-01 10:00:00',
 '2021-01-01 11:00:00',
 '2021-01-01 12:00:00',
 '2021-01-01 13:00:00',
 '2021-01-01 14:00:00',
 '2021-01-01 15:00:00',
 '2021-01-01 16:00:00',
 '2021-01-01 17:00:00',
 '2021-01-01 18:00:00',
 '2021-01-01 19:00:00',
 '2021-01-01 20:00:00',
 '2021-01-01 21:00:00',
 '2021-01-01 22:00:00',
 '2021-01-01 23:00:00',
 '2021-01-02 00:00:00',
 '2021-01-02 01:00:00',
 '2021-01-02 02:00:00',
 '2021-01-02 03:00:00',
 '2021-01-02 04:00:00',
 '2021-01-02 05:00:00',
 '2021-01-02 06:00:00',
 '2021-01-02 07:00:00',
 '2021-01-02 08:00:00',
 '2021-01-02 09:00:00',
 '2021-01-02 10:00:00',
 '2021-01-02 11:00:00',
 '2021-01-02 12:00:00',
 '2021-01-02 13:00:00',
 '2021-01-02 14:00:00',
 '2021-01-02 15:00:00',
 '2021-01-02 16:00:00',
 '2021-01-02 17:

In [28]:
df_range = pd.DataFrame({'전체일시':dt_range})
df_range

Unnamed: 0,전체일시
0,2021-01-01 00:00:00
1,2021-01-01 01:00:00
2,2021-01-01 02:00:00
3,2021-01-01 03:00:00
4,2021-01-01 04:00:00
...,...
24067,2023-09-30 19:00:00
24068,2023-09-30 20:00:00
24069,2023-09-30 21:00:00
24070,2023-09-30 22:00:00


In [29]:
for k in pd.unique(merge_df.측정소명):
    df = merge_df[merge_df.측정소명 == k]
    dt_missing = len(df_range[~df_range.전체일시.isin(df.일시)])
    pm_missing = len(df[df.PM.isnull()])
    print(f'{k} : {dt_missing}, {round(dt_missing/len(dt_range)*100, 2)}%')
    print(f'{k} : {pm_missing}, {round(pm_missing/len(dt_range)*100, 2)}%')
    print(f'{k} : {dt_missing + pm_missing}, {round((dt_missing + pm_missing)/len(dt_range)*100, 2)}%')
    print('-' *100)

강남 : 354, 1.47%
강남 : 239, 0.99%
강남 : 593, 2.46%
----------------------------------------------------------------------------------------------------
강동 : 1492, 6.2%
강동 : 399, 1.66%
강동 : 1891, 7.86%
----------------------------------------------------------------------------------------------------
강북 : 348, 1.45%
강북 : 323, 1.34%
강북 : 671, 2.79%
----------------------------------------------------------------------------------------------------
강서 : 394, 1.64%
강서 : 321, 1.33%
강서 : 715, 2.97%
----------------------------------------------------------------------------------------------------
관악 : 497, 2.06%
관악 : 357, 1.48%
관악 : 854, 3.55%
----------------------------------------------------------------------------------------------------
광진 : 404, 1.68%
광진 : 286, 1.19%
광진 : 690, 2.87%
----------------------------------------------------------------------------------------------------
구로 : 331, 1.38%
구로 : 1062, 4.41%
구로 : 1393, 5.79%
-------------------------------------------------------

In [30]:
len(dt_range)

24072

In [31]:
len(merge_df)

516888

In [32]:
df = merge_df[merge_df.측정소명 == '중랑']
df

Unnamed: 0,일시,측정소명,풍향,풍속,온도,습도,강수량,PM
134543,2023-01-01 01:00:00,중랑,99.0,0.6,1.2,67.0,0.0,54.0
134544,2023-01-01 02:00:00,중랑,229.0,0.7,1.8,65.0,0.0,60.0
134545,2023-01-01 03:00:00,중랑,36.0,1.0,0.5,71.0,0.0,65.0
134546,2023-01-01 04:00:00,중랑,158.0,0.6,0.8,69.0,0.0,62.0
134547,2023-01-01 05:00:00,중랑,7.0,0.9,0.8,48.0,0.0,53.0
...,...,...,...,...,...,...,...,...
516883,2021-12-31 19:00:00,중랑,299.0,1.5,-5.5,27.0,0.0,7.0
516884,2021-12-31 20:00:00,중랑,339.0,1.6,-5.9,27.0,0.0,8.0
516885,2021-12-31 21:00:00,중랑,347.0,1.8,-6.5,29.0,0.0,7.0
516886,2021-12-31 22:00:00,중랑,12.0,1.7,-7.0,29.0,0.0,7.0


In [33]:
dt_missing = df_range[~df_range.전체일시.isin(df.일시)]
dt_missing

Unnamed: 0,전체일시
0,2021-01-01 00:00:00
1264,2021-02-22 16:00:00
1265,2021-02-22 17:00:00
2080,2021-03-28 16:00:00
2081,2021-03-28 17:00:00
...,...
23436,2023-09-04 12:00:00
23437,2023-09-04 13:00:00
23438,2023-09-04 14:00:00
23439,2023-09-04 15:00:00


In [34]:
pd.DataFrame({
    '컬럼명':[0, 1, 2],
    '컬럼명2':[2, 3, 4]
})

Unnamed: 0,컬럼명,컬럼명2
0,0,2
1,1,3
2,2,4


In [35]:
import numpy as np
df_missing = pd.DataFrame({ '일시' : list(dt_missing.전체일시),
               '측정소명' : ['중랑' for i in dt_missing.전체일시], 
               '풍향': [np.nan  for i in dt_missing.전체일시], 
               '풍속': [np.nan  for i in dt_missing.전체일시],
               '습도' : [np.nan  for i in dt_missing.전체일시], 
               '강수량':[np.nan  for i in dt_missing.전체일시],
               'PM' : [np.nan  for i in dt_missing.전체일시]
})
df_missing

Unnamed: 0,일시,측정소명,풍향,풍속,습도,강수량,PM
0,2021-01-01 00:00:00,중랑,,,,,
1,2021-02-22 16:00:00,중랑,,,,,
2,2021-02-22 17:00:00,중랑,,,,,
3,2021-03-28 16:00:00,중랑,,,,,
4,2021-03-28 17:00:00,중랑,,,,,
...,...,...,...,...,...,...,...
661,2023-09-04 12:00:00,중랑,,,,,
662,2023-09-04 13:00:00,중랑,,,,,
663,2023-09-04 14:00:00,중랑,,,,,
664,2023-09-04 15:00:00,중랑,,,,,


In [36]:
df = pd.concat([df, df_missing])
df.set_index(df.일시, drop= True, inplace= True)
df

Unnamed: 0_level_0,일시,측정소명,풍향,풍속,온도,습도,강수량,PM
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-01 01:00:00,2023-01-01 01:00:00,중랑,99.0,0.6,1.2,67.0,0.0,54.0
2023-01-01 02:00:00,2023-01-01 02:00:00,중랑,229.0,0.7,1.8,65.0,0.0,60.0
2023-01-01 03:00:00,2023-01-01 03:00:00,중랑,36.0,1.0,0.5,71.0,0.0,65.0
2023-01-01 04:00:00,2023-01-01 04:00:00,중랑,158.0,0.6,0.8,69.0,0.0,62.0
2023-01-01 05:00:00,2023-01-01 05:00:00,중랑,7.0,0.9,0.8,48.0,0.0,53.0
...,...,...,...,...,...,...,...,...
2023-09-04 12:00:00,2023-09-04 12:00:00,중랑,,,,,,
2023-09-04 13:00:00,2023-09-04 13:00:00,중랑,,,,,,
2023-09-04 14:00:00,2023-09-04 14:00:00,중랑,,,,,,
2023-09-04 15:00:00,2023-09-04 15:00:00,중랑,,,,,,


In [37]:
df.sort_index(inplace = True)
df

Unnamed: 0_level_0,일시,측정소명,풍향,풍속,온도,습도,강수량,PM
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-01 00:00:00,2021-01-01 00:00:00,중랑,,,,,,
2021-01-01 01:00:00,2021-01-01 01:00:00,중랑,149.0,0.7,-9.1,58.0,0.0,12.0
2021-01-01 02:00:00,2021-01-01 02:00:00,중랑,146.0,0.9,-9.3,60.0,0.0,13.0
2021-01-01 03:00:00,2021-01-01 03:00:00,중랑,119.0,0.6,-9.2,61.0,0.0,11.0
2021-01-01 04:00:00,2021-01-01 04:00:00,중랑,101.0,0.7,-9.3,63.0,0.0,12.0
...,...,...,...,...,...,...,...,...
2023-09-30 19:00:00,2023-09-30 19:00:00,중랑,285.0,1.1,21.7,87.0,0.0,21.0
2023-09-30 20:00:00,2023-09-30 20:00:00,중랑,284.0,1.4,21.2,91.0,0.0,26.0
2023-09-30 21:00:00,2023-09-30 21:00:00,중랑,279.0,1.2,20.5,89.0,0.0,23.0
2023-09-30 22:00:00,2023-09-30 22:00:00,중랑,301.0,1.7,20.2,59.0,0.0,23.0


In [38]:
print(df.풍향.isnull().sum())
df.풍향.interpolate(method='linear' , inplace= True)
print(df.풍향.isnull().sum())

679
1


In [39]:
df.풍속.interpolate(method='linear', inplace= True)
df.온도.interpolate(method='linear', inplace= True)
df.습도.interpolate(method='linear', inplace= True)
df.강수량.interpolate(method='linear', inplace= True)
df.PM.interpolate(method='linear', inplace= True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24072 entries, 2021-01-01 00:00:00 to 2023-09-30 23:00:00
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   일시      24072 non-null  object 
 1   측정소명    24072 non-null  object 
 2   풍향      24071 non-null  float64
 3   풍속      24071 non-null  float64
 4   온도      24071 non-null  float64
 5   습도      24071 non-null  float64
 6   강수량     24071 non-null  float64
 7   PM      24071 non-null  float64
dtypes: float64(6), object(2)
memory usage: 1.7+ MB


In [40]:
df[df.일시.str.contains('2023-09-04')]

Unnamed: 0_level_0,일시,측정소명,풍향,풍속,온도,습도,강수량,PM
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-09-04 00:00:00,2023-09-04 00:00:00,중랑,127.0,1.0,25.2,94.0,0.0,7.0
2023-09-04 01:00:00,2023-09-04 01:00:00,중랑,69.0,1.1,25.1,96.0,0.0,8.0
2023-09-04 02:00:00,2023-09-04 02:00:00,중랑,27.0,0.9,25.0,97.0,0.0,10.0
2023-09-04 03:00:00,2023-09-04 03:00:00,중랑,62.0,0.9,24.9,96.0,0.0,9.0
2023-09-04 04:00:00,2023-09-04 04:00:00,중랑,66.0,1.2,24.4,100.0,0.0,11.0
2023-09-04 05:00:00,2023-09-04 05:00:00,중랑,206.0,1.0,23.9,100.0,0.0,10.0
2023-09-04 06:00:00,2023-09-04 06:00:00,중랑,36.0,1.4,24.1,100.0,0.0,6.0
2023-09-04 07:00:00,2023-09-04 07:00:00,중랑,29.0,1.1,24.3,100.0,0.0,7.0
2023-09-04 08:00:00,2023-09-04 08:00:00,중랑,9.0,1.6,25.2,98.0,0.0,5.0
2023-09-04 09:00:00,2023-09-04 09:00:00,중랑,10.0,1.7,26.1,92.0,0.0,5.0


In [41]:
# 일시	측정소명	풍향	풍속	온도	습도	강수량	PM
import numpy as np

total_list = []

for k in pd.unique(merge_df.측정소명):
    df = merge_df[merge_df.측정소명 == k]
    dt_missing = df_range[~df_range.전체일시.isin(df.일시)]
    df_missing = pd.DataFrame({ '일시' : list(dt_missing.전체일시),
                   '측정소명' : [k for i in dt_missing.전체일시], 
                   '풍향': [np.nan  for i in dt_missing.전체일시], 
                   '풍속': [np.nan  for i in dt_missing.전체일시],
                   '습도' : [np.nan  for i in dt_missing.전체일시], 
                   '강수량':[np.nan  for i in dt_missing.전체일시],
                   'PM' : [np.nan  for i in dt_missing.전체일시]
    })
    df = pd.concat([df, df_missing])
    df.set_index(df.일시, drop= True, inplace= True)

    df.sort_index(inplace = True)
    
    df.풍향.interpolate(method='linear', inplace= True)
    df.풍속.interpolate(method='linear', inplace= True)
    df.온도.interpolate(method='linear', inplace= True)
    df.습도.interpolate(method='linear', inplace= True)
    df.강수량.interpolate(method='linear', inplace= True)
    df.PM.interpolate(method='linear', inplace= True)
    total_list.append(df)

In [42]:
total_df = pd.concat(total_list)

In [43]:
total_df.reset_index(drop=True, inplace = True)

In [44]:
total_df

Unnamed: 0,일시,측정소명,풍향,풍속,온도,습도,강수량,PM
0,2021-01-01 00:00:00,강남,,,,,,
1,2021-01-01 01:00:00,강남,327.0,1.3,-7.0,55.0,0.0,13.0
2,2021-01-01 02:00:00,강남,309.0,1.1,-6.8,55.0,0.0,13.0
3,2021-01-01 03:00:00,강남,45.0,0.9,-7.0,57.0,0.0,12.0
4,2021-01-01 04:00:00,강남,304.0,0.6,-7.5,57.0,0.0,13.0
...,...,...,...,...,...,...,...,...
529579,2023-09-30 19:00:00,중랑,285.0,1.1,21.7,87.0,0.0,21.0
529580,2023-09-30 20:00:00,중랑,284.0,1.4,21.2,91.0,0.0,26.0
529581,2023-09-30 21:00:00,중랑,279.0,1.2,20.5,89.0,0.0,23.0
529582,2023-09-30 22:00:00,중랑,301.0,1.7,20.2,59.0,0.0,23.0


In [45]:
total_df.dropna(inplace = True)

In [46]:
total_df.to_pickle('./data/total_df.pkl')

In [47]:
total_df.to_csv('./data/total_df.csv')