In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [40]:
def datetime(train, aws_train, type_num):
    # merge
    train = pd.merge(train, aws_train, left_on=['일시', '연도'], right_on=['일시', '연도'], how='left')
    train['type'] = type_num
    train.drop(['지점', '측정소'], axis=1, inplace=True)

    # 날짜형으로 변경
    year = {
        0: '2017-',
        1: '2018-',
        2: '2019-',
        3: '2020-'
    }
    train['연도'] = train['연도'].replace(year)
    train['날짜'] = train['연도'] + train['일시']
    train.drop(['연도', '일시'], axis=1, inplace=True)

    return train


def datetime_split(train):
    from datetime import datetime, time

    year = []
    month = []
    day = []
    hour = []
    min = []
    for i in range(len(train)):
        date_string = train['날짜'][i]

        # 문자열을 datetime 객체로 변환
        datetime_obj = pd.to_datetime(date_string)

        year.append(datetime_obj.year)
        month.append(datetime_obj.month)
        day.append(datetime_obj.day)
        hour.append(datetime_obj.hour)
        min.append(datetime_obj.minute)

    train['년'] = year
    train['월'] = month
    train['일'] = day
    train['시'] = hour
    
    train = train.sort_values('날짜')
    train.drop(['날짜'], axis=1, inplace=True)

    return train

In [41]:
# 결측치 처리
def imputer(train):
    from sklearn.impute import KNNImputer

    #임퓨터 선언(5개의 평균으로 계산하겠다)
    imputer = KNNImputer(n_neighbors=5)

    #임퓨터를 사용하여 filled_train으로 저장 이후 같은 임퓨터를 사용할때는 imputer.transform()으로 사용하면됨
    filled_train = imputer.fit_transform(train)

    #사용하면 array값으로 나오기때문에 dataframe으로 바꿔주고 컬럼을가져옴
    train = pd.DataFrame(filled_train, columns=train.columns) 
    
    return train

In [49]:
# 위 3가지 통틀어서
def preprocessing(train, aws_train, type_num):
    train = datetime(train, aws_train, type_num)
    train = datetime_split(train)
    train = imputer(train)

    return train

# 1. train 데이터 전처리 

In [50]:
gongju = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/공주.csv')
noeun = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/노은동.csv')
nonsan = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/논산.csv')
daecheon = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/대천2동.csv')
dokgog = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/독곶리.csv')
dongmoon = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/동문동.csv')
mozong = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/모종동.csv')
moonchang = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/문창동.csv')
seongseong = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/성성동.csv')
sinbang = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/신방동.csv')
sinheung = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/신흥동.csv')
areum = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/아름동.csv')
yesan = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/예산군.csv')
emnae = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/읍내동.csv')
ewon = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/이원면.csv')
junglim = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/정림동.csv')
hongseong = pd.read_csv('/content/drive/MyDrive/pm/TRAIN/홍성읍.csv')

In [51]:
gongju_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/공주.csv')
noeun_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/계룡.csv')
nonsan_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/논산.csv')
daecheon_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/대천항.csv')
dokgog_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/대산.csv')
dongmoon_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/태안.csv')
mozong_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/아산.csv')
moonchang_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/오월드.csv')
seongseong_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/성거.csv')
sinbang_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/성거.csv')
sinheung_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/세종연서.csv')
areum_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/세종고운.csv')
yesan_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/예산.csv')
emnae_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/장동.csv')
ewon_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/태안.csv')
junglim_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/오월드.csv')
hongseong_aws = pd.read_csv('/content/drive/MyDrive/pm/TRAIN_AWS/홍북.csv')

In [52]:
gongju_train = preprocessing(gongju, gongju_aws, 0)
noeun_train = preprocessing(noeun, noeun_aws, 1)
nonsan_train = preprocessing(nonsan, nonsan_aws, 2)
daecheon_train = preprocessing(daecheon, daecheon_aws, 3)
dokgog_train = preprocessing(dokgog, dokgog_aws, 4)
dongmoon_train = preprocessing(dongmoon, dongmoon_aws, 5)
mozong_train = preprocessing(mozong, mozong_aws, 6)
moonchang_train = preprocessing(moonchang, moonchang_aws, 7)
seongseong_train = preprocessing(seongseong, seongseong_aws, 8)
sinbang_train = preprocessing(sinbang, sinbang_aws, 9)
sinheung_train = preprocessing(sinheung, sinheung_aws, 10)
areum_train = preprocessing(areum, areum_aws, 11)
yesan_train = preprocessing(yesan, yesan_aws, 12)
emnae_train = preprocessing(emnae, emnae_aws, 13)
ewon_train = preprocessing(ewon, ewon_aws, 14)
junglim_train = preprocessing(junglim, junglim_aws, 15)
hongseong_train = preprocessing(hongseong, hongseong_aws, 16)

# 2. train 데이터 ML 

In [55]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [59]:
def modeling(train):
    from catboost import CatBoostRegressor, Pool
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    x = train.drop(['PM2.5'], axis=1)
    y = train.loc[:, 'PM2.5']

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)

    model = CatBoostRegressor(learning_rate=0.1, depth=6, iterations=100)
    #train_data = Pool(x_train, y_train, cat_features=categorical_features_indices)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)

    # 모델 평가 (예: 평균 제곱 오차)
    mse = mean_squared_error(y_val, y_pred)
    print(mse)

    return model

In [60]:
model_gongju = modeling(gongju_train)
model_noeun = modeling(noeun_train)
model_nonsan = modeling(nonsan_train)
model_daecheon = modeling(daecheon_train)
model_dokgog = modeling(dokgog_train)
model_dongmoon = modeling(dongmoon_train)
model_mozong = modeling(mozong_train)
model_moonchang = modeling(moonchang_train)
model_seongseong = modeling(seongseong_train)
model_sinbang = modeling(sinbang_train)
model_sinheung = modeling(sinheung_train)
model_areum = modeling(areum_train)
model_yesan = modeling(yesan_train)
model_emnae = modeling(emnae_train)
model_ewon = modeling(ewon_train)
model_junglim = modeling(junglim_train)
model_hongseong = modeling(hongseong_train)

0:	learn: 0.0707121	total: 18.9ms	remaining: 1.87s
1:	learn: 0.0695188	total: 37.2ms	remaining: 1.82s
2:	learn: 0.0684675	total: 52.5ms	remaining: 1.7s
3:	learn: 0.0674915	total: 68.6ms	remaining: 1.65s
4:	learn: 0.0667270	total: 90.3ms	remaining: 1.72s
5:	learn: 0.0659937	total: 105ms	remaining: 1.64s
6:	learn: 0.0653153	total: 124ms	remaining: 1.65s
7:	learn: 0.0647675	total: 144ms	remaining: 1.66s
8:	learn: 0.0642457	total: 163ms	remaining: 1.65s
9:	learn: 0.0638336	total: 177ms	remaining: 1.59s
10:	learn: 0.0634976	total: 195ms	remaining: 1.58s
11:	learn: 0.0629911	total: 220ms	remaining: 1.61s
12:	learn: 0.0625199	total: 235ms	remaining: 1.57s
13:	learn: 0.0621946	total: 253ms	remaining: 1.55s
14:	learn: 0.0618856	total: 273ms	remaining: 1.55s
15:	learn: 0.0615301	total: 303ms	remaining: 1.59s
16:	learn: 0.0612970	total: 319ms	remaining: 1.56s
17:	learn: 0.0609284	total: 339ms	remaining: 1.54s
18:	learn: 0.0606215	total: 352ms	remaining: 1.5s
19:	learn: 0.0603513	total: 374ms	rema

# 3. test 전처리

In [63]:
def datetime(test, aws_test, type_num):
    # merge
    test = pd.merge(test, aws_test, left_on=['일시', '연도'], right_on=['일시', '연도'], how='left')
    test['type'] = type_num
    test.drop(['지점', '측정소'], axis=1, inplace=True)

    # 날짜형으로 변경
    year = {
    4: '2021-'
    }
    test['연도'] = test['연도'].replace(year)
    test['날짜'] = test['연도'] + test['일시']
    test.drop(['연도', '일시'], axis=1, inplace=True)

    return test


def datetime_split(test):
    from datetime import datetime, time

    year = []
    month = []
    day = []
    hour = []
    min = []
    for i in range(len(test)):
        date_string = test['날짜'][i]

        # 문자열을 datetime 객체로 변환
        datetime_obj = pd.to_datetime(date_string)

        year.append(datetime_obj.year)
        month.append(datetime_obj.month)
        day.append(datetime_obj.day)
        hour.append(datetime_obj.hour)
        min.append(datetime_obj.minute)

    test['년'] = year
    test['월'] = month
    test['일'] = day
    test['시'] = hour
    
    test = test.sort_values('날짜')
    test.drop(['날짜'], axis=1, inplace=True)

    return test

In [64]:
# 결측치 처리
def imputer(test):
    from sklearn.impute import KNNImputer

    y_test = test.iloc[:, 0]
    test = test.drop(['PM2.5'], axis=1)

    #임퓨터 선언(5개의 평균으로 계산하겠다)
    imputer = KNNImputer(n_neighbors=5)

    #임퓨터를 사용하여 filled_train으로 저장 이후 같은 임퓨터를 사용할때는 imputer.transform()으로 사용하면됨
    filled_test = imputer.fit_transform(test)

    #사용하면 array값으로 나오기때문에 dataframe으로 바꿔주고 컬럼을가져옴
    test = pd.DataFrame(filled_test, columns=test.columns) 
    
    return test, y_test

In [73]:
# 위 3가지 통틀어서
def preprocessing(test, aws_test, type_num):
    test = datetime(test, aws_test, type_num)
    test = datetime_split(test)
    test, y_test = imputer(test)

    return test, y_test

In [74]:
gongju_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/공주.csv')
noeun_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/노은동.csv')
nonsan_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/논산.csv')
daecheon_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/대천2동.csv')
dokgog_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/독곶리.csv')
dongmoon_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/동문동.csv')
mozong_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/모종동.csv')
moonchang_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/문창동.csv')
seongseong_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/성성동.csv')
sinbang_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/신방동.csv')
sinheung_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/신흥동.csv')
areum_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/아름동.csv')
yesan_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/예산군.csv')
emnae_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/읍내동.csv')
ewon_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/이원면.csv')
junglim_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/정림동.csv')
hongseong_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_INPUT/홍성읍.csv')

In [75]:
gongju_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/공주.csv')
noeun_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/계룡.csv')
nonsan_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/논산.csv')
daecheon_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/대천항.csv')
dokgog_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/대산.csv')
dongmoon_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/태안.csv')
mozong_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/아산.csv')
moonchang_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/오월드.csv')
seongseong_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/성거.csv')
sinbang_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/성거.csv')
sinheung_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/세종연서.csv')
areum_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/세종고운.csv')
yesan_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/예산.csv')
emnae_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/장동.csv')
ewon_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/태안.csv')
junglim_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/오월드.csv')
hongseong_aws_test = pd.read_csv('/content/drive/MyDrive/pm/TEST_AWS/홍북.csv')

In [76]:
gongju_test, gongju_test_y = preprocessing(gongju_test, gongju_aws_test, 0)
noeun_test, noeun_test_y = preprocessing(noeun_test, noeun_aws_test, 1)
nonsan_test, nonsan_test_y = preprocessing(nonsan_test, nonsan_aws_test, 2)
daecheon_test, daecheon_test_y = preprocessing(daecheon_test, daecheon_aws_test, 3)
dokgog_test, dokgog_test_y = preprocessing(dokgog_test, dokgog_aws_test, 4)
dongmoon_test, dongmoon_test_y = preprocessing(dongmoon_test, dongmoon_aws_test, 5)
mozong_test, mozong_test_y = preprocessing(mozong_test, mozong_aws_test, 6)
moonchang_test, moonchang_test_y = preprocessing(moonchang_test, moonchang_aws_test, 7)
seongseong_test, seongseong_test_y = preprocessing(seongseong_test, seongseong_aws_test, 8)
sinbang_test, sinbang_test_y = preprocessing(sinbang_test, sinbang_aws_test, 9)
sinheung_test, sinheung_test_y = preprocessing(sinheung_test, sinheung_aws_test, 10)
areum_test, areum_test_y = preprocessing(areum_test, areum_aws_test, 11)
yesan_test, yesan_test_y = preprocessing(yesan_test, yesan_aws_test, 12)
emnae_test, emnae_test_y = preprocessing(emnae_test, emnae_aws_test, 13)
ewon_test, ewon_test_y = preprocessing(ewon_test, ewon_aws_test, 14)
junglim_test, junglim_test_y = preprocessing(junglim_test, junglim_aws_test, 15)
hongseong_test, hongseong_test_y = preprocessing(hongseong_test, hongseong_aws_test, 16)

# 4. test 예측

In [78]:
def modeling2(model, test):
    y_pred = model.predict(test)
    return y_pred

In [80]:
y_pred_gongju = modeling2(model_gongju, gongju_test)
y_pred_noeun = modeling2(model_noeun, noeun_test)
y_pred_nonsan = modeling2(model_nonsan, nonsan_test)
y_pred_daecheon = modeling2(model_daecheon, daecheon_test)
y_pred_dokgog = modeling2(model_dokgog, dokgog_test)
y_pred_dongmoon = modeling2(model_dongmoon, dongmoon_test)
y_pred_mozong = modeling2(model_mozong, mozong_test)
y_pred_moonchang = modeling2(model_moonchang, moonchang_test)
y_pred_seongseong = modeling2(model_seongseong, seongseong_test)
y_pred_sinbang = modeling2(model_sinbang, sinbang_test)
y_pred_sinheung = modeling2(model_sinheung, sinheung_test)
y_pred_areum = modeling2(model_areum, areum_test)
y_pred_yesan = modeling2(model_yesan, yesan_test)
y_pred_emnae = modeling2(model_emnae, emnae_test)
y_pred_ewon = modeling2(model_ewon, ewon_test)
y_pred_junglim = modeling2(model_junglim, junglim_test)
y_pred_hongseong = modeling2(model_hongseong, hongseong_test)

# 5. 저장 

In [114]:
answer = pd.read_csv('/content/drive/MyDrive/pm/answer_sample.csv')

In [109]:
def save(region, test_y, y_pred):
    pred_df = pd.DataFrame(test_y)
    pred_df['pred'] = y_pred
    pred_df = pred_df[pred_df.isnull().any(axis=1)].reset_index(drop=True)
    pred_df = pred_df.drop(['PM2.5'], axis=1)
    answer.loc[answer['측정소'] == region, 'PM2.5'] = pred_df.values

In [110]:
save('공주', gongju_test_y, y_pred_gongju)
save('노은동', noeun_test_y, y_pred_noeun)
save('논산', nonsan_test_y, y_pred_nonsan)
save('대천2동', daecheon_test_y, y_pred_daecheon)
save('독곶리', dokgog_test_y, y_pred_dokgog)
save('동문동', dongmoon_test_y, y_pred_dongmoon)
save('모종동', mozong_test_y, y_pred_mozong)
save('문창동', moonchang_test_y, y_pred_moonchang)
save('성성동', seongseong_test_y, y_pred_seongseong)
save('신방동', sinbang_test_y, y_pred_sinbang)
save('신흥동', sinheung_test_y, y_pred_sinheung)
save('아름동', areum_test_y, y_pred_areum)
save('예산군', yesan_test_y, y_pred_yesan)
save('읍내동', emnae_test_y, y_pred_emnae)
save('이원면', ewon_test_y, y_pred_ewon)
save('정림동', junglim_test_y, y_pred_junglim)
save('홍성읍', hongseong_test_y, y_pred_hongseong)

In [111]:
answer[answer.isnull().any(axis=1)]

Unnamed: 0,연도,일시,측정소,PM2.5


In [113]:
answer.to_csv('knn_cat.csv', index=False, encoding='utf-8')