In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the training and test CSV files.
# NOTE(review): these are absolute, machine-specific paths, so this cell only
# runs where this exact directory exists — consider a configurable data directory.
fog_train = pd.read_csv('/Users/yunkihun/weather-contest/fog_python/fog_train.csv')
fog_test = pd.read_csv('/Users/yunkihun/weather-contest/fog_python/fog_test.csv')

In [107]:
# Preprocessing

# Columns removed from the modelling frame. "fog_train.month" and
# "fog_train.time" are intentionally not listed here, so they stay as features.
df = fog_train.drop(columns=[
                "Unnamed: 0",
                "fog_train.year",
                "fog_train.day",
                "fog_train.minute",
                "fog_train.vis1"
                ])

# -99.00 and -99.90 are treated as missing: turn them into NaN so that the
# dropna() calls below remove those rows.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df = df.replace(-99.00, np.nan).replace(-99.90, np.nan)

# Both replacements must be applied to the same frame; restarting from
# fog_train for the second value would leave every -99.00 in place.
fog_train_df = fog_train.replace(-99.00, np.nan).replace(-99.90, np.nan)

# Strip the "fog_train." prefix from the column names of both frames.
columns = [column.replace('fog_train.', "") for column in df.columns]
df.columns = columns

columns = [column.replace('fog_train.', "") for column in fog_train_df.columns]
fog_train_df.columns = columns

# Keep only fully observed rows.
df = df.dropna()
fog_train_df = fog_train_df.dropna()

def stn_transform(value):
    """Map a station id string to a numeric code based on its leading letter.

    Ids starting with "A", "B", "C", "D" or "E" map to 1, 2, 3, 4 and 5
    respectively. Any other string yields None.
    """
    for code, prefix in enumerate("ABCDE", start=1):
        if value.startswith(prefix):
            return code
    return None
    
# Replace each station id string with the numeric code of its first letter.
df["stn_id"] = df["stn_id"].apply(stn_transform)

# Difference between the "ta" and "ts" temperature columns.
df["temp_diff"] = df["ta"] - df["ts"]
# Change in "ta" relative to the value 5 rows earlier.
# NOTE(review): shift(5) counts rows, not hours, and runs over the whole frame
# (rows removed by dropna() and station boundaries included) — confirm that
# 5 rows really correspond to the intended 5-hour window.
df['5hr_chg'] = df['ta'] - df['ta'].shift(5)
# The first 5 rows have no value 5 rows back (NaN), so drop them.
df = df.iloc[5:]

# Displayed as the cell output.
df.shape

(2994845, 13)

In [11]:
# CM 함수
def csi_index(cm):
    """Compute the Critical Success Index (CSI) from a confusion matrix.

    The matrix is indexed as ``cm[true][predicted]``. The last class (last
    row / last column) is treated as the negative class; all other classes
    are event classes. With

        H = correctly predicted event classes (diagonal without the last cell)
        F = every other cell whose predicted class is an event class
        M = event rows that were predicted as the last class

    the score is ``H / (H + F + M)``. H + F + M is every cell except the
    bottom-right one, so the denominator is computed as
    ``total - cm[-1][-1]``. For a 4x4 matrix this is exactly the original
    hard-coded formula.

    Args:
        cm: Square confusion matrix (nested sequence or 2-D array) with at
            least two classes.

    Returns:
        The CSI as a float, or NaN when the denominator is zero (no cell
        outside the bottom-right one is populated).

    Raises:
        ValueError: If ``cm`` is not square or has fewer than two classes.
    """
    n_classes = len(cm)
    if n_classes < 2 or any(len(row) != n_classes for row in cm):
        raise ValueError("cm must be a square matrix with at least 2 classes")

    last = n_classes - 1
    hits = sum(cm[i][i] for i in range(last))
    total = sum(sum(row) for row in cm)
    denominator = total - cm[last][last]

    # Nothing but correct negatives: the score is undefined, not an error.
    if denominator == 0:
        return float("nan")
    return hits / denominator

In [4]:
# random forest 함수

def random_forest(df, test_size=0.25, n_estimators=100, random_state=42, n_jobs=None):
    """Fit a random forest on ``df`` and print its hold-out confusion matrix and CSI.

    Every column except ``"class"`` is used as a feature (in the sorted order
    returned by ``Index.difference``); ``"class"`` is the target. The data is
    split with stratification on the target, the model is fitted on the
    training part, and the confusion matrix plus ``csi_index`` of the held-out
    predictions are printed.

    Args:
        df: DataFrame containing the feature columns and a ``"class"`` column.
        test_size: Fraction of rows held out for evaluation.
        n_estimators: Number of trees in the forest.
        random_state: Seed used for both the split and the forest.
        n_jobs: Passed to ``RandomForestClassifier``; ``None`` keeps the
            library default.

    Returns:
        None. Results are printed.
    """
    target = "class"
    feature_columns = list(df.columns.difference([target]))
    X = df[feature_columns]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=random_state
    )

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs
    )
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(csi_index(cm))

In [7]:
# 분산 팽창 지수

def calculate_vif(df):
    """Return a two-column frame with the VIF of every column of ``df``.

    Each column of ``df`` is scored with ``variance_inflation_factor`` against
    all the other columns of ``df``; the result has one row per column with the
    column name in ``"feature"`` and its score in ``"VIF"``.
    """
    design_matrix = df.values
    scores = [
        variance_inflation_factor(design_matrix, position)
        for position, _ in enumerate(df.columns)
    ]
    return pd.DataFrame({"feature": df.columns, "VIF": scores})

# 시간/지표 기온차 변수 추가

In [5]:
df.corr()

Unnamed: 0,stn_id,ws10_deg,ws10_ms,ta,re,hm,sun10,ts,class
stn_id,1.0,-0.061468,0.254918,0.027946,-0.023674,0.023762,0.004652,0.02872,-0.00853
ws10_deg,-0.061468,1.0,0.200242,-0.068713,-0.047907,-0.225108,0.08637,-0.044935,0.022142
ws10_ms,0.254918,0.200242,1.0,0.011221,0.007804,-0.292699,0.218815,0.073004,0.047796
ta,0.027946,-0.068713,0.011221,1.0,0.048411,0.142129,0.272359,0.915636,0.031992
re,-0.023674,-0.047907,0.007804,0.048411,1.0,0.263228,-0.124162,0.016516,-0.041112
hm,0.023762,-0.225108,-0.292699,0.142129,0.263228,1.0,-0.430115,0.007288,-0.120866
sun10,0.004652,0.08637,0.218815,0.272359,-0.124162,-0.430115,1.0,0.550451,0.051728
ts,0.02872,-0.044935,0.073004,0.915636,0.016516,0.007288,0.550451,1.0,0.030398
class,-0.00853,0.022142,0.047796,0.031992,-0.041112,-0.120866,0.051728,0.030398,1.0


In [8]:
calculate_vif(df)

Unnamed: 0,feature,VIF
0,stn_id,3.993145
1,ws10_deg,4.320963
2,ws10_ms,3.180609
3,ta,36.428766
4,re,1.150779
5,hm,15.931263
6,sun10,4.78327
7,ts,44.426062
8,class,25.366874


In [97]:
# https://www.notion.so/734ee170e8d94533a1707405cfe9f8d2
# It looks worth trying the air/ground temperature difference and the 5-hour temperature change as features.
# Air/ground temperature difference
df["temp_diff"] = df["ta"] - df["ts"]

In [98]:
# 5-hour temperature difference (current temperature - temperature 5 hours earlier)
# NOTE(review): shift(5) looks back 5 rows, not 5 hours — confirm that the row
# spacing makes these equivalent.
df['5hr_chg'] = df['ta'] - df['ta'].shift(5)

In [99]:
# The first 5 rows have no earlier value to compare against, so drop them.
# NOTE(review): not idempotent — every re-run of this cell removes 5 more rows.
df = df.iloc[5:]

In [100]:
df

Unnamed: 0,month,stn_id,ws10_deg,ws10_ms,ta,re,hm,sun10,ts,class,temp_diff,5hr_chg
5,1,1,43.1,1.7,-5.9,0.0,40.1,0.0,-2.4,4.0,-3.5,0.5
6,1,1,36.4,1.0,-5.9,0.0,40.0,0.0,-2.5,4.0,-3.4,0.4
7,1,1,44.3,0.6,-5.8,0.0,40.7,0.0,-2.5,4.0,-3.3,0.5
8,1,1,25.0,0.1,-5.8,0.0,41.7,0.0,-2.5,4.0,-3.3,0.4
9,1,1,29.8,0.1,-5.7,0.0,41.4,0.0,-2.5,4.0,-3.2,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...
3156454,12,5,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,4.0,3.4,0.2
3156455,12,5,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,4.0,3.6,0.0
3156456,12,5,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,4.0,3.6,-0.1
3156457,12,5,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,4.0,3.7,-0.2


In [38]:
df.corr()

Unnamed: 0,stn_id,ws10_deg,ws10_ms,ta,re,hm,sun10,ts,class,temp_diff,5hr_chg
stn_id,1.0,-0.061471,0.254917,0.027943,-0.023675,0.02376,0.004651,0.028718,-0.00853,-0.014924,5.5e-05
ws10_deg,-0.061471,1.0,0.20024,-0.068719,-0.047907,-0.225112,0.086369,-0.04494,0.022142,-0.024904,-0.004358
ws10_ms,0.254917,0.20024,1.0,0.011217,0.007803,-0.292702,0.218814,0.073001,0.047797,-0.15343,-0.009955
ta,0.027943,-0.068719,0.011217,1.0,0.04841,0.142125,0.272358,0.915636,0.031992,-0.265205,0.047866
re,-0.023675,-0.047907,0.007803,0.04841,1.0,0.263228,-0.124163,0.016515,-0.041112,0.053864,-0.067984
hm,0.02376,-0.225112,-0.292702,0.142125,0.263228,1.0,-0.430117,0.007285,-0.120866,0.256956,-0.086446
sun10,0.004651,0.086369,0.218814,0.272358,-0.124163,-0.430117,1.0,0.550451,0.051728,-0.794329,0.521658
ts,0.028718,-0.04494,0.073001,0.915636,0.016515,0.007285,0.550451,1.0,0.030398,-0.630444,0.192618
class,-0.00853,0.022142,0.047797,0.031992,-0.041112,-0.120866,0.051728,0.030398,1.0,-0.011135,0.01415
temp_diff,-0.014924,-0.024904,-0.15343,-0.265205,0.053864,0.256956,-0.794329,-0.630444,-0.011135,1.0,-0.369558


In [39]:
# temp_diff is defined as ta - ts, so these three columns are exactly collinear;
# that is why their VIF values come out as inf in the output of this cell.
calculate_vif(df)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,feature,VIF
0,stn_id,3.995046
1,ws10_deg,4.322989
2,ws10_ms,3.220996
3,ta,inf
4,re,1.151679
5,hm,16.579749
6,sun10,6.134028
7,ts,inf
8,class,26.151893
9,temp_diff,inf


In [40]:
random_forest(df)

[[   657    208     10   1076]
 [   159    822    116   1897]
 [    13    178    336   2458]
 [    31    121    127 740503]]
0.22109879400657814


# 계절 / 월 추가

In [94]:
# 안개도 계절에 영향을 많이 받는 것 같으니, 계절 변수를 추가해보자

def season(df):
    """Add an integer ``season`` column to ``df`` based on its ``month`` column.

    The frame is modified in place and nothing is returned.

    Mapping:
        12, 1, 2  -> 1
        3, 4, 5   -> 2
        6, 7, 8   -> 3
        9, 10, 11 -> 4
        any other value -> 0

    Args:
        df: DataFrame with a numeric ``month`` column.
    """
    month = df['month']
    conditions = [
        (month == 12) | (month == 1) | (month == 2),
        (month == 3) | (month == 4) | (month == 5),
        (month == 6) | (month == 7) | (month == 8),
        (month == 9) | (month == 10) | (month == 11)
    ]

    choices = [1, 2, 3, 4]

    # The fallback must be an integer like the choices: a string default makes
    # np.select promote the whole column to strings (or fail outright on NumPy
    # versions that refuse to mix integer and string dtypes).
    df['season'] = np.select(conditions, choices, default=0)

In [101]:
season(df)

In [102]:
# Remove the raw month column so that this run uses only the derived season feature.
df = df.drop(columns=["month"])

In [103]:
random_forest(df)

[[   723    237      9    982]
 [   175    889    124   1806]
 [    18    183    350   2434]
 [    35    125    131 740491]]
0.23865709767668167


In [105]:
# This time, skip the season feature and run with the raw month data instead.
df

Unnamed: 0,month,stn_id,ws10_deg,ws10_ms,ta,re,hm,sun10,ts,class,temp_diff,5hr_chg
5,1,1,43.1,1.7,-5.9,0.0,40.1,0.0,-2.4,4.0,-3.5,0.5
6,1,1,36.4,1.0,-5.9,0.0,40.0,0.0,-2.5,4.0,-3.4,0.4
7,1,1,44.3,0.6,-5.8,0.0,40.7,0.0,-2.5,4.0,-3.3,0.5
8,1,1,25.0,0.1,-5.8,0.0,41.7,0.0,-2.5,4.0,-3.3,0.4
9,1,1,29.8,0.1,-5.7,0.0,41.4,0.0,-2.5,4.0,-3.2,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...
3156454,12,5,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,4.0,3.4,0.2
3156455,12,5,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,4.0,3.6,0.0
3156456,12,5,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,4.0,3.6,-0.1
3156457,12,5,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,4.0,3.7,-0.2


In [106]:
random_forest(df)

[[   791    240     14    906]
 [   183    958    138   1715]
 [    17    207    380   2381]
 [    35    119    144 740484]]
0.258750607681089


In [110]:
# This time, bring the time column back as a feature as well.
df

Unnamed: 0,month,time,stn_id,ws10_deg,ws10_ms,ta,re,hm,sun10,ts,class,temp_diff,5hr_chg
5,1,1,1,43.1,1.7,-5.9,0.0,40.1,0.0,-2.4,4.0,-3.5,0.5
6,1,1,1,36.4,1.0,-5.9,0.0,40.0,0.0,-2.5,4.0,-3.4,0.4
7,1,1,1,44.3,0.6,-5.8,0.0,40.7,0.0,-2.5,4.0,-3.3,0.5
8,1,1,1,25.0,0.1,-5.8,0.0,41.7,0.0,-2.5,4.0,-3.3,0.4
9,1,1,1,29.8,0.1,-5.7,0.0,41.4,0.0,-2.5,4.0,-3.2,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156454,12,23,5,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,4.0,3.4,0.2
3156455,12,23,5,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,4.0,3.6,0.0
3156456,12,23,5,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,4.0,3.6,-0.1
3156457,12,23,5,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,4.0,3.7,-0.2


In [109]:
random_forest(df)

[[   832    260     15    844]
 [   189   1036    131   1638]
 [    15    220    405   2345]
 [    18    100    158 740506]]
0.2769924445527663
