In [16]:
import pandas as pd
import numpy as np
import os, sys, time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [2]:
# Load the raw observation table (EUC-KR encoded CSV, first row is the header).
# The three code-like columns are run through `str` so their values stay text.
df = pd.read_csv(
    'FZ_Data/FZ_all.csv',
    header=0,
    encoding='euc-kr',
    converters=dict.fromkeys(
        ['지점', '지면상태(지면상태코드)', '현상번호(국내식)'],
        str,
    ),
)

In [3]:
# Columns retained for the analysis (original Korean headers; the trailing
# comment on each line is an English gloss of the header text).
interest_columns = [
    '지점',  # station id
    '지점명',  # station name
    '일시',  # observation date/time
    '기온(°C)',  # air temperature
    '강수량(mm)',  # precipitation
    '풍속(m/s)',  # wind speed
    '풍향(16방위)',  # wind direction (16-point compass)
    '습도(%)',  # humidity
    '증기압(hPa)',  # vapour pressure
    '이슬점온도(°C)',  # dew-point temperature
    '현지기압(hPa)',  # station pressure
    '해면기압(hPa)',  # sea-level pressure
    '일조(hr)',  # sunshine duration
    '일사(MJ/m2)',  # solar radiation
    '적설(cm)',  # snow depth
    '3시간신적설(cm)',  # new snow over 3 hours
    '전운량(10분위)',  # total cloud amount (tenths)
    '중하층운량(10분위)',  # mid/low-level cloud amount (tenths)
    '운형(운형약어)',  # cloud type (abbreviation)
    '최저운고(100m )',  # lowest cloud height (units of 100 m)
    '시정(10m)',  # visibility (units of 10 m)
    '지면상태(지면상태코드)',  # ground state (code)
    '현상번호(국내식)',  # phenomenon number (domestic scheme)
    '지면온도(°C)',  # ground surface temperature
    'FZ_flag',  # label column
]

# Keep only those columns, in the order listed above.
df = df[interest_columns]

In [4]:
# Map the Korean headers to short English identifiers. `rename` returns a new
# frame that is re-bound to `df` (no inplace mutation), so the cell reads as a
# plain transformation.
# NOTE: a few target names are looser than the source header text:
#   'Moist' is humidity (%), 'hPa' is vapour pressure, 'CurhPa' is station
#   pressure, 'Daylight' is sunshine duration (hr) and 'DaylightMJ' is solar
#   radiation (MJ/m2). The names are kept as-is because later cells use them.
df = df.rename(columns={
    '지점': 'Office',
    '지점명': 'Office_Name',
    '일시': 'Date',
    '기온(°C)': 'Temp',
    '강수량(mm)': 'Rain',
    '풍속(m/s)': 'WindSpeed',
    '풍향(16방위)': 'WindDir',
    '습도(%)': 'Moist',
    '증기압(hPa)': 'hPa',
    '이슬점온도(°C)': 'DewTemp',
    '현지기압(hPa)': 'CurhPa',
    '해면기압(hPa)': 'SeahPa',
    '일조(hr)': 'Daylight',
    '일사(MJ/m2)': 'DaylightMJ',
    '적설(cm)': 'SnowCm',
    '3시간신적설(cm)': 'Snow3hr',
    '전운량(10분위)': 'Clouds_10',
    '중하층운량(10분위)': 'MClouds_10',
    '운형(운형약어)': 'CloudDesigns_Abb',
    '최저운고(100m )': 'HClouds_100m',
    '시정(10m)': 'Visibility_10m',
    '지면상태(지면상태코드)': 'GroundState_Code',
    '현상번호(국내식)': 'PhenomenaNo',
    '지면온도(°C)': 'SurfaceTemp',
})

In [5]:
# Keep only the rows whose station name is '서울' (Seoul).
df_seoul = df.loc[df['Office_Name'].eq('서울')]

In [6]:
# Every candidate predictor column, using the renamed English labels.
feature_cols = [
    'Date',
    'Temp', 'Rain', 'WindSpeed', 'WindDir', 'Moist',
    'hPa', 'DewTemp', 'CurhPa', 'SeahPa',
    'Daylight', 'DaylightMJ',
    'SnowCm', 'Snow3hr',
    'Clouds_10', 'MClouds_10', 'CloudDesigns_Abb',
    'HClouds_100m', 'Visibility_10m',
    'GroundState_Code', 'PhenomenaNo',
    'SurfaceTemp',
]

# Numeric-only subset: the same columns in the same order, minus the
# timestamp and the three code/abbreviation columns.
feature_cols_numeric_only = [
    col for col in feature_cols
    if col not in ('Date', 'CloudDesigns_Abb', 'GroundState_Code', 'PhenomenaNo')
]

In [7]:
# Predictors: the numeric feature columns of the Seoul subset.
X = df_seoul[feature_cols_numeric_only]
# Target: the FZ_flag column as a plain NumPy array.
y = df_seoul.loc[:, 'FZ_flag'].values

In [47]:
# Sanity check: (rows, columns) of the Seoul subset.
df_seoul.shape

(7248, 25)

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range.
# One fit_transform call both learns the per-column min/max and applies them;
# the earlier version called it twice and discarded the first result.
# Note that X is re-bound from a DataFrame to the scaled NumPy array.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

In [81]:
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Configure the oversampler. With a float sampling_strategy, SMOTE synthesises
# minority-class samples until minority/majority reaches that ratio (0.01).
smote = SMOTE(random_state=1234, sampling_strategy=0.01)

# Oversample the data. `fit_resample` replaces `fit_sample`, which has been
# removed from imbalanced-learn. `y` is passed as the NumPy array it already is
# (not `list(y)`): recent releases return the same container type they were
# given, and the `.shape` / `== 1` calls below need an ndarray.
X_resampled, y_resampled = smote.fit_resample(X, y)

print('After OverSampling, the shape of train_X: {}'.format(X_resampled.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_resampled.shape))

# Class balance after resampling.
print("After OverSampling, counts of label '1': {}".format((y_resampled == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_resampled == 0).sum()))

After OverSampling, the shape of train_X: (7313, 18)
After OverSampling, the shape of train_y: (7313,) 

After OverSampling, counts of label '1': 72
After OverSampling, counts of label '0': 7241


In [82]:
# Sanity check: the resampled features and labels must have the same row count.
X_resampled.shape, y_resampled.shape

((7313, 18), (7313,))

In [83]:
# Wrap the resampled arrays back into labelled DataFrames.
# The column labels come from `feature_cols_numeric_only`, the same list used
# to build X, rather than a hand-copied duplicate that could drift out of sync
# with the actual column order.
df_X_resampled = pd.DataFrame(data=X_resampled, columns=feature_cols_numeric_only)
df_y_resampled = pd.DataFrame(data=y_resampled, columns=['FZ_flag'])


In [84]:
# Sanity check: feature frame and label frame shapes before concatenation.
df_X_resampled.shape, df_y_resampled.shape

((7313, 18), (7313, 1))

In [85]:
# Place the label column next to the features so the formula API can see both.
xx = pd.concat((df_X_resampled, df_y_resampled), axis='columns')

In [86]:
# Sanity check: (rows, columns) of the combined features + label frame.
xx.shape

(7313, 19)

In [87]:
# Preview the first 10 rows of the combined frame.
xx.head(10)

Unnamed: 0,Temp,Rain,WindSpeed,WindDir,Moist,hPa,DewTemp,CurhPa,SeahPa,Daylight,DaylightMJ,SnowCm,Snow3hr,Clouds_10,MClouds_10,HClouds_100m,Visibility_10m,SurfaceTemp,FZ_flag
0,0.471429,0.0,0.0,0.0,0.655172,0.248408,0.564593,0.742459,0.734234,0.0,0.0,0.0,0.0,0.5,0.0,0.076923,0.397993,0.157068,0
1,0.485714,0.0,0.240741,0.75,0.643678,0.248408,0.564593,0.742459,0.734234,0.0,0.0,0.0,0.0,0.5,0.0,0.076923,0.397993,0.157068,0
2,0.485714,0.0,0.074074,0.888889,0.712644,0.248408,0.564593,0.742459,0.734234,0.0,0.0,0.0,0.0,0.5,0.0,0.076923,0.397993,0.157068,0
3,0.482857,0.0,0.157407,0.805556,0.735632,0.292994,0.605263,0.723898,0.713964,0.0,0.0,0.0,0.0,0.6,0.6,0.230769,0.230769,0.319372,0
4,0.482857,0.0,0.12963,0.75,0.747126,0.292994,0.605263,0.723898,0.713964,0.0,0.0,0.0,0.0,0.6,0.6,0.230769,0.230769,0.319372,0
5,0.485714,0.0,0.175926,0.805556,0.770115,0.292994,0.605263,0.723898,0.713964,0.0,0.0,0.0,0.0,0.6,0.6,0.230769,0.230769,0.319372,0
6,0.471429,0.0,0.194444,0.75,0.770115,0.292994,0.610048,0.696056,0.689189,0.0,0.0,0.0,0.0,0.8,0.8,0.230769,0.197324,0.319372,0
7,0.471429,0.0,0.138889,0.75,0.758621,0.292994,0.610048,0.696056,0.689189,0.0,0.0,0.0,0.0,0.8,0.8,0.230769,0.197324,0.319372,0
8,0.465714,0.0,0.175926,0.75,0.781609,0.292994,0.610048,0.696056,0.689189,0.0,0.0,0.0,0.0,0.8,0.8,0.230769,0.197324,0.319372,0
9,0.465714,0.0,0.240741,0.694444,0.781609,0.292994,0.610048,0.700696,0.691441,0.0,0.0,0.0,0.0,1.0,0.5,0.230769,0.197324,0.329843,0


In [88]:
def build_formula(x_list, y):
    """Build a formula string of the form ``'y ~ x1 + x2 + ...'``.

    Parameters
    ----------
    x_list : iterable of str, or str
        Names of the right-hand-side terms. A single name may be passed as a
        bare string. An empty iterable produces the intercept-only
        right-hand side ``'1'``.
    y : str
        Name of the left-hand-side (response) term.

    Returns
    -------
    str
        The assembled formula, e.g. ``'FZ_flag ~ Temp + Rain'``.
    """
    # A bare string is iterable character by character; without this guard
    # 'Temp' would become 'T + e + m + p'.
    if isinstance(x_list, str):
        x_list = [x_list]
    # An empty right-hand side ('y ~ ') is not a usable formula; fall back to
    # the intercept-only term instead.
    x_str = ' + '.join(x_list) or '1'
    return '{} ~ {}'.format(y, x_str)

In [89]:
# First candidate formula, including the snow columns (SnowCm, Snow3hr).
# NOTE(review): this value is dead — the next cell re-assigns fm_str with the
# same predictors minus the two snow columns before anything reads it.
# Consider deleting this cell or giving the two formulas distinct names.
fm_str = build_formula(['Temp', 'Rain', 'WindSpeed', 
                        'Moist', 'hPa', 'DewTemp', 'CurhPa', 
                        'SnowCm', 'Snow3hr', 'Clouds_10', 'MClouds_10',
                        'SurfaceTemp'], 'FZ_flag')

In [90]:
# Formula actually used for the model: FZ_flag regressed on ten weather
# predictors (the snow columns SnowCm / Snow3hr are not included).
fm_str = build_formula(
    x_list=[
        'Temp', 'Rain', 'WindSpeed',
        'Moist', 'hPa', 'DewTemp', 'CurhPa',
        'Clouds_10', 'MClouds_10',
        'SurfaceTemp',
    ],
    y='FZ_flag',
)

In [91]:
# Display the assembled formula string.
fm_str

'FZ_flag ~ Temp + Rain + WindSpeed + Moist + hPa + DewTemp + CurhPa + Clouds_10 + MClouds_10 + SurfaceTemp'

In [92]:
# Specify a logistic regression from the formula, using the combined
# features + label frame as the data source.
model = sm.Logit.from_formula(formula=fm_str, data=xx)

In [93]:
# Fit the logit model by maximum likelihood, allowing up to 100 iterations.
# NOTE(review): the output recorded for this cell shows
# "Current function value: inf" followed by "LinAlgError: Singular matrix",
# i.e. this call did not complete in the saved run. The summaries printed in
# the following cells report 9413 observations, while `xx` has 7313 rows, so
# they appear to come from a different (earlier) fit — re-run from a fresh
# kernel to confirm.
# Possible causes to check (unverified): (quasi-)perfect separation of the
# classes or collinear predictors. A penalised fit such as
# `model.fit_regularized(...)` is one option to evaluate — TODO confirm.
result = model.fit(maxiter=100)

         Current function value: inf
         Iterations: 100


  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


LinAlgError: Singular matrix

In [94]:
# Print the standard statsmodels summary table for the fitted model.
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                FZ_flag   No. Observations:                 9413
Model:                          Logit   Df Residuals:                     9402
Method:                           MLE   Df Model:                           10
Date:                Wed, 12 Feb 2020   Pseudo R-squ.:                  0.9614
Time:                        16:21:04   Log-Likelihood:                -196.44
converged:                      False   LL-Null:                       -5084.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept    -317.0897   8109.071     -0.039      0.969   -1.62e+04    1.56e+04
Temp          111.3233     10.961     10.157      0.000      89.841     132.806
Rain           19.8241      3.077      6.443    

In [95]:
# Print the alternative summary layout (summary2) for the fitted model.
print(result.summary2())

                           Results: Logit
Model:                Logit              Pseudo R-squared:   0.961   
Dependent Variable:   FZ_flag            AIC:                414.8794
Date:                 2020-02-12 16:21   BIC:                493.5277
No. Observations:     9413               Log-Likelihood:     -196.44 
Df Model:             10                 LL-Null:            -5084.7 
Df Residuals:         9402               LLR p-value:        0.0000  
Converged:            0.0000             Scale:              1.0000  
No. Iterations:       100.0000                                       
---------------------------------------------------------------------
              Coef.    Std.Err.    z    P>|z|     [0.025     0.975]  
---------------------------------------------------------------------
Intercept   -317.0897 8109.0714 -0.0391 0.9688 -16210.5776 15576.3982
Temp         111.3233   10.9606 10.1567 0.0000     89.8409   132.8057
Rain          19.8241    3.0770  6.4426 0.0000  