## 데이터 전처리

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data loading function
def load_data():
    """Read the three competition CSV files from the working directory.

    Returns:
        A ``(df_train, df_test, df_sub)`` tuple of DataFrames read from
        ``train.csv``, ``test.csv`` and ``sample_submission.csv``.
    """
    file_names = ('train.csv', 'test.csv', 'sample_submission.csv')
    df_train, df_test, df_sub = (pd.read_csv(name) for name in file_names)
    return df_train, df_test, df_sub

In [9]:
# Data preprocessing function
def preprocess_data(df_train, df_test):
    """Combine the train and test frames and derive scheduled-hour features.

    The ``Delay`` target is encoded as ``Not_Delayed`` -> 0 and
    ``Delayed`` -> 1; any other value (including missing) becomes NaN and
    the corresponding training row is dropped. The labelled training rows
    and all test rows are then concatenated (train first, original index
    labels kept), five columns are dropped, and two hour columns are added.

    Neither input frame is modified.

    Args:
        df_train: Training frame with a ``Delay`` column holding the string
            labels ``'Not_Delayed'`` / ``'Delayed'`` (or missing).
        df_test: Test frame with the same feature columns.

    Returns:
        The combined DataFrame. Test rows have NaN in ``Delay``.
        ``Estimated_Departure_Time_Hour`` and ``Estimated_Arrival_Time_Hour``
        are floats holding the time value floor-divided by 100 (NaN where
        the time is missing).
    """
    # Encode the target on a new frame instead of calling
    # `df_train['Delay'].replace(..., inplace=True)`: that chained in-place
    # form mutates the caller's data on older pandas and silently does
    # nothing under pandas Copy-on-Write.
    delay = df_train['Delay'].map({'Not_Delayed': 0, 'Delayed': 1})
    df_train = df_train.assign(Delay=delay)

    # Keep only the training rows that have a label
    df_train = df_train[df_train['Delay'].notnull()]

    # Preprocess train and test together
    df = pd.concat([df_train, df_test])

    # Drop features that are not used
    df = df.drop(['Cancelled', 'Diverted', 'Origin_Airport_ID',
                  'Destination_Airport_ID', 'Carrier_ID(DOT)'], axis=1)

    # Departure / arrival hour: the time value with its last two digits
    # removed, computed numerically so that missing times stay NaN without
    # a round trip through strings.
    for source in ('Estimated_Departure_Time', 'Estimated_Arrival_Time'):
        df[source + '_Hour'] = (pd.to_numeric(df[source]) // 100).astype(float)

    return df

In [10]:
# Per-column delay-rate features (derived columns) + label encoding

# (grouping column, name of the derived delay-ratio column), in merge order
_DELAY_RATIO_FEATURES = (
    ('Month', 'month_delay_ratio'),
    ('Day_of_Month', 'day_delay_ratio'),
    ('Estimated_Departure_Time_Hour', 'depart_hour_ratio'),
    ('Estimated_Arrival_Time_Hour', 'arrival_hour_ratio'),
    ('Origin_Airport', 'origin_airport_delay_ratio'),
    ('Origin_State', 'origin_state_delay_ratio'),
    ('Destination_Airport', 'dest_airport_delay_ratio'),
    ('Destination_State', 'dest_state_delay_ratio'),
    ('Airline', 'airline_delay_ratio'),
    ('Carrier_Code(IATA)', 'carrier_delay_ratio'),
    ('Tail_Number', 'tail_delay_ratio'),
)

# Categorical columns replaced by integer codes after the ratios are merged
_LABEL_ENCODED_COLUMNS = (
    'Origin_Airport', 'Origin_State', 'Destination_Airport',
    'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number',
)


def df_merge_agg(df):
    """Add one delay-ratio column per grouping column and label-encode categoricals.

    For every ``(column, ratio_name)`` pair in ``_DELAY_RATIO_FEATURES`` the
    mean of ``Delay`` over the rows of each group that have a label is
    computed and left-merged back onto ``df`` as ``ratio_name``. The
    denominator is the number of labelled rows only; rows with a missing
    ``Delay`` do not dilute the ratio. Groups without any labelled row, and
    rows whose grouping value is missing, get NaN.

    Afterwards every column in ``_LABEL_ENCODED_COLUMNS`` is replaced by its
    ``pd.factorize`` codes (missing values become -1).

    Args:
        df: Frame containing a numeric ``Delay`` column (NaN where the label
            is unknown) and all grouping columns listed above.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the original columns and
        the eleven ratio columns appended in table order. ``df`` itself is
        not modified.

    NOTE(review): the ratios are derived from the target of every labelled
    row, so any validation split taken from this frame afterwards has
    already contributed to its own features. Confirm whether that is
    acceptable for the evaluation being done.
    """
    # Make sure the target is numeric so the group mean is well defined.
    df = df.assign(Delay=pd.to_numeric(df['Delay']).to_numpy())

    # Compute every ratio table from the un-merged frame first.
    ratio_tables = [
        (key, df.groupby(key)['Delay'].mean().rename(ratio_name).reset_index())
        for key, ratio_name in _DELAY_RATIO_FEATURES
    ]

    # Left merges keep every row of df and its row order.
    for key, ratio_table in ratio_tables:
        df = df.merge(ratio_table, on=key, how='left')

    # Label-encode the categorical columns
    for column in _LABEL_ENCODED_COLUMNS:
        df[column] = pd.factorize(df[column])[0]

    return df

In [15]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from imblearn.over_sampling import RandomOverSampler

# Training / evaluation function
def lgbm_fit_eval(df):
    """Fit a LightGBM classifier on ``df`` with a 30% hold-out for early stopping.

    ``ID`` and ``Delay`` are removed from the features and ``Delay`` is the
    target. The data is split 70/30 (stratified on the target) and random
    over-sampling is applied to the training part only, so the hold-out set
    contains no duplicates of training rows and keeps the original class
    balance.

    Args:
        df: Labelled frame containing ``ID``, ``Delay`` and the feature
            columns.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    X = df.drop(['ID', 'Delay'], axis=1)
    y = df['Delay']

    # Split first: over-sampling before the split would copy the same
    # minority rows into both sides and bias the validation loss that
    # early stopping relies on.
    train_X, valid_X, train_y, valid_y = train_test_split(
        X, y, test_size=0.3, random_state=2020, stratify=y)

    # Over-sample the training part only
    train_X, train_y = RandomOverSampler(random_state=2020).fit_resample(train_X, train_y)

    # NOTE(review): `silent` and the `verbose` / `early_stopping_rounds`
    # fit() keywords are accepted by LightGBM 3.x; confirm the installed
    # version before upgrading.
    lgbm = LGBMClassifier(n_jobs=-1, n_estimators=4000,
                          learning_rate=0.02, silent=-1, verbose=-1,
                          max_depth=13, max_bin=41, min_child_weight=3,
                          num_leaves=60, reg_alpha=6.532, reg_lambda=2.818)

    lgbm.fit(train_X, train_y, eval_set=[(train_X, train_y), (valid_X, valid_y)],
             eval_metric='logloss', verbose=100, early_stopping_rounds=200)
    return lgbm

In [11]:
# Load the three CSV frames and display the train / test shapes.
df_train, df_test, df_sub = load_data()
df_train.shape, df_test.shape

((1000000, 19), (1000000, 18))

In [12]:
# Build the combined (train + test) preprocessed frame and display its shape.
df = preprocess_data(df_train, df_test)
df.shape

(1255001, 16)

In [13]:
# Add the delay-ratio columns and label-encode the categorical columns.
df = df_merge_agg(df)
df.shape

(1255001, 27)

In [14]:
# Split back into train and test: rows without a Delay value are the test
# rows, rows with a Delay value are the training rows.
df_test = df[df['Delay'].isnull()]
df_train = df[df['Delay'].notnull()]

df_test.shape, df_train.shape

((1000000, 27), (255001, 27))

## OOF 

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from imblearn.over_sampling import RandomOverSampler

def train_oof(df_train, df_test, nfolds=5) :
    """Train LightGBM over K folds and average the test-set probabilities.

    ``ID`` and ``Delay`` are removed from the features of both frames. The
    labelled rows are split with a shuffled ``KFold``; inside every fold the
    training part is randomly over-sampled and the untouched validation part
    is used for early stopping. Each fold's class probabilities for
    ``df_test`` are averaged with equal weight.

    Args:
        df_train: Labelled frame with ``ID``, ``Delay`` and feature columns.
        df_test: Frame with the same columns to predict on.
        nfolds: Number of folds.

    Returns:
        ``(lgbm, test_preds)``: the classifier as fitted on the last fold and
        an array of shape ``(len(df_test), 2)`` holding the fold-averaged
        ``predict_proba`` output.
    """
    X = df_train.drop(['ID', 'Delay'], axis=1)
    y = df_train['Delay']
    # Loop-invariant: build the test feature matrix once.
    test_X = df_test.drop(['ID', 'Delay'], axis=1)

    folds = KFold(n_splits=nfolds, shuffle=True, random_state=2020)

    test_preds = np.zeros((df_test.shape[0], 2))

    lgbm = LGBMClassifier(
        n_jobs=-1, n_estimators=4000,
        learning_rate=0.02, silent=-1, verbose=-1,
        max_depth=13, max_bin=41, min_child_weight=3,
        num_leaves=60, reg_alpha=6.532, reg_lambda=2.818
    )

    # Fold on the original rows. Over-sampling before the split would put
    # copies of the same minority row into both the training and the
    # validation part of a fold and bias early stopping.
    for fold_idx, (train_idx, valid_idx) in enumerate(folds.split(X)) :
        print('##### iteration ', fold_idx, ' 시작')
        valid_X = X.iloc[valid_idx, :]
        valid_y = y.iloc[valid_idx]

        # Over-sample the training part of this fold only
        train_X, train_y = RandomOverSampler(random_state=2020).fit_resample(
            X.iloc[train_idx, :], y.iloc[train_idx])

        lgbm.fit(train_X, train_y, eval_set=[(train_X, train_y), (valid_X, valid_y)],
                eval_metric='logloss', verbose=200, early_stopping_rounds=200)

        test_preds += lgbm.predict_proba(test_X, num_iteration=lgbm.best_iteration_) / folds.n_splits

    return lgbm, test_preds

In [None]:
# Run the 5-fold training and keep the fold-averaged test probabilities.
lgbm, test_preds = train_oof(df_train, df_test, nfolds=5)

In [None]:
# Write the two probability columns of test_preds into the sample
# submission frame and save it without the index.
# NOTE(review): assumes column 0 of test_preds is the Not_Delayed class and
# column 1 the Delayed class, and that rows are in sample_submission order —
# confirm.
submit = pd.read_csv('sample_submission.csv')
submit[['Not_Delayed', 'Delayed']] = test_preds
submit.to_csv('submission_v5.csv', index=False)