In [106]:
import random
import os
import numpy as np
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
from lightgbm import LGBMClassifier

In [107]:
import warnings
warnings.filterwarnings(action='ignore')

In [108]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Fixed Seed

In [109]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [110]:
train = pd.read_csv('re_train_preprocessed.csv')
test = pd.read_csv('re_test_preprocessed.csv')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [111]:
train.describe()

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Distance,Carrier_ID(DOT)
count,1000000.0,1000000.0,890981.0,890960.0,1000000.0,988117.0
mean,6.945156,15.764842,615.812905,700.91659,784.078499,19997.529488
std,3.462506,8.763515,439.885738,438.833267,590.790469,404.266115
min,1.0,1.0,0.0,0.0,16.0,19393.0
25%,4.0,8.0,285.0,375.0,350.0,19790.0
50%,7.0,16.0,560.0,654.0,623.0,19977.0
75%,10.0,23.0,820.0,921.0,1020.0,20378.0
max,12.0,31.0,2359.0,2400.0,5095.0,21171.0


In [112]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Origin_Airport            1000000 non-null  object 
 6   Origin_State              1000000 non-null  object 
 7   Destination_Airport       1000000 non-null  object 
 8   Destination_State         1000000 non-null  object 
 9   Distance                  1000000 non-null  int64  
 10  Airline                   988117 non-null   object 
 11  Carrier_Code(IATA)        998727 non-null   object 
 12  Carrier_ID(DOT)           988117 non-null   float64
 13  Tail_Number               10

In [137]:
train.head()

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_State,Destination_Airport,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,,,252,34,HOU,Texas,419,Southwest Airlines Co.,WN,19393.0,N7858A,
1,TRAIN_000001,8,15,40.0,264.0,256,11,SLC,Utah,1250,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,TRAIN_000002,9,6,670.0,785.0,74,31,LGA,New York,544,American Airlines Inc.,AA,19805.0,N103US,
3,TRAIN_000003,7,10,245.0,575.0,195,4,EWR,New Jersey,2454,United Air Lines Inc.,UA,19977.0,N595UA,
4,TRAIN_000004,1,11,60.0,139.0,322,4,ACV,California,250,SkyWest Airlines Inc.,UA,20304.0,N161SY,


In [138]:
train['Delay'].describe()

count          255001
unique              2
top       Not_Delayed
freq           210001
Name: Delay, dtype: object

In [233]:
first_test_set = train[train['Delay'].isnull()]
first_train_set = train.dropna()

In [234]:
first_train_set.dropna()

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_State,Destination_Airport,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
6,TRAIN_000006,1,20,762.0,843.0,119,28,BOS,Massachusetts,200,United Air Lines Inc.,UA,19977.0,N66825,Not_Delayed
8,TRAIN_000008,6,13,560.0,650.0,59,18,CLT,North Carolina,361,Southwest Airlines Co.,WN,19393.0,N765SW,Not_Delayed
10,TRAIN_000010,8,13,750.0,824.0,93,47,PIT,Pennsylvania,204,Republic Airlines,AA,20452.0,N119HQ,Delayed
12,TRAIN_000012,1,12,195.0,405.0,72,33,DEN,Colorado,1201,Southwest Airlines Co.,WN,19393.0,N8696E,Not_Delayed
13,TRAIN_000013,9,19,1395.0,66.0,215,42,DEN,Colorado,563,SkyWest Airlines Inc.,UA,20304.0,N165SY,Not_Delayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999955,TRAIN_999955,4,12,310.0,356.0,320,48,PDX,Oregon,129,Horizon Air,AS,19687.0,N432QX,Delayed
999963,TRAIN_999963,5,2,779.0,866.0,204,30,DCA,Virginia,214,Republic Airlines,DL,20452.0,N871RW,Delayed
999969,TRAIN_999969,10,10,220.0,296.0,223,42,IAH,Texas,316,Mesa Airlines Inc.,UA,20378.0,N89321,Delayed
999985,TRAIN_999985,8,8,854.0,939.0,296,31,JAX,Florida,407,Frontier Airlines Inc.,F9,20436.0,N316FR,Not_Delayed


In [235]:
first_test_set['Delay'] = first_test_set['Delay'].fillna(0)
first_test_set = first_test_set.dropna()

In [236]:
first_test_X = first_test_set.drop(columns=['Delay', 'ID', 'Airline', 'Carrier_Code(IATA)'])
first_test_y = first_test_set[['Delay']]
first_train_X = first_train_set.drop(columns=['Delay', 'ID', 'Airline', 'Carrier_Code(IATA)'])
first_train_y = first_train_set[['Delay']]

In [237]:
first_train_X.head()

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_State,Destination_Airport,Destination_State,Distance,Carrier_ID(DOT),Tail_Number
6,1,20,762.0,843.0,119,28,BOS,Massachusetts,200,19977.0,N66825
8,6,13,560.0,650.0,59,18,CLT,North Carolina,361,19393.0,N765SW
10,8,13,750.0,824.0,93,47,PIT,Pennsylvania,204,20452.0,N119HQ
12,1,12,195.0,405.0,72,33,DEN,Colorado,1201,19393.0,N8696E
13,9,19,1395.0,66.0,215,42,DEN,Colorado,563,20304.0,N165SY


In [238]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(first_train_X[i])
    first_train_X[i]=le.transform(first_train_X[i])
    
    for label in np.unique(first_test_X[i]):
        print(label, i)
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    first_test_X[i]=le.transform(first_test_X[i])
print('Done.')

0 Origin_Airport
1 Origin_Airport
2 Origin_Airport
3 Origin_Airport
4 Origin_Airport
5 Origin_Airport
6 Origin_Airport
7 Origin_Airport
8 Origin_Airport
9 Origin_Airport
10 Origin_Airport
11 Origin_Airport
12 Origin_Airport
13 Origin_Airport
14 Origin_Airport
15 Origin_Airport
16 Origin_Airport
17 Origin_Airport
18 Origin_Airport
19 Origin_Airport
20 Origin_Airport
21 Origin_Airport
22 Origin_Airport
23 Origin_Airport
24 Origin_Airport
25 Origin_Airport
26 Origin_Airport
27 Origin_Airport
28 Origin_Airport
29 Origin_Airport
30 Origin_Airport
31 Origin_Airport
32 Origin_Airport
33 Origin_Airport
34 Origin_Airport
35 Origin_Airport
36 Origin_Airport
37 Origin_Airport
38 Origin_Airport
39 Origin_Airport
40 Origin_Airport
41 Origin_Airport
42 Origin_Airport
43 Origin_Airport
44 Origin_Airport
45 Origin_Airport
46 Origin_Airport
47 Origin_Airport
48 Origin_Airport
49 Origin_Airport
50 Origin_Airport
51 Origin_Airport
52 Origin_Airport
53 Origin_Airport
54 Origin_Airport
55 Origin_Airport
56

In [239]:
# 교육 데이터는 교육 및 검증 세트로 분할되고 수치 기능은 StandardScaler를 사용하여 정규화됩니다.
# 모델은 GridSearchCV와 5겹 교차 검증을 사용하여 수행되는 하이퍼파라미터 튜닝과 함께 XGBClassifier를 사용하여 훈련됩니다.
# Split the training dataset into a training set and a validation set
first_train_X, first_val_X, first_train_y, first_val_y = train_test_split(first_train_X, first_train_y, test_size=0.3, random_state=42)

# Normalize numerical features
scaler = StandardScaler()
first_train_X = scaler.fit_transform(first_train_X)
first_val_X = scaler.transform(first_val_X)
first_test_X = scaler.transform(first_test_X)


In [240]:
# Cross-validation with StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [241]:
first_train_y.loc[first_train_y['Delay']=='Delayed', 'Delay'] = False
first_train_y.loc[first_train_y['Delay']=='Not_Delayed', 'Delay'] = True
first_val_y.loc[first_val_y['Delay']=='Delayed', 'Delay'] = False
first_val_y.loc[first_val_y['Delay']=='Not_Delayed', 'Delay'] = True
first_train_y

Unnamed: 0,Delay
921378,False
995250,True
222874,True
468115,False
645592,True
...,...
598975,False
517378,True
659408,False
733109,True


In [242]:
train_x = pd.DataFrame(first_train_X)
train_y = pd.DataFrame(first_train_y)
val_x = pd.DataFrame(first_val_X)
val_y = pd.DataFrame(first_val_y)
test_x = pd.DataFrame(first_test_X)

In [251]:
train_x = train_x.astype('int')
train_y = train_y.astype('int')
val_x = val_x.astype('int')
val_y = val_y.astype('int')

In [284]:
# Model and hyperparameter tuning using GridSearchCV
# model = LGBMClassifier(n_estimators=1000, objective='binary', metric = 'binary_logloss', reg_lambda=True, max_depth=7, is_unbalance = True, learning_rate=0.01, n_jobs=-1, random_state=42)
model = LGBMClassifier(n_estimators=1000, max_depth=7, learning_rate=0.005, n_jobs=-1, objective='binary', metric='binary_logloss')

In [285]:
model.fit(train_x, train_y, eval_metric='logloss', eval_set = [(val_x, val_y)])

[1]	valid_0's binary_logloss: 0.469226
[2]	valid_0's binary_logloss: 0.46912
[3]	valid_0's binary_logloss: 0.469015
[4]	valid_0's binary_logloss: 0.46891
[5]	valid_0's binary_logloss: 0.468807
[6]	valid_0's binary_logloss: 0.468706
[7]	valid_0's binary_logloss: 0.468604
[8]	valid_0's binary_logloss: 0.468504
[9]	valid_0's binary_logloss: 0.468405
[10]	valid_0's binary_logloss: 0.468307
[11]	valid_0's binary_logloss: 0.46821
[12]	valid_0's binary_logloss: 0.468114
[13]	valid_0's binary_logloss: 0.468019
[14]	valid_0's binary_logloss: 0.467925
[15]	valid_0's binary_logloss: 0.467832
[16]	valid_0's binary_logloss: 0.46774
[17]	valid_0's binary_logloss: 0.467649
[18]	valid_0's binary_logloss: 0.467559
[19]	valid_0's binary_logloss: 0.46747
[20]	valid_0's binary_logloss: 0.467382
[21]	valid_0's binary_logloss: 0.467295
[22]	valid_0's binary_logloss: 0.467208
[23]	valid_0's binary_logloss: 0.467124
[24]	valid_0's binary_logloss: 0.467039
[25]	valid_0's binary_logloss: 0.466955
[26]	valid_0's

LGBMClassifier(learning_rate=0.005, max_depth=7, metric='binary_logloss',
               n_estimators=1000, objective='binary')

In [333]:
train = pd.read_csv('re_train_preprocessed.csv')
test = pd.read_csv('re_test_preprocessed.csv')

In [334]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [335]:
train = train.drop(columns=['ID','Airline','Carrier_Code(IATA)'])
test = test.drop(columns=['ID','Airline','Carrier_Code(IATA)'])

In [336]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [337]:
y_pred = model.predict_proba(test)

In [338]:
y_pred

array([[0.09887419, 0.90112581],
       [0.09887419, 0.90112581],
       [0.09887419, 0.90112581],
       ...,
       [0.09887419, 0.90112581],
       [0.09887419, 0.90112581],
       [0.09887419, 0.90112581]])

In [340]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_2.csv', index=True)