In [5]:
!unzip '/content/월간항공 지연 예측.zip'

Archive:  /content/월간항공 지연 예측.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [6]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [7]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

메모리 사용량을 줄임
  - parquet

In [8]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')
csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')    

train Done.
test Done.


In [9]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [10]:
train.head()

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,


결측치가 많이 보임
  - 범주형 데이터는 최빈값(가장 많이 등장하는 값)으로  train[컬럼].mode()
  


In [11]:
train = train.drop(columns=['ID'])
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 18 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Month                     1000000 non-null  int64  
 1   Day_of_Month              1000000 non-null  int64  
 2   Estimated_Departure_Time  890981 non-null   float64
 3   Estimated_Arrival_Time    890960 non-null   float64
 4   Cancelled                 1000000 non-null  int64  
 5   Diverted                  1000000 non-null  int64  
 6   Origin_Airport            1000000 non-null  object 
 7   Origin_Airport_ID         1000000 non-null  int64  
 8   Origin_State              890985 non-null   object 
 9   Destination_Airport       1000000 non-null  object 
 10  Destination_Airport_ID    1000000 non-null  int64  
 11  Destination_State         890921 non-null   object 
 12  Distance                  1000000 non-null  float64
 13  Airline                   89

데이터 프레임에서 object 컬럼만 추출하기

In [12]:
object_columns = train.select_dtypes(include='object').columns
object_columns

Index(['Origin_Airport', 'Origin_State', 'Destination_Airport',
       'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number',
       'Delay'],
      dtype='object')

object 컬럼 전처리
  - 최빈값으로 대처하기

In [13]:
train[object_columns] = train[object_columns].fillna(train[object_columns].mode().iloc[0])

In [14]:
train.isnull().mean()[train.isnull().sum() > 0]

Estimated_Departure_Time    0.109019
Estimated_Arrival_Time      0.109040
Carrier_ID(DOT)             0.108997
dtype: float64

결측치가 10% 정도


In [15]:
train = train.dropna()

null 인 항목이 한개 이상인 Row에 대해서 출력한다

In [16]:
train[train.isnull().any(axis=1)]

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay


제출파일

In [17]:
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)
sample_submission.columns

Index(['Not_Delayed', 'Delayed'], dtype='object')

In [18]:
column_number = dict(zip(sample_submission.columns,[0,1]))

Delay 컬럼이 Not_Delayed, Delayed 로 범주형 문자열로 되어 있는데 

이를 column_number 가 가지고 있는 dic의 키 값으로 대체하는 방법들... 여기서는 3번을 사용

In [19]:
train['Delay'].replace(column_number)  # 1
train['Delay'].apply(lambda x : column_number[x]) # 2
train['Delay_num'] = train['Delay'].map(column_number) # 3

In [20]:
train.head(2)

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
1,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,Not_Delayed,0
2,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,Not_Delayed,0


In [21]:
train_x = train.drop(columns=['Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

In [22]:
train_x.shape, train_y.shape, test_x.shape

((707317, 17), (707317,), (1000000, 17))

In [None]:
!pip install pycaret

AutML로 최적의 머신러닝 모델을 추론한다.

In [24]:
from pycaret import classification
classification.setup(data = train_x, target = train_y)

Unnamed: 0,Description,Value
0,Session id,1974
1,Target,Delay_num
2,Target type,Binary
3,Original data shape,"(707317, 18)"
4,Transformed data shape,"(707317, 28)"
5,Transformed train set shape,"(495121, 28)"
6,Transformed test set shape,"(212196, 28)"
7,Numeric features,10
8,Categorical features,7
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x7fa962da9720>

In [25]:
classification.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


자동화 툴에 평가방법 추가

In [26]:
from sklearn.metrics import log_loss
classification.add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)

Name                                                       Log Loss
Display Name                                               Log Loss
Score Function                <function log_loss at 0x7fa9c48001f0>
Scorer               make_scorer(log_loss, greater_is_better=False)
Target                                                         pred
Args                                                             {}
Greater is Better                                             False
Multiclass                                                     True
Custom                                                         True
Name: logloss, dtype: object

In [27]:
# 1차로 돌렸을때 linearRegression이 가장 좋았음 나머지 는 시간관계상 제외

# best = classification.compare_models(n_select=5, exclude=['lr','knn','ridge',
# 'ada',
# 'gbc',
# 'lda',
# 'rf',
# 'svm',
# 'nb',
# 'dt',
# 'qda',
# 'et'
# ])

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [32]:
qual_col = train_x.select_dtypes(include='object').columns

Index(['Origin_Airport', 'Origin_State', 'Destination_Airport',
       'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number'],
      dtype='object')

In [None]:
test_x

In [39]:
#질적 변수들을 수치화합니다
qual_col = train_x.select_dtypes(include='object').columns

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train_x[i]=le.transform(train_x[i])
    try:
      for label in np.unique(test_x[i]):
          if label not in le.classes_: 
              le.classes_ = np.append(le.classes_, label)
      test_x[i]=le.transform(test_x[i])
    except Exception as e :
      print(i, e)
      pass

In [46]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'] ,
    'max_iter': [100, 1000, 10000]
}

In [None]:
grid_search  = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(train_x,train_y)