In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt 
from catboost import CatBoostClassifier,Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# Print the full path of every file found under the Kaggle input mount.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [2]:
# Read the train and test tables directly from the zipped CSV files.
TRAIN_CSV = "../input/flight-delays-fall-2018/flight_delays_train.csv.zip"
TEST_CSV = "../input/flight-delays-fall-2018/flight_delays_test.csv.zip"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Preview the first rows of the training frame to check column names and raw value formats.
train.head()

In [4]:
# Stack train on top of test under a fresh 0..n-1 index so feature engineering can be
# applied to both at once; the rows are separated again by position further down.
data=pd.concat([train,test],ignore_index=True)

In [5]:
# Column dtypes and non-null counts for both raw frames, separated by a dashed rule.
train.info()
print('-'*45)
test.info()

In [6]:
# Encode the target flag as numbers: 'N' -> 0, 'Y' -> 1.
#
# The whole column is reassigned on the frame itself rather than written through
# chained indexing (`frame[col].loc[mask] = value`): a chained write lands on an
# intermediate Series and is silently lost under pandas copy-on-write.
#
# Values that are not 'N'/'Y' (already-encoded 0/1, or NaN on rows without a
# label) pass through unchanged, so re-running this cell is harmless.
TARGET_CODES = {'N': 0, 'Y': 1}

for frame in (train, data):
    frame['dep_delayed_15min'] = frame['dep_delayed_15min'].map(
        lambda flag: TARGET_CODES.get(flag, flag)
    )

In [7]:
# Remove the literal 'c-' marker from the three calendar columns in both frames.
# One pass per (frame, column) is enough: a second replace would find nothing left
# to strip. `regex=False` makes the literal match explicit regardless of the
# pandas version's default.
for frame in (data, train):
    for col in ('Month', 'DayofMonth', 'DayOfWeek'):
        frame[col] = frame[col].str.replace('c-', '', regex=False)

In [8]:
# Flight counts per month, split by the delay flag.
sns.countplot(data=train, x='Month', hue='dep_delayed_15min')

In [9]:
# Flight counts per day of week, split by the delay flag.
sns.countplot(data=train, x='DayOfWeek', hue='dep_delayed_15min')

In [10]:
# Overall flight counts per day of week (no split), as a baseline for the chart above.
sns.countplot(data=train, x='DayOfWeek')

In [11]:
# Number of training rows per carrier code, most frequent first.
train['UniqueCarrier'].value_counts()

In [12]:
# Flight counts per carrier, split by the delay flag.
sns.countplot(data=train, x='UniqueCarrier', hue='dep_delayed_15min')

In [13]:
# Split DepTime into an hour part (DepTime // 100) and a minute part (DepTime % 100)
# for both frames. Hour values 24 and 25 are folded into hour 0.
#
# NOTE(review): the minute column is called 'dep_minute' on `train` but 'dep_min' on
# `data`; the names are kept as-is here because other cells refer to 'dep_min'.
for frame, minute_col in ((train, 'dep_minute'), (data, 'dep_min')):
    dep_time = frame['DepTime']
    frame['dep_hour'] = (dep_time // 100).replace([24, 25], 0)
    frame[minute_col] = dep_time % 100

In [14]:
# Flight counts per departure hour, split by the delay flag.
sns.countplot(data=train, x='dep_hour', hue='dep_delayed_15min')

In [15]:
# Distance histogram split by the delay flag, using hand-picked, unevenly spaced bin edges.
distance_bin_edges = [0, 200, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000]
sns.displot(data=train, x='Distance', hue='dep_delayed_15min', bins=distance_bin_edges)

In [16]:
# Flight counts per day of month, split by the delay flag.
sns.countplot(data=train, x='DayofMonth', hue='dep_delayed_15min')

In [17]:
# Hand-crafted features, added to the combined frame in a fixed column order.
month_num = data['Month'].astype(int)
day_num = data['DayofMonth'].astype(int)
distance = data['Distance']

# Origin and destination joined into a single route key, e.g. '<Origin>_<Dest>'.
data['Route'] = data['Origin'].astype('str') + '_' + data['Dest'].astype('str')

# 1 when the flight is in month 7, 8 or 12, else 0.
data['holidays'] = month_num.isin([7, 8, 12]).astype(int)

# 1 when the day of month is 20..24 inclusive, else 0.
# NOTE(review): the column name suggests "end of month" but the range stops at 24 - confirm.
data['last_month_days'] = day_num.isin(range(20, 25)).astype(int)

data['hour_squared'] = data['dep_hour'] ** 2

# Distance bucket: 'long' for >= 2000, 'medium' for values strictly between 500 and
# 2000, 'short' for everything else. np.select uses the first matching condition,
# so the '>= 2000' test must come before the '> 500' test.
data['distance_comment'] = np.select(
    [distance >= 2000, distance > 500],
    ['long', 'medium'],
    default='short',
)

In [18]:
# Drop columns that are not passed to the model: the raw departure time, its minute
# part, and the raw day of month.
data.drop(columns=['DepTime', 'dep_min', 'DayofMonth'], inplace=True)

In [19]:
# Split the combined frame back into its train and test parts by position.
# The boundary is the number of rows in `train` (train rows were stacked first),
# so the split stays correct if the dataset size changes.
n_train_rows = len(train)

new_train = data.iloc[:n_train_rows].copy()

# The test part carries no usable target. Drop the column with a non-inplace call,
# so nothing is mutated through a slice of `data`.
new_test = data.iloc[n_train_rows:].drop(columns='dep_delayed_15min')

In [20]:
class CFG:
    """Central configuration for the cross-validated modelling cells."""

    # Seed used for the fold split (and inside `lgb_params`).
    SEED = 42
    # Number of stratified folds.
    n_splits = 5

    # Keyword arguments passed to CatBoostClassifier.
    # NOTE(review): 'random_seed' is 0 while SEED is 42 - confirm this is intentional.
    # NOTE(review): 'task_type'/'devices' request GPU training - confirm a GPU is available.
    catboost_params = {
        'learning_rate': 0.05,
        'iterations': 10000,
        'eval_metric': 'AUC',
        'use_best_model': True,
        'verbose': 100,
        'random_seed': 0,
        'devices': '0:1',
        'task_type': "GPU",
    }

    # LightGBM-style parameters.
    # NOTE(review): no cell visible here reads this dict, and 'sub_sample' does not
    # look like a LightGBM parameter name ('subsample'?) - verify before using it.
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'n_estimators': 500,
        'sub_sample': 0.7,
        'colsample_bytree': 0.6,
        'seed': SEED,
        'silent': False,
        'early_stopping_rounds': 100,
    }

    # Columns handed to CatBoost as categorical features.
    categ_features = [
        'UniqueCarrier',
        'Origin',
        'Dest',
        'Route',
        'holidays',
        'last_month_days',
        'distance_comment',
    ]

    # Name of the label column.
    TARGET_COL = 'dep_delayed_15min'

In [21]:

# Stratified K-fold training of CatBoost.
# Each fold's model scores its held-out rows (collected in `oof_cat`) and the full
# test frame (collected in `cat_preds`, one array per fold).
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.SEED)

X = new_train.drop(CFG.TARGET_COL, axis=1)
Y = new_train[CFG.TARGET_COL].astype('float')

oof_cat = np.zeros((new_train.shape[0],))
cat_preds = []

fold_splits = skf.split(X, Y.astype(int))  # stratify on the integer class labels
for fold, (trn_idx, val_idx) in enumerate(fold_splits):
    print(50*'-')
    print('FOLD:', fold + 1)

    X_train, X_val = X.iloc[trn_idx, :], X.iloc[val_idx, :]
    Y_train, Y_val = Y.iloc[trn_idx].astype('float'), Y.iloc[val_idx].astype('float')

    train_pool = Pool(data=X_train, label=Y_train, cat_features=CFG.categ_features)
    val_pool = Pool(data=X_val, label=Y_val, cat_features=CFG.categ_features)

    # Validation fold doubles as the eval set; training stops after 200 rounds
    # without improvement on it.
    estimator = CatBoostClassifier(**CFG.catboost_params)
    estimator.fit(train_pool, eval_set=val_pool, early_stopping_rounds=200)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_val = estimator.predict_proba(X_val)[:, 1]
    oof_cat[val_idx] = y_pred_val

    y_pred_test = estimator.predict_proba(new_test)[:, 1]
    cat_preds.append(y_pred_test)

    print(50*'-')
    print()

# AUC over all out-of-fold predictions.
print('OOF_score : ', roc_auc_score(Y, oof_cat))
    

In [36]:
# Average the per-fold test predictions into one probability per test row.
catboost_preds = np.mean(cat_preds, axis=0)

# Ids are 0..n-1, where n comes from the predictions themselves, so the frame
# always lines up with however many test rows were scored.
submission = pd.DataFrame({
    'id': range(len(catboost_preds)),
    'dep_delayed_15min': catboost_preds,
})
submission.to_csv('Flight_delays_pred.csv', index=False)

In [37]:
# Display the submission frame as a final visual check of ids and predicted probabilities.
submission