In [5]:

import statsmodels.api as sm
import pandas as pd
import numpy as np
import pycaret.regression as pycr
import pycaret.utils as pycu


df = sm.datasets.get_rdataset('flights', 'nycflights13').data
df.info()


df.isnull().sum()

df.dropna(inplace=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   year            336776 non-null  int64  
 1   month           336776 non-null  int64  
 2   day             336776 non-null  int64  
 3   dep_time        328521 non-null  float64
 4   sched_dep_time  336776 non-null  int64  
 5   dep_delay       328521 non-null  float64
 6   arr_time        328063 non-null  float64
 7   sched_arr_time  336776 non-null  int64  
 8   arr_delay       327346 non-null  float64
 9   carrier         336776 non-null  object 
 10  flight          336776 non-null  int64  
 11  tailnum         334264 non-null  object 
 12  origin          336776 non-null  object 
 13  dest            336776 non-null  object 
 14  air_time        327346 non-null  float64
 15  distance        336776 non-null  int64  
 16  hour            336776 non-null  int64  
 17  minute    

In [6]:

df['arr_hour'] = df.arr_time.apply(lambda x: int(np.floor(x/100)))
df['arr_minute'] = df.arr_time.apply(lambda x: int(x - np.floor(x/100)*100))
df['sched_arr_hour'] = df.sched_arr_time.apply(lambda x: int(np.floor(x/100)))
df['sched_arr_minute'] = df.sched_arr_time.apply(lambda x: int(x - np.floor(x/100)*100))
df['sched_dep_hour'] = df.sched_dep_time.apply(lambda x: int(np.floor(x/100)))
df['sched_dep_minute'] = df.sched_dep_time.apply(lambda x: int(x - np.floor(x/100)*100))
df.rename(columns={'hour': 'dep_hour', 'minute': 'dep_minute'}, inplace=True)
                   

dftrain = df.sample(n=100000, random_state=1066)
dftest = df.drop(dftrain.index)

                   
pycaret_experiment = pycr.setup(data=dftrain, target="arr_delay", session_id=1066, ignore_features=['flight', 'tailnum', 'time_hour', 'year', 'dep_time', 'sched_dep_time', 'arr_time', 'sched_arr_time', 'dep_delay'])
                                

best = pycr.compare_models(sort='RMSE')


Unnamed: 0,Description,Value
0,Session id,1066
1,Target,arr_delay
2,Target type,Regression
3,Original data shape,"(100000, 25)"
4,Transformed data shape,"(100000, 33)"
5,Transformed train set shape,"(70000, 33)"
6,Transformed test set shape,"(30000, 33)"
7,Ignore features,9
8,Numeric features,12
9,Categorical features,3


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.0864,27.8103,4.8506,0.9862,0.1096,0.0621,12.539
rf,Random Forest Regressor,1.1483,33.6107,5.3443,0.9833,0.1199,0.0676,17.266
xgboost,Extreme Gradient Boosting,3.2496,47.3098,6.64,0.9764,0.3408,0.2792,0.646
dt,Decision Tree Regressor,1.9128,53.6412,6.9478,0.9732,0.2096,0.1316,0.472
lightgbm,Light Gradient Boosting Machine,6.3995,159.4863,12.3618,0.9205,0.532,0.4375,0.651
gbr,Gradient Boosting Regressor,14.9391,580.2871,24.0405,0.7091,1.1931,0.8735,4.107
knn,K Neighbors Regressor,21.7655,1730.0311,41.5789,0.1311,1.1394,1.4666,1.703
ridge,Ridge Regression,25.8476,1783.8993,42.225,0.104,1.3866,1.7588,0.228
lr,Linear Regression,25.8479,1783.9424,42.2256,0.1039,1.3865,1.7589,1.768
br,Bayesian Ridge,25.8504,1784.1373,42.2279,0.1038,1.3877,1.7558,0.274


In [None]:

# PyCaret is an open-source, low-code machine learning library in Python that simplifies the end-to-end machine learning 
# process. It is designed to streamline and automate various tasks involved in machine learning, making it easier for both 
# beginners and experienced data scientists to experiment with different models, perform feature engineering, 
# hyperparameter tuning, and more.

# Overall, PyCaret is designed to accelerate the machine learning workflow, reduce the amount of code needed for common 
# tasks, and provide a user-friendly interface for individuals who want to quickly experiment with machine learning models 
# without delving into intricate coding details.
