In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

### Load data

In [3]:
# Load the flight records. The path is relative, so 'airlines.csv' must sit in
# the kernel's working directory.
df = pd.read_csv('airlines.csv')
# Bare last expression: display the loaded frame.
df

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y
...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N


### Check for missing values

In [4]:
# Count missing entries per column (isna is the canonical name for isnull).
print(df.isna().sum())

Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
UniqueCarrier        0
Origin               0
Dest                 0
Distance             0
dep_delayed_15min    0
dtype: int64


### Drop rows with missing target values

In [5]:
# Keep only rows that have a target label. The null check above reports zero
# missing values for dep_delayed_15min, so on this dataset the call removes
# nothing; it is kept as a guard for other extracts of the data.
# (A stray bare `df.drop` expression, which had no effect, was removed.)
df = df.dropna(subset=['dep_delayed_15min'])

### Encode categorical variables

In [6]:
# One LabelEncoder per categorical column; the fitted encoders are kept in
# `label_encoders` so the same string -> code mapping can be reused later.
#
# LabelEncoder numbers labels in sorted order, so the 'c-N' strings are coded
# in string order, not numeric order (e.g. 'c-8' becomes 10, as the encoded
# frame shown further down confirms).
#
# NOTE: this overwrites the columns of `df` in place. Re-running the cell
# would refit the encoders on the integer codes instead of the original
# strings; reload the data first if the cell has to be executed again.
label_encoders = {
    column: LabelEncoder()
    for column in ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']
}
for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])

### Extract features and target variable

In [7]:
# Feature matrix: the six label-encoded columns plus DepTime and Distance.
feature_columns = [
    'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
    'UniqueCarrier', 'Origin', 'Dest', 'Distance',
]
X = df[feature_columns]

# Binary target: 1 where dep_delayed_15min is 'Y', 0 for every other value.
y = df['dep_delayed_15min'].map(lambda flag: int(flag == 'Y'))

# Display both as a tuple.
X, y


(       Month  DayofMonth  DayOfWeek  DepTime  UniqueCarrier  Origin  Dest  \
 0         10          13          6     1934              0      18    78   
 1          6          12          2     1548             18     217   171   
 2         11          11          4     1422             20     228    59   
 3          2          17          5     1015             15      78   175   
 4          1          28          5     1828             19     174   199   
 ...      ...         ...        ...      ...            ...     ...   ...   
 99995      7          25          2     1618             15     246   224   
 99996      0           9          2      804              4      92    72   
 99997      0          16          1     1901             13      85   131   
 99998      6          19          3     1515             12      79   107   
 99999      2           8          3     1800             19     245   253   
 
        Distance  
 0           732  
 1           834  
 2   

### Split the data into training and testing sets

In [8]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): the split is not stratified on y - consider stratify=y, since
# the evaluation output shows far fewer delayed flights than on-time ones.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Initialize and train the model

In [9]:
# Random forest with library-default hyperparameters; random_state makes the
# fitted model reproducible.
model = RandomForestClassifier(random_state=42)
# Fit on the training split only (the test split stays unseen until evaluation).
model.fit(X_train, y_train)

### Make predictions

In [10]:
# Predict the 0/1 delay label for every row of the held-out test split.
y_pred = model.predict(X_test)
# Bare last expression: display the predicted labels.
y_pred

array([0, 0, 0, ..., 1, 0, 0])

### Evaluate the model

In [11]:
# Overall fraction of test rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
# Per-class precision/recall/F1. Read this alongside the accuracy: in the saved
# output below, recall for class 1 (delayed) is only 0.11 even though accuracy
# is 0.82, because class 0 has many more test rows.
print(classification_report(y_test, y_pred))

Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.83      0.99      0.90     16222
           1       0.65      0.11      0.19      3778

    accuracy                           0.82     20000
   macro avg       0.74      0.55      0.54     20000
weighted avg       0.79      0.82      0.77     20000



### Function to preprocess and predict if a flight will be delayed

In [18]:
def predict_delay(model, label_encoders, input_data):
    """Predict whether a single flight will be delayed.

    Each string-valued feature is converted to its integer code with the
    matching fitted encoder, so raw values such as ``'c-8'`` or ``'ATL'`` are
    encoded the same way as the training data. Non-string values (for example
    integer codes that were already encoded by the caller) are passed to the
    model unchanged.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict(DataFrame)``; the first element
        of its result is compared with ``1``.
    label_encoders : dict
        Maps a column name to a fitted encoder exposing ``transform``.
    input_data : dict
        One flight, keyed by feature name. Extra keys are ignored.

    Returns
    -------
    str
        ``'Delayed'`` if the model predicts ``1``, otherwise ``'On Time'``.

    Raises
    ------
    KeyError
        If a required feature is missing from ``input_data``, or a
        string-valued feature has no entry in ``label_encoders``.
    Exception
        Whatever the encoder raises for a label it was not fitted on is
        propagated (scikit-learn's LabelEncoder raises ``ValueError``).
    """
    features = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance']

    input_df = pd.DataFrame([input_data])

    # Fail early with a readable message instead of an opaque indexing error.
    missing = [column for column in features if column not in input_df.columns]
    if missing:
        raise KeyError(f'input_data is missing required feature(s): {missing}')

    # Encode every feature that arrives as a raw string label. Integer inputs
    # are assumed to be label codes already and are left untouched, which keeps
    # existing callers that pass pre-encoded values working.
    for column in features:
        if isinstance(input_df[column].iloc[0], str):
            encoder = label_encoders[column]
            input_df[column] = encoder.transform(input_df[column])

    # Present the columns in the order the model was trained on.
    input_df = input_df[features]

    prediction = model.predict(input_df)

    return 'Delayed' if prediction[0] == 1 else 'On Time'

# Example input for a single flight.
# The integer Month, DayofMonth and DayOfWeek values reach the model unchanged,
# so they are label codes rather than calendar numbers: 10, 13 and 6 are the
# codes of 'c-8', 'c-21' and 'c-7' (row 0 of the encoded frame shown earlier).
# NOTE(review): confirm DepTime=19 and Distance=153 are the intended values.
new_flight_data = dict(
    Month=10,
    DayofMonth=13,
    DayOfWeek=6,
    DepTime=19,
    UniqueCarrier='AA',
    Origin='ATL',
    Dest='DFW',
    Distance=153,
)

# Show the predicted label for the example flight.
print(predict_delay(model, label_encoders, new_flight_data))

Delayed
