In [16]:
import pandas as pd
import numpy as np
import sys
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import joblib

import mysql.connector
from mysql.connector import Error


def connect_to_database(database, host='localhost', user='root', password='password', v=True):
    """Open a MySQL connection to ``database`` and return it with a cursor.

    Args:
        database: Name of the schema to select when connecting.
        host: MySQL server host name.
        user: MySQL account name.
        password: Password for ``user``.
            NOTE(review): the default is a hardcoded credential; prefer
            passing a value read from the environment or ``getpass``.
        v: When true, print server/database details and connection errors.

    Returns:
        ``(connection, cursor)`` on success. ``(None, None)`` when the
        connection could not be established, so callers that unpack the
        result (``con, cur = connect_to_database(...)``) get two ``None``
        values instead of an unpacking ``TypeError``.
    """
    connection = None
    try:
        connection = mysql.connector.connect(
            host=host,
            database=database,
            user=user,
            password=password
        )

        if connection.is_connected():
            db_info = connection.get_server_info()
            if v:
                print("Connected to MySQL Server version", db_info)
            cursor = connection.cursor()
            cursor.execute("select database();")
            record = cursor.fetchone()
            if v:
                print("You're connected to database:", record)
            return connection, cursor

    except Error as e:
        if v:
            print("Error while connecting to MySQL", e)

    # Failure path: a handle may exist even though the connection is unusable
    # (not connected, or an error after connect). Release it so it is not leaked.
    if connection is not None:
        try:
            connection.close()
        except Error:
            # Best-effort cleanup; the connection was already unusable.
            pass

    return None, None

def close_connection(connection, cursor):
    """Close ``cursor`` and then ``connection``; falsy arguments are skipped.

    Args:
        connection: Connection object to close, or ``None``.
        cursor: Cursor object to close, or ``None``.

    The connection is closed even when closing the cursor raises; in that
    case the cursor's exception still propagates to the caller afterwards.
    Prints a confirmation message when an open connection was closed.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        # Runs whether or not cursor.close() raised, so the connection
        # handle is never left open because of a cursor-level failure.
        if connection and connection.is_connected():
            connection.close()
            print("MySQL connection is closed")

def execute_query(connection, cursor, query, params=None):
    """Execute ``query`` on ``cursor`` and return its outcome.

    Args:
        connection: Open connection; used to commit statements that do not
            produce a result set.
        cursor: Cursor on which the statement is executed.
        query: SQL text, optionally with placeholders.
        params: Values bound to the placeholders, or ``None``.

    Returns:
        The list from ``cursor.fetchall()`` when the statement produced a
        result set; otherwise the change is committed and
        ``cursor.rowcount`` is returned. ``None`` if the driver raised
        ``Error`` (the error is printed).
    """
    try:
        cursor.execute(query, params)
        # Per DB-API 2.0, `description` is None when the last operation did
        # not return rows. Checking it instead of the SQL text prefix also
        # covers WITH/DESCRIBE/EXPLAIN and queries that start with a comment.
        if cursor.description is not None:
            return cursor.fetchall()
        connection.commit()
        return cursor.rowcount
    except Error as e:
        print("Error executing query:", e)
        return None

def put_res(results, query_type="SELECT"):
    """Print the outcome of a query.

    Args:
        results: Rows from a row-returning query, or an affected-row count.
        query_type: Kind of statement that produced ``results``; compared
            case-insensitively against "select" and "show".

    Any falsy ``results`` (``None``, empty list, ``0``) prints
    "No results found.". Otherwise rows are printed one per line for
    select/show queries, and the affected-row count for everything else.
    """
    if not results:
        print("No results found.")
    elif query_type.lower() not in ("select", "show"):
        print(f"Affected rows: {results}")
    else:
        for record in results:
            print(record)

def fetch_data_to_dataframe(connection, query, params=None):
    """Run ``query`` and return all of its rows as a pandas DataFrame.

    Args:
        connection: Open connection; a fresh cursor is created from it and
            always closed before returning.
        query: SQL text of a row-returning statement, optionally with
            placeholders.
        params: Values bound to the placeholders, or ``None`` (default) for
            a query without parameters. Passing values here avoids building
            SQL by string formatting.

    Returns:
        A DataFrame whose columns are named after the first field of each
        ``cursor.description`` entry, or ``None`` if the driver raised
        ``Error`` (the error is printed).
    """
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute(query, params)
        rows = cursor.fetchall()
        column_names = [field[0] for field in cursor.description]
        return pd.DataFrame(rows, columns=column_names)
    except Error as e:
        print(f"Error fetching data: {e}")
        return None
    finally:
        # Release the cursor on both the success and the error path.
        if cursor:
            cursor.close()





In [2]:
# Open the MySQL connection for the 'flight_prediction' schema; `con` is reused
# by the data-loading cells below.
# NOTE(review): this call relies on connect_to_database's default host, user
# and hardcoded password.
con, cur = connect_to_database('flight_prediction')

Connected to MySQL Server version 8.3.0
You're connected to database: ('flight_prediction',)


In [3]:
# Load the per-flight columns (ID, date, dep_time, arr_time, distance) from the
# `flight` table into df_flight.
# NOTE(review): fetch_data_to_dataframe returns None on a driver error, so
# df_flight may be None here.
query = '''
SELECT 
    ID,
    date,
    dep_time,
    arr_time,
    distance
FROM 
    flight
'''
df_flight = fetch_data_to_dataframe(con, query)



In [4]:
# Load ID and arr_delay from the `delay` table into df_delay; arr_delay is the
# column the binary target is later derived from.
# NOTE(review): `query` is rebound here, replacing the flight query above.
query = '''
SELECT 
    ID,
    arr_delay
FROM 
    delay
'''
df_delay = fetch_data_to_dataframe(con, query)

In [17]:
# Join delays to flight attributes on ID; the inner join keeps only IDs that
# appear in both frames.
df = pd.merge(df_delay, df_flight, on='ID', how='inner')
# Keep a snapshot of the merged frame; the modelling cell starts from df_copy.
df_copy = df.copy()

In [9]:
# Display the merged frame.
# NOTE(review): the saved output below lists day_of_week / dep_hour / arr_hour
# and no date / dep_time / arr_time, which the merge above does not produce.
# It appears to come from a different kernel state (execution count 9, between
# cells 17 and 18) - re-run top to bottom to refresh it.
df

Unnamed: 0,ID,arr_delay,distance,day_of_week,dep_hour,arr_hour
0,1,-6.0,986,2,11,14
1,2,-12.0,986,3,11,14
2,3,7.0,986,4,11,14
3,4,-5.0,986,5,11,14
4,5,113.0,986,6,11,14
...,...,...,...,...,...,...
5635962,5635963,-13.0,451,5,7,8
5635963,5635964,9.0,451,5,13,13
5635964,5635965,-30.0,1440,5,7,11
5635965,5635966,-4.0,368,5,12,12


In [18]:

# Start from a real copy: plain assignment would alias df_copy, and the column
# added below would then leak into the snapshot this cell is meant to reset from.
df = df_copy.copy()

# Create a binary target variable: 1 if delayed, 0 otherwise.
# Vectorised comparison instead of a per-row lambda; a missing arr_delay
# compares False and therefore maps to 0.
df['delayed'] = (df['arr_delay'] > 0).astype(int)

# Drop the original 'arr_delay' column
df = df.drop(columns=['arr_delay'])

# Define a function to extract features
def extract_features(df):
    """Derive model features from the raw flight columns.

    Adds ``day_of_week`` (pandas ``dayofweek``: Monday is 0) from ``date``,
    and ``dep_hour`` / ``arr_hour`` as ``dep_time`` / ``arr_time``
    floor-divided by 100 (presumably HHMM-encoded times - TODO confirm).
    The three source columns are then dropped; every other column is passed
    through unchanged.

    Args:
        df: DataFrame containing at least ``date``, ``dep_time`` and
            ``arr_time``. ``date`` may be a datetime64 column or any values
            ``pd.to_datetime`` can parse (e.g. ``datetime.date`` objects).

    Returns:
        A new DataFrame; the input frame is left untouched, so the function
        can be applied repeatedly to the same data (fit, then predict).
    """
    features = df.assign(
        # to_datetime is a no-op for datetime64 input and makes object-dtype
        # date columns usable with the .dt accessor.
        day_of_week=pd.to_datetime(df['date']).dt.dayofweek,
        dep_hour=df['dep_time'] // 100,
        arr_hour=df['arr_time'] // 100,
    )
    return features.drop(columns=['date', 'dep_time', 'arr_time'])

# Split the data into features and target
# NOTE(review): X still contains the ID column (only 'delayed' is dropped here
# and extract_features drops only date/dep_time/arr_time), so the row
# identifier is scaled and fed to the classifier as a feature. Confirm whether
# that is intended before relying on the scores.
X = df.drop(columns=['delayed'])
y = df['delayed']

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline: feature extraction -> standardisation -> gradient-boosted classifier
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases -
# confirm against the installed version.
pipeline = Pipeline(steps=[
    ('feature_engineering', FunctionTransformer(extract_features, validate=False)),
    ('scaler', StandardScaler()),
    ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# Train the model
pipeline.fit(X_train, y_train)

# Save the pipeline to a file
joblib_file = "pipeline.pkl"
model_path="../model"
# Create the target directory first; joblib.dump does not create missing folders.
os.makedirs(model_path, exist_ok=True)
pipeline_path = os.path.join(model_path, joblib_file)
# NOTE: extract_features is pickled by reference, so any process that loads
# this file must be able to resolve a function of that name in the module it
# was defined in (here the notebook's __main__).
joblib.dump(pipeline, pipeline_path)

# Load the pipeline from the file (round-trip check of the saved artefact)
loaded_pipeline = joblib.load(pipeline_path)

# Predict on the test set using the loaded pipeline
y_pred = loaded_pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6813796028012924
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.95      0.80    748877
           1       0.60      0.16      0.25    378317

    accuracy                           0.68   1127194
   macro avg       0.64      0.55      0.52   1127194
weighted avg       0.66      0.68      0.61   1127194

