In [30]:
%pip install scikit-learn





In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report



In [2]:
# read the file from a filepath to its csv into a dataframe
file_path = r"C:\Users\raaja\OneDrive\Documents\maichu development\notebooks\merged_data_with_labels.csv"
df = pd.read_csv(file_path)

In [3]:
# get class value counts to see if the data set is imbalanced
class_counts = df['IsDelayed'].value_counts()

# print the counts 
print(class_counts)

IsDelayed
Not Delayed    1150997
Delayed         700439
Name: count, dtype: int64


In [4]:
# make a list of the desired columns in the data frame with a normal or skewed distribution from EDA
desired_columns = ["windspeedKmph", "winddirDegree", "precipMM", "visibility", "pressure", "cloudcover", "DewPointF", "WindGustKmph", "tempF", "WindChillF", "humidity", "DepDelayMinutes", "IsDelayed"]

# filter the data frame with only the filtered columns by dropping the unrequired columns
df = df.drop(columns=list(set(df.columns) - set(desired_columns)))

In [5]:
# split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(df.drop("IsDelayed", axis=1), df["IsDelayed"], test_size=0.2, random_state=42)

In [6]:
# default model
model = LogisticRegression(max_iter=1000) 
model.fit(X_train, y_train)


In [7]:
# create a scaled model of the training data 
scaler = StandardScaler()
# fit_transform is used to calculate training data's parameters and scale the testing data according to those parameters using transform (prevent data leakage)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# create and train the logistic regression model
# model = LogisticRegression(solver='sag')

model2 = LogisticRegression() 
model2.fit(X_train_scaled, y_train)


In [8]:
# make predictions on the test set
y_pred = model.predict(X_test)
y2_pred = model2.predict(X_test_scaled)

In [9]:
accuracy = accuracy_score(y_test, y_pred)

accuracy2 = accuracy_score(y_test, y2_pred)

print("Default model accuracy:", accuracy)
print("Scaled input model accuracy:", accuracy2 * 100)
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Default model accuracy: 0.8114764723674546
Scaled input model accuracy: 81.1525083178499
Confusion matrix:
 [[ 80927  58791]
 [ 11017 219553]]
Classification Report:
               precision    recall  f1-score   support

     Delayed       0.88      0.58      0.70    139718
 Not Delayed       0.79      0.95      0.86    230570

    accuracy                           0.81    370288
   macro avg       0.83      0.77      0.78    370288
weighted avg       0.82      0.81      0.80    370288



In [10]:
# assuming "Delayed" is the positive class 
precision = precision_score(y_test, y_pred, pos_label="Delayed") 
print("Default model recision:", precision)

precision2 = precision_score(y_test, y2_pred, pos_label="Delayed") 
print("Scaled input model recision:", precision2)


Default model recision: 0.8801770643000087
Scaled input model recision: 0.880142645907628


In [11]:
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:\n", accuracy_score(y_test, y_pred) * 100)
print("Classification Report:\n", classification_report(y_test, y_pred))

Confusion matrix:
 [[ 80927  58791]
 [ 11017 219553]]
Accuracy:
 81.14764723674546
Classification Report:
               precision    recall  f1-score   support

     Delayed       0.88      0.58      0.70    139718
 Not Delayed       0.79      0.95      0.86    230570

    accuracy                           0.81    370288
   macro avg       0.83      0.77      0.78    370288
weighted avg       0.82      0.81      0.80    370288



In [None]:
from imblearn.over_sampling import SMOTE

# Assuming 'IsDelayed' is the target variable
X = df.drop('IsDelayed', axis=1)  # Features (all columns except target)
y = df['IsDelayed']  # Target variable

# Define and fit the SMOTE oversampler (adjust parameters as needed)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Now you have a rebalanced dataset (X_resampled, y_resampled)
