<a href="https://colab.research.google.com/github/abdiasis-Hassan/zindi-Compete/blob/main/Flight_Delay_Prediction_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
from google.colab import files

# Load the data from the uploaded file
file_path = '/content/Train.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
data.head()


Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Encoding categorical variables
label_encoder = LabelEncoder()
categorical_columns = ['DEPSTN', 'ARRSTN', 'FLTID', 'STATUS', 'AC']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

# Correcting the parsing of the STA column and converting date-time columns
data['STA'] = data['STA'].str.replace('.', ':', regex=False)
data['DATOP'] = pd.to_datetime(data['DATOP'])
data['STD'] = pd.to_datetime(data['STD'])
data['STA'] = pd.to_datetime(data['STA'], format='%Y-%m-%d %H:%M:%S')

# Feature Engineering: Extracting day of the week, month, and hour
data['day_of_week'] = data['DATOP'].dt.dayofweek
data['month'] = data['DATOP'].dt.month
data['departure_hour'] = data['STD'].dt.hour
data['arrival_hour'] = data['STA'].dt.hour

# Dropping the original date and time columns and the ID column
data = data.drop(['ID', 'DATOP', 'STD', 'STA'], axis=1)

# Splitting the data into training and testing sets
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training using Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and Model Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Model Evaluation Metrics:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)


Model Evaluation Metrics:
Mean Absolute Error (MAE): 52.78978846438374
Mean Squared Error (MSE): 14072.76624795608
Root Mean Squared Error (RMSE): 118.62869066105416


In [28]:
# Load the test dataset
test_file_path = '/content/Test.csv'
test_data = pd.read_csv(test_file_path)

# Preprocess the test dataset similar to the training dataset

# Encoding categorical variables
for col in categorical_columns:
    test_data[col] = label_encoder.fit_transform(test_data[col])

# Correcting the parsing of the STA column and converting date-time columns
test_data['STA'] = test_data['STA'].str.replace('.', ':', regex=False)
test_data['DATOP'] = pd.to_datetime(test_data['DATOP'])
test_data['STD'] = pd.to_datetime(test_data['STD'])
test_data['STA'] = pd.to_datetime(test_data['STA'], format='%Y-%m-%d %H:%M:%S')

# Feature Engineering
test_data['day_of_week'] = test_data['DATOP'].dt.dayofweek
test_data['month'] = test_data['DATOP'].dt.month
test_data['departure_hour'] = test_data['STD'].dt.hour
test_data['arrival_hour'] = test_data['STA'].dt.hour

# Dropping the original date and time columns and the ID column
test_data = test_data.drop(['ID', 'DATOP', 'STD', 'STA'], axis=1)

# Ensure the test dataset has the same feature columns as the training set
X_test = test_data[X_train.columns]

# Using the model to make predictions on the test dataset
test_predictions = model.predict(X_test)
test_predictions[:10]  # Displaying first 10 predictions



array([ 59.37166667,  91.96166667, 254.48066667, 209.05866667,
        54.17833333, 348.89333333, 180.34683333,  63.84283333,
       103.83297619, 256.78583333])

In [32]:
# Creating a submission DataFrame with specific IDs
# Assuming the length of test_predictions matches the required number of IDs

# Generating the required IDs
num_rows = len(test_predictions)
required_ids = ['test_id_' + str(i) for i in range(num_rows)]

submission = pd.DataFrame({
    "ID": required_ids,
    "Predicted_Delay": test_predictions
})

# Display the first few rows of the submission file
print(submission.head())
submission.to_csv('second_submission.csv', index = False)
files.download('second_submission.csv')

          ID  Predicted_Delay
0  test_id_0        59.371667
1  test_id_1        91.961667
2  test_id_2       254.480667
3  test_id_3       209.058667
4  test_id_4        54.178333


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# Creating a submission DataFrame
# 'ID' is assumed to be the unique identifier. If 'ID' does not exist, use the DataFrame index.
if 'ID' in test_data.columns:
    unique_identifier = test_data['ID']
else:
    unique_identifier = test_data.index

submission = pd.DataFrame({
    "ID": unique_identifier,
    "Predicted_Delay": test_predictions
})

# Display the first few rows of the submission file
print(submission.head())
submission.to_csv('first_submission.csv', index = False)
files.download('first_submission.csv')

   ID  Predicted_Delay
0   0        59.371667
1   1        91.961667
2   2       254.480667
3   3       209.058667
4   4        54.178333


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>