In [None]:
# Mount Google Drive at /content/drive so files stored there are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install rarfile

import rarfile

# Open the archive in a context manager so its file handle is released as soon
# as extraction finishes (the handle was previously left open).
with rarfile.RarFile('/content/drive/MyDrive/Deep Learning for Perception/Flights Data.rar') as rf:
    rf.extractall('/content/')




In [None]:
!pip install python-docx
import docx
import json
import pandas as pd
import os

# Function to read a single .docx file and return a DataFrame
def read_docx_file(file_path):
    """Parse the JSON text stored in a .docx file into a flat DataFrame.

    The text of every non-empty paragraph is concatenated (without any
    separator) and decoded as one JSON document, which is then flattened with
    ``pd.json_normalize``.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        pandas.DataFrame: The normalized records, or an empty DataFrame when
        the combined text is not valid JSON (the error is printed).
    """
    document = docx.Document(file_path)

    # Empty paragraphs contribute nothing, so they are filtered out while joining.
    raw_text = ''.join(para.text for para in document.paragraphs if para.text)

    try:
        records = json.loads(raw_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON from {file_path}: {e}")
        # Best effort: a malformed file yields no rows instead of aborting the run.
        return pd.DataFrame()

    return pd.json_normalize(records)

# Function to process all .docx files in a directory
def process_all_docx_files(directory):
    """Load every .docx file in ``directory`` into one combined DataFrame.

    Args:
        directory: Folder to scan (non-recursively) for names ending in ``.docx``.

    Returns:
        pandas.DataFrame: The rows of all files stacked in file-name order with
        a fresh 0..n-1 index, or an empty DataFrame when the folder contains no
        .docx file.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # (and therefore any seeded split performed on the result) reproducible.
    docx_names = sorted(name for name in os.listdir(directory) if name.endswith('.docx'))

    frames = []
    for filename in docx_names:
        file_path = os.path.join(directory, filename)
        print(f"Processing file: {file_path}")
        frames.append(read_docx_file(file_path))

    # pd.concat raises ValueError on an empty list, so "no files" is handled explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Folder that is scanned for the flight-data .docx files
docx_directory = '/content/Flights Data'

# Parse every .docx in that folder and stack the rows into one DataFrame
combined_df = process_all_docx_files(docx_directory)

# Preview the first five rows as plain text
print(combined_df.head())

# Persist the combined data as CSV in the current working directory (index column omitted)
combined_df.to_csv('combined_flight_data.csv', index=False)


Processing file: /content/Flights Data/2.docx
Processing file: /content/Flights Data/71.docx
Processing file: /content/Flights Data/33.docx
Processing file: /content/Flights Data/12.docx
Processing file: /content/Flights Data/48.docx
Processing file: /content/Flights Data/24.docx
Processing file: /content/Flights Data/40.docx
Processing file: /content/Flights Data/50.docx
Processing file: /content/Flights Data/34.docx
Processing file: /content/Flights Data/63.docx
Processing file: /content/Flights Data/67.docx
Processing file: /content/Flights Data/59.docx
Processing file: /content/Flights Data/58.docx
Processing file: /content/Flights Data/1.docx
Processing file: /content/Flights Data/14.docx
Processing file: /content/Flights Data/51.docx
Processing file: /content/Flights Data/16.docx
Processing file: /content/Flights Data/70.docx
Processing file: /content/Flights Data/10.docx
Processing file: /content/Flights Data/41.docx
Processing file: /content/Flights Data/7.docx
Processing file:

In [None]:
combined_df['status']

Unnamed: 0,status
0,active
1,active
2,active
3,active
4,active
...,...
81387,active
81388,active
81389,active
81390,unknown


In [None]:
# combined_df.isna().sum()

In [None]:
# Keep only the flights whose status is 'active'. The explicit .copy() makes df
# an independent frame, so later column assignments never target a slice of
# combined_df (the cause of pandas' SettingWithCopyWarning).
df = combined_df[combined_df['status'] == 'active'].copy()


In [None]:
df

Unnamed: 0,type,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,arrival.iataCode,arrival.icaoCode,arrival.terminal,...,codeshared.airline.iataCode,codeshared.airline.icaoCode,codeshared.flight.number,codeshared.flight.iataNumber,codeshared.flight.icaoNumber,departure.gate,arrival.gate,arrival.actualTime,arrival.estimatedRunway,arrival.actualRunway
0,departure,active,lhe,opla,m,2023-08-01t08:00:00.000,2023-08-01t08:00:00.000,jed,oejn,h,...,,,,,,,,,,
1,departure,active,lhe,opla,m,2023-08-01t09:00:00.000,2023-08-01t09:00:00.000,khi,opkc,m,...,,,,,,,,,,
2,departure,active,lhe,opla,m,2023-08-01t09:10:00.000,,jed,oejn,,...,,,,,,,,,,
3,departure,active,lhe,opla,m,2023-08-01t09:15:00.000,2023-08-01t09:21:00.000,uet,opqt,,...,,,,,,,,,,
4,departure,active,lhe,opla,,2023-08-01t09:50:00.000,2023-08-01t09:50:00.000,urc,zwww,3,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81386,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t04:59:00.000,auh,omaa,a,...,ey,etd,222,ey222,etd222,26,a8a,,,
81387,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t04:59:00.000,auh,omaa,a,...,,,,,,26,a8a,,,
81388,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t05:15:00.000,dxb,omdb,2,...,fz,fdb,332,fz332,fdb332,,,,,
81389,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t05:15:00.000,dxb,omdb,2,...,,,,,,,,,,


In [None]:
df.isna().sum()

Unnamed: 0,0
type,0
status,0
departure.iataCode,0
departure.icaoCode,0
departure.terminal,31988
departure.scheduledTime,0
departure.estimatedTime,6348
arrival.iataCode,0
arrival.icaoCode,0
arrival.terminal,45084


In [None]:

df.head(1)

Unnamed: 0,type,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,arrival.iataCode,arrival.icaoCode,arrival.terminal,...,codeshared.airline.iataCode,codeshared.airline.icaoCode,codeshared.flight.number,codeshared.flight.iataNumber,codeshared.flight.icaoNumber,departure.gate,arrival.gate,arrival.actualTime,arrival.estimatedRunway,arrival.actualRunway
0,departure,active,lhe,opla,m,2023-08-01t08:00:00.000,2023-08-01t08:00:00.000,jed,oejn,h,...,,,,,,,,,,


In [None]:
import pandas as pd

# Columns excluded from the feature set.
# NOTE(review): these look like fields that are mostly empty in the preview
# above or that describe what happened after departure — confirm the rationale.
columns_to_remove = [
    'departure.actualTime',
    'departure.estimatedRunway',
    'departure.actualRunway',
    'arrival.estimatedTime',
    'arrival.delay',
    'arrival.baggage',
    'codeshared.airline.name',
    'codeshared.airline.iataCode',
    'codeshared.airline.icaoCode',
    'codeshared.flight.number',
    'codeshared.flight.iataNumber',
    'codeshared.flight.icaoNumber',
    'departure.gate',
    'arrival.gate',
    'arrival.actualTime',
    'arrival.estimatedRunway',
    'arrival.actualRunway'
]

# drop() returns a new frame; re-running this cell raises KeyError because the
# listed columns are then already gone.
df = df.drop(columns=columns_to_remove)


In [None]:
df.columns

Index(['type', 'status', 'departure.iataCode', 'departure.icaoCode',
       'departure.terminal', 'departure.scheduledTime',
       'departure.estimatedTime', 'arrival.iataCode', 'arrival.icaoCode',
       'arrival.terminal', 'arrival.scheduledTime', 'airline.name',
       'airline.iataCode', 'airline.icaoCode', 'flight.number',
       'flight.iataNumber', 'flight.icaoNumber', 'departure.delay'],
      dtype='object')

In [None]:
# df

In [None]:
# Swap the '.' separators left by JSON flattening for '_' in every column name
# (literal replacement, not a regular expression).
df.columns = df.columns.str.replace('.', '_', regex=False)
# df.head()


In [None]:
# Unpack the frame's dimensions in one step and report them
num_rows, num_cols = df.shape

print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

Number of rows: 75897
Number of columns: 18


In [None]:
# Mean of the recorded delays (Series.mean skips NaN by default).
# NOTE(review): departure_delay becomes the prediction target further down, so
# mean-filling fabricates labels for flights without a recorded delay — consider
# dropping those rows instead.
mean_departure_delay = df['departure_delay'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form mutates an intermediate object, which
# pandas warns about and which does not update df under copy-on-write.
df['departure_delay'] = df['departure_delay'].fillna(mean_departure_delay)

# Sanity check: no NaN should remain in the column
print(df['departure_delay'].isna().sum())


0


In [None]:
# Keep the delay column aside as the prediction target.
target = df['departure_delay']

In [None]:
# Remove the delay column from df so it is not used as an input feature.
df = df.drop(columns=['departure_delay'])


In [None]:
target.head(100)

Unnamed: 0,departure_delay
0,33.804466
1,33.804466
2,33.804466
3,6.000000
4,7.000000
...,...
96,10.000000
97,10.000000
98,33.804466
99,33.804466


In [None]:
# Discretize the delay into 8 equal-width intervals; labels=False returns the
# integer bin index (0-7) instead of Interval objects.
# NOTE(review): the outputs below show that bin 6 never occurs and every
# previewed row falls in bin 0, so equal-width bins look heavily imbalanced
# here — confirm whether quantile bins (pd.qcut) or hand-picked edges were intended.
target = pd.cut(target, bins=8, labels=False)


In [None]:
target

Unnamed: 0,departure_delay
0,0
1,0
2,0
3,0
4,0
...,...
81386,0
81387,0
81388,0
81389,0


In [None]:
print(target.unique())

[0 1 2 5 3 4 7]


In [None]:
print(target.isin([0]).all())

False


In [None]:
df.isna().sum()

Unnamed: 0,0
type,0
status,0
departure_iataCode,0
departure_icaoCode,0
departure_terminal,31988
departure_scheduledTime,0
departure_estimatedTime,6348
arrival_iataCode,0
arrival_icaoCode,0
arrival_terminal,45084


In [None]:
# Treat a missing terminal as its own category, 'Unknown'. The filled columns
# are assigned back rather than using fillna(inplace=True) on a column
# selection, which mutates an intermediate object and does not update df under
# pandas copy-on-write.
df['departure_terminal'] = df['departure_terminal'].fillna('Unknown')
df['arrival_terminal'] = df['arrival_terminal'].fillna('Unknown')

# Both counts should now be 0
print(df['departure_terminal'].isna().sum())
print(df['arrival_terminal'].isna().sum())


0
0


In [None]:
# Tally the rows that carry the 'Unknown' placeholder in each terminal column
# (summing a boolean mask counts its True entries).
departure_unknown_count = int((df['departure_terminal'] == 'Unknown').sum())
arrival_unknown_count = int((df['arrival_terminal'] == 'Unknown').sum())

print(f"Number of rows with 'departure_terminal' as 'Unknown': {departure_unknown_count}")
print(f"Number of rows with 'arrival_terminal' as 'Unknown': {arrival_unknown_count}")


Number of rows with 'departure_terminal' as 'Unknown': 31988
Number of rows with 'arrival_terminal' as 'Unknown': 45084


In [None]:
df

Unnamed: 0,type,status,departure_iataCode,departure_icaoCode,departure_terminal,departure_scheduledTime,departure_estimatedTime,arrival_iataCode,arrival_icaoCode,arrival_terminal,arrival_scheduledTime,airline_name,airline_iataCode,airline_icaoCode,flight_number,flight_iataNumber,flight_icaoNumber
0,departure,active,lhe,opla,m,2023-08-01t08:00:00.000,2023-08-01t08:00:00.000,jed,oejn,h,2023-08-01t10:45:00.000,pakistan international airlines,pk,pia,859,pk859,pia859
1,departure,active,lhe,opla,m,2023-08-01t09:00:00.000,2023-08-01t09:00:00.000,khi,opkc,m,2023-08-01t10:55:00.000,airblue,pa,abq,401,pa401,abq401
2,departure,active,lhe,opla,m,2023-08-01t09:10:00.000,,jed,oejn,Unknown,2023-08-01t16:05:00.000,serene air,er,sep,821,er821,sep821
3,departure,active,lhe,opla,m,2023-08-01t09:15:00.000,2023-08-01t09:21:00.000,uet,opqt,Unknown,2023-08-01t10:35:00.000,pakistan international airlines,pk,pia,322,pk322,pia322
4,departure,active,lhe,opla,Unknown,2023-08-01t09:50:00.000,2023-08-01t09:50:00.000,urc,zwww,3,2023-08-01t15:45:00.000,china southern airlines,cz,csn,6018,cz6018,csn6018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81386,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t04:59:00.000,auh,omaa,a,2024-06-01t06:05:00.000,klm,kl,klm,3920,kl3920,klm3920
81387,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t04:59:00.000,auh,omaa,a,2024-06-01t06:05:00.000,etihad airways,ey,etd,222,ey222,etd222
81388,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t05:15:00.000,dxb,omdb,2,2024-06-01t06:10:00.000,emirates,ek,uae,2109,ek2109,uae2109
81389,departure,active,khi,opkc,m,2024-06-01t04:55:00.000,2024-06-01t05:15:00.000,dxb,omdb,2,2024-06-01t06:10:00.000,flydubai,fz,fdb,332,fz332,fdb332


In [None]:
# Drop the three raw timestamp columns from the feature set.
# NOTE(review): this discards all date / time-of-day information — confirm that
# no derived features (hour, weekday, ...) were meant to be extracted first.
columns_to_remove = ['departure_scheduledTime', 'departure_estimatedTime', 'arrival_scheduledTime']
df = df.drop(columns=columns_to_remove)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column. A fresh encoder is fitted per
# column and stored in label_encoders, so the original categories of any column
# can be recovered with label_encoders[column].inverse_transform(...). Reusing a
# single encoder would keep only the mapping of the last column it was fitted on.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; models that read
# features as magnitudes (the logistic regression and SVM below) will see an
# ordering between categories that does not exist.
label_encoders = {}

for column in df.columns:
    # Only object-dtype columns are encoded; numeric columns are left as they are.
    if df[column].dtype == object:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Print the updated DataFrame with label-encoded values
print(df.head())


   type  status  departure_iataCode  departure_icaoCode  departure_terminal  \
0     0       0                   2                   2                   4   
1     0       0                   2                   2                   4   
2     0       0                   2                   2                   4   
3     0       0                   2                   2                   4   
4     0       0                   2                   2                   1   

   arrival_iataCode  arrival_icaoCode  arrival_terminal  airline_name  \
0                43                31                 7            63   
1                47                52                 9             9   
2                43                31                 5            77   
3                88                56                 5            63   
4                89                94                 2            20   

   airline_iataCode  airline_icaoCode  flight_number  flight_iataNumber  \
0          

In [None]:
(df['type'] != 0).sum()

0

**Models**

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# SVM class with fit and predict methods
class SVM:
    """Kernelized binary classifier trained with perceptron-style updates.

    ``fit`` does not solve the SVM quadratic programme. It sweeps over the
    training set and, for every sample whose functional margin is below 1,
    adds ``C`` to that sample's coefficient and shifts the bias by the
    sample's label.

    Labels must be encoded as -1 / +1.

    Note:
        ``fit`` materialises the full ``n_samples x n_samples`` kernel matrix,
        so memory grows quadratically with the number of training rows.

    Attributes set by ``fit``:
        alpha: Per-training-sample coefficients.
        b: Bias term.
        support_vectors_: Training rows whose coefficient exceeds 1e-5.
        support_alpha_: Coefficients of those rows.
        support_labels_: Labels of those rows.
    """

    def __init__(self, kernel='linear', C=1.0, max_iter=1000):
        """Store the hyper-parameters.

        Args:
            kernel: One of 'linear', 'polynomial' or 'rbf'.
            C: Amount added to a sample's coefficient on each margin violation.
            max_iter: Maximum number of passes over the training set.
        """
        self.C = C
        self.kernel_type = kernel
        self.max_iter = max_iter

    def linear_kernel(self, X1, X2):
        """Return the matrix of dot products between rows of X1 and rows of X2."""
        return np.dot(X1, X2.T)

    def polynomial_kernel(self, X1, X2, d=3):
        """Return ``(1 + <x1, x2>) ** d`` for every pair of rows."""
        return (1 + np.dot(X1, X2.T)) ** d

    def rbf_kernel(self, X1, X2, gamma=0.5):
        """Return ``exp(-gamma * ||x1 - x2||^2)`` for every pair of rows."""
        # ||a - b||^2 expanded as ||a||^2 + ||b||^2 - 2<a, b> to stay vectorised.
        sq_dist = np.sum(X1**2, axis=1).reshape(-1, 1) + np.sum(X2**2, axis=1) - 2 * np.dot(X1, X2.T)
        return np.exp(-gamma * sq_dist)

    def kernel(self, X1, X2):
        """Dispatch to the kernel selected at construction time.

        Raises:
            ValueError: If the configured kernel name is not recognised.
        """
        if self.kernel_type == 'linear':
            return self.linear_kernel(X1, X2)
        elif self.kernel_type == 'polynomial':
            return self.polynomial_kernel(X1, X2)
        elif self.kernel_type == 'rbf':
            return self.rbf_kernel(X1, X2)
        else:
            raise ValueError("Unknown kernel")

    def fit(self, X, y):
        """Learn the coefficients and bias from the training data.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples labels, each -1 or +1.

        Raises:
            ValueError: If ``y`` contains anything other than -1 and +1. The
                margin test ``y * f(x) < 1`` is meaningless for other encodings.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if not np.all(np.isin(y, (-1.0, 1.0))):
            raise ValueError("SVM expects binary labels encoded as -1 and +1")

        n_samples = X.shape[0]
        self.alpha = np.zeros(n_samples)
        self.b = 0
        K = self.kernel(X, X)

        for _ in range(self.max_iter):
            updated = False
            for i in range(n_samples):
                condition = y[i] * (np.dot(self.alpha * y, K[i]) + self.b)
                if condition < 1:
                    self.alpha[i] += self.C
                    self.b += y[i]
                    updated = True
            # A full pass without any update leaves the state unchanged, so
            # every later pass would be identical: stop early.
            if not updated:
                break

        # Keep coefficients and labels aligned with the retained support
        # vectors; predict needs all three to evaluate the decision function.
        support = self.alpha > 1e-5
        self.support_vectors_ = X[support]
        self.support_alpha_ = self.alpha[support]
        self.support_labels_ = y[support]

    def predict(self, X):
        """Return the sign of the decision function for each row of X.

        Args:
            X: Array-like of shape (n_rows, n_features).

        Returns:
            numpy.ndarray: Shape (n_rows,), values in {-1.0, 0.0, 1.0}
            (0.0 only when the decision value is exactly zero).
        """
        X = np.asarray(X, dtype=float)
        # K has shape (n_rows, n_support); weighting each support vector by
        # alpha_i * y_i gives sum_i alpha_i y_i k(x_i, x) per row.
        K = self.kernel(X, self.support_vectors_)
        return np.sign(np.dot(K, self.support_alpha_ * self.support_labels_) + self.b)

# Logistic Regression from scratch
class LogisticRegressionScratch:
    """Binary logistic regression fitted with full-batch gradient descent.

    Labels are expected to be 0 / 1; ``predict`` only ever returns 0 or 1.
    """

    def __init__(self, learning_rate=0.01, iterations=1000):
        """Store the optimiser settings.

        Args:
            learning_rate: Step size of each gradient-descent update.
            iterations: Number of full-batch updates performed by ``fit``.
        """
        self.learning_rate = learning_rate
        self.iterations = iterations

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)) element-wise."""
        # np.exp overflows float64 for arguments above ~709. Clipping to +/-500
        # avoids the overflow (and its RuntimeWarning) while the result is
        # already saturated at 0 or 1 to within float precision.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Learn weights and bias by minimising the mean log-loss.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples labels (0 or 1).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        self.m, self.n = X.shape
        self.weights = np.zeros(self.n)
        self.bias = 0

        for _ in range(self.iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Gradient of the mean log-loss with respect to weights and bias.
            residual = y_predicted - y
            dw = (1 / self.m) * np.dot(X.T, residual)
            db = (1 / self.m) * np.sum(residual)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return a list with 1 where the predicted probability exceeds 0.5, else 0.

        Args:
            X: Array-like of shape (n_rows, n_features).
        """
        X = np.asarray(X, dtype=float)
        probabilities = self.sigmoid(np.dot(X, self.weights) + self.bias)
        return [1 if p > 0.5 else 0 for p in probabilities]

# Decision Tree from scratch
class DecisionTreeScratch:
    """Depth-limited classification tree grown by maximising information gain.

    Every internal node is a tuple ``(feature_index, threshold, left, right)``
    where rows with ``x[feature_index] <= threshold`` go left; a leaf is the
    majority class label. Labels must be non-negative integers because the
    majority vote uses ``np.bincount``.
    """

    def __init__(self, max_depth=3):
        """Args:
            max_depth: Maximum number of splits between the root and any leaf.
        """
        self.max_depth = max_depth

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        _, counts = np.unique(y, return_counts=True)
        prob = counts / len(y)
        return -np.sum(prob * np.log2(prob))

    def split(self, X, y, feature_index, threshold):
        """Partition ``X`` and ``y`` on ``X[:, feature_index] <= threshold``.

        Returns:
            tuple: ``(X_left, X_right, y_left, y_right)``.
        """
        column = X[:, feature_index]
        left = column <= threshold
        right = column > threshold
        return X[left], X[right], y[left], y[right]

    def best_split(self, X, y):
        """Search all features and observed values for the highest-gain split.

        Returns:
            tuple: ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when no threshold separates the rows into two
            non-empty groups.
        """
        best_feature, best_threshold, best_info_gain = None, None, -float('inf')

        # The parent's entropy and size are the same for every candidate, so
        # they are computed once instead of once per threshold.
        parent_entropy = self.entropy(y)
        n_parent = len(y)

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                # Only the labels are needed to score a candidate; the feature
                # matrix itself is not copied here.
                y_left = y[column <= threshold]
                y_right = y[column > threshold]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                weight_left = len(y_left) / n_parent
                weight_right = len(y_right) / n_parent
                current_info_gain = parent_entropy - (
                    weight_left * self.entropy(y_left) + weight_right * self.entropy(y_right)
                )
                # Strict '>' keeps the first best candidate (lowest feature
                # index, then lowest threshold) on ties.
                if current_info_gain > best_info_gain:
                    best_feature = feature_index
                    best_threshold = threshold
                    best_info_gain = current_info_gain
        return best_feature, best_threshold

    def information_gain(self, parent, left, right):
        """Return the entropy reduction from splitting ``parent`` into ``left`` and ``right``."""
        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        gain = self.entropy(parent) - (weight_left * self.entropy(left) + weight_right * self.entropy(right))
        return gain

    def build_tree(self, X, y, depth=0):
        """Recursively grow the tree and return its root node (tuple or leaf label)."""
        # Stop on a pure node or when the depth budget is used up.
        if len(np.unique(y)) == 1 or depth == self.max_depth:
            return np.argmax(np.bincount(y))

        feature_index, threshold = self.best_split(X, y)
        # No usable split, e.g. every feature is constant on these rows.
        if feature_index is None:
            return np.argmax(np.bincount(y))

        X_left, X_right, y_left, y_right = self.split(X, y, feature_index, threshold)

        left_subtree = self.build_tree(X_left, y_left, depth+1)
        right_subtree = self.build_tree(X_right, y_right, depth+1)
        return (feature_index, threshold, left_subtree, right_subtree)

    def predict_single(self, x, tree):
        """Route one feature vector ``x`` down ``tree`` and return the leaf label."""
        if not isinstance(tree, tuple):
            return tree
        feature_index, threshold, left_subtree, right_subtree = tree
        if x[feature_index] <= threshold:
            return self.predict_single(x, left_subtree)
        else:
            return self.predict_single(x, right_subtree)

    def fit(self, X, y):
        """Grow the tree from training data and store it in ``self.tree``.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples non-negative integer labels.
        """
        self.tree = self.build_tree(np.asarray(X), np.asarray(y))

    def predict(self, X):
        """Return a list with the predicted label for each row of ``X``."""
        return [self.predict_single(x, self.tree) for x in np.asarray(X)]



# Hold out 20% of the rows as the test set; random_state fixes the shuffle so
# the same input rows always produce the same split.
# NOTE(review): no stratify= is passed, so rare delay bins may be missing from
# one side of the split — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42)

**Decision Tree**

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Fit the from-scratch decision tree on the training split
tree = DecisionTreeScratch(max_depth=3)
tree.fit(X_train.values, y_train.values)

y_pred_tree = tree.predict(X_test.values)

# Evaluate on the held-out split; the weighted F1 is computed once and reused.
# NOTE(review): with a target this concentrated in bin 0, a high accuracy can
# come from predicting the majority bin alone — compare against that baseline.
f1 = f1_score(y_test, y_pred_tree, average='weighted')
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_tree)}")
print(f"Decision Tree F1 Score: {f1}")

Decision Tree Accuracy: 0.9849143610013176
Decision Tree F1 Score: 0.9774288680316419
Decision Tree F1 Score: 0.9774288680316419


**Logistic Regression**

In [None]:
# Fit the from-scratch logistic regression on the training split.
# NOTE(review): LogisticRegressionScratch.predict only ever returns 0 or 1,
# while the binned target contains labels up to 7, so higher bins can never be
# predicted — confirm whether a binary target was intended here.
log_reg = LogisticRegressionScratch(learning_rate=0.01, iterations=1000)
log_reg.fit(X_train.values, y_train.values)

y_pred_logreg = log_reg.predict(X_test.values)

# Evaluate on the held-out split; the weighted F1 is computed once and printed
# under the correct model name.
f1 = f1_score(y_test, y_pred_logreg, average='weighted')
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_logreg)}")
print(f"Logistic Regression F1 Score: {f1}")

**SVM**

In [None]:
# Fit the from-scratch SVM on the training split.
# NOTE(review): SVM.fit builds an n_train x n_train kernel matrix and its
# margin rule assumes -1/+1 labels, while the target here holds bin indices
# 0-7 — confirm the label encoding and consider fitting on a subsample.
svm = SVM(kernel='linear')
svm.fit(X_train.values, y_train.values)

y_pred_svm = svm.predict(X_test.values)

# Evaluate on the held-out split; the weighted F1 is computed once and printed
# under the correct model name.
f1 = f1_score(y_test, y_pred_svm, average='weighted')
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm)}")
print(f"SVM F1 Score: {f1}")

**Ensembling**

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression as MetaLogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Function to perform stacking on already trained models
def stacking_ensemble(X_train, X_test, y_train, y_test,
                      svm_model, log_reg_model, tree_model):
    """Stack three already-fitted base models under a logistic-regression meta-model.

    Each base model's predictions become one input column of the meta-model,
    which is fitted on the training-set predictions and scored on the test-set
    predictions. Accuracy and weighted F1 are printed.

    Args:
        X_train: Training features (array-like or DataFrame).
        X_test: Test features (array-like or DataFrame).
        y_train: Training labels for the meta-model.
        y_test: Test labels used for scoring.
        svm_model: Fitted model exposing ``predict(X)``.
        log_reg_model: Fitted model exposing ``predict(X)``.
        tree_model: Fitted model exposing ``predict(X)``.

    Returns:
        The meta-model's predictions for ``X_test``.

    Raises:
        TypeError: If one of the model arguments has no ``predict`` method
            (for example when prediction arrays are passed instead of models).
    """
    base_models = (svm_model, log_reg_model, tree_model)
    for model in base_models:
        if not hasattr(model, 'predict'):
            raise TypeError(
                "stacking_ensemble expects fitted models with a predict() method, "
                f"got {type(model).__name__}"
            )

    # The base models index their input positionally (e.g. X[:, j]), which a
    # DataFrame does not support, so features are converted to plain arrays.
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    # One column of base-model predictions per model, for each split.
    # NOTE(review): if X_train is the data the base models were fitted on, the
    # meta-model learns from in-sample predictions; out-of-fold predictions
    # would avoid that optimism — confirm the intended protocol.
    X_meta_train = np.column_stack([model.predict(X_train) for model in base_models])
    X_meta_test = np.column_stack([model.predict(X_test) for model in base_models])

    # Meta-model (Logistic Regression)
    meta_model = MetaLogisticRegression()
    meta_model.fit(X_meta_train, y_train)

    y_pred_meta = meta_model.predict(X_meta_test)

    # Evaluate the ensemble model
    accuracy = accuracy_score(y_test, y_pred_meta)
    f1 = f1_score(y_test, y_pred_meta, average='weighted')

    print(f"Stacking Ensemble Accuracy: {accuracy}")
    print(f"Stacking Ensemble F1 Score: {f1}")

    return y_pred_meta

# Stack the three fitted base models. stacking_ensemble calls .predict() on each
# model argument, so the fitted estimators (svm, log_reg, tree) are passed rather
# than their earlier prediction arrays; .values hands the from-scratch models
# plain NumPy arrays, which they index positionally.
y_pred_ensemble = stacking_ensemble(X_train.values, X_test.values, y_train, y_test,
                                    svm, log_reg, tree)
