In [2]:
import pandas as pd
import plotly.express as px
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
survey_data= pd.read_excel("survey data airline.xlsx")
survey_data.head(1)

Unnamed: 0,Company Name,Age,City,Destination,Take off time?,The period of delay? If found,Was it checked due to the delay?,Reasons for flight delay?,Access time?,How satisfied are you?
0,Fly Nas,18-30,القصيم,مكة,11:00:00,لايوجد,لا,لم يتم ذكر الأسباب,01:30:00,رائع


# clean & process

In [4]:
survey= survey_data.copy()

In [5]:
survey.isnull().sum()
survey=survey.dropna()

In [6]:
survey['Take off time?'] = pd.to_datetime(survey['Take off time?'], format='%H:%M:%S')
survey['Access time?'] = pd.to_datetime(survey['Access time?'], format='%H:%M:%S')
survey["Was it checked due to the delay?"]=survey["Was it checked due to the delay?"].replace({"لا":0, "نعم":1})
survey["The period of delay? If found"]=survey["The period of delay? If found"].replace({"لايوجد":1,
                                   "نصف ساعة إلى ساعة":2,
                                   "ساعتان إلى أربع ساعات":3, 
                                   "أكثر من أربع ساعات":4, 
                                   "تأجيل الرحلة إلى يوم اخر":5})

survey['Take off Hour'] = pd.to_datetime(survey['Take off time?']).dt.hour
survey['Access Minute'] = pd.to_datetime(survey['Access time?']).dt.minute
survey['Access Hour'] = pd.to_datetime(survey['Access time?']).dt.hour
survey['Take off Minute'] = pd.to_datetime(survey['Take off time?']).dt.minute
survey['Company Name']=survey['Company Name'].replace({"Fly Nas":1,
                                   "Saudi Airline":2,
                                   "Fly Adeal":3, 
                                   "SaudiGulf Airline":4})


  survey["Was it checked due to the delay?"]=survey["Was it checked due to the delay?"].replace({"لا":0, "نعم":1})
  survey["The period of delay? If found"]=survey["The period of delay? If found"].replace({"لايوجد":1,
  survey['Company Name']=survey['Company Name'].replace({"Fly Nas":1,


# ML

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# RandomForst

In [8]:
X = survey[['Take off Hour','Access Hour']]
y = survey["Was it checked due to the delay?"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model1 = RandomForestClassifier()
model1.fit(X_train, y_train.ravel())

y_pred = model1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5862068965517241


  model1.fit(X_train, y_train.ravel())


# GridSearchCV tech for modeling

In [9]:
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()

#  الهيبرباراميتر 
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# select stratuge of evaluation
cv = 3

#  Grid Search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

#preduction
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# evaluation model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.5737069947596263
Accuracy: 0.6379310344827587


# RandomForestClassifier = Random search tech

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

#  الهيبرباراميتر 
param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

cv = 3

#  Random Search
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=100, cv=cv, scoring='accuracy', verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Parameters: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10}
Best Score: 0.5781499202551834
Accuracy: 0.5689655172413793


# SVM 

In [11]:
from sklearn.svm import SVC
X=survey[["Access Minute","Take off Hour"]]
y=survey['Company Name']
# Assuming 'X' contains your feature variables and 'y' contains the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Training the SVM Model
svm_model = SVC(kernel='rbf', gamma='auto')  # You can choose different kernel functions based on your data
svm_model.fit(X_train, y_train)

# 4. Model Evaluation
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.7241379310344828


# LogisticRegression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# تحديد المتغيرات المستقلة (الميزات) والمتغير المعتمد (الهدف)
X=survey[["Access Hour","Take off Hour"]]
y=survey['Company Name']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7241379310344828


# KNeighborsClassifier

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X=survey[["Take off Hour"]]
y=survey['Company Name']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = KNeighborsClassifier(n_neighbors=5)  

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.603448275862069


In [15]:
survey.to_excel("survey.xlsx",index=False)