In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import numpy as np

## Loading Data

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Read the survey data from the CSV file and preview its first 10 rows.
df = pd.read_csv('airline_passenger_satisfaction.csv')
df.head(n=10)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Data type of every column as loaded from the CSV.
df.dtypes

## Data Cleaning

In [None]:
# Column names, non-null counts, dtypes and memory usage in one report.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Look at the 'Arrival Delay' column (the column whose missing values are filled further down).
df['Arrival Delay']

In [None]:
# Mean arrival delay; pandas skips missing values here by default.
df['Arrival Delay'].mean()

In [None]:
# Replace missing arrival delays with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# df['Arrival Delay'] acts on a selected Series, which raises a FutureWarning in
# pandas 2.x and no longer updates `df` once Copy-on-Write is the default.
df['Arrival Delay'] = df['Arrival Delay'].fillna(df['Arrival Delay'].mean())

In [None]:
# Missing values per column again, to check the effect of the fill in the previous cell.
df.isnull().sum()

## Charts

In [None]:
# Share of each satisfaction class.
# value_counts() orders the classes by frequency, not alphabetically, so the wedge
# labels are taken from its index; a hard-coded label list would silently mislabel
# the wedges whenever the class frequencies change order.
satisfaction_counts = df['Satisfaction'].value_counts()
plt.pie(satisfaction_counts, labels=satisfaction_counts.index, autopct='%1.1f%%')
plt.title('Passenger satisfaction')
plt.show()

In [None]:
# One count plot per categorical column, placed on a 3x2 grid (five panels are used).
cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Satisfaction']
plt.figure(figsize=(15, 15))
for panel_number, column_name in enumerate(cols, start=1):
    plt.subplot(3, 2, panel_number)
    sns.countplot(data=df, x=column_name)
plt.show()

In [None]:
# Histogram of every numeric column: 20 bins each, drawn in green on one 20x20 figure.
histogram_options = dict(bins=20, figsize=(20, 20), color='green')
df.hist(**histogram_options)
plt.show()

## Column Data Encoding

In [None]:
# Names of the object-dtype (text) columns, i.e. the candidates for encoding.
df.select_dtypes(include='object').columns

In [None]:
# Distinct values of 'Gender'.
df['Gender'].unique()

In [None]:
# Distinct values of 'Customer Type'.
df['Customer Type'].unique()

In [None]:
# Distinct values of 'Type of Travel'.
df['Type of Travel'].unique()

In [None]:
# Distinct values of 'Class'.
df['Class'].unique()

In [None]:
# Distinct values of the target column 'Satisfaction'.
df['Satisfaction'].unique()

In [None]:
# Option 1: encode every text feature column as integer codes.
# A separate LabelEncoder is fitted for each column and stored in `label_encoders`,
# because a single shared encoder only remembers the last column it was fitted on.
# Keeping one per column preserves each category <-> code mapping, e.g.
# label_encoders['Class'].classes_ or label_encoders['Class'].inverse_transform(...).
# 'Satisfaction' is the prediction target and is deliberately left as text.
columns = df.select_dtypes(include='object').drop(columns='Satisfaction').columns

# NOTE: re-running this cell on an already encoded frame finds no text columns
# and leaves `label_encoders` empty; reload the data before re-running.
label_encoders = {}
for column in columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

df.head()

In [None]:
# Option 2 (disabled alternative to Option 1): explicit, hand-written category -> code mapping.
# NOTE: these codes start at 1, so they are not interchangeable with the codes produced by
# the LabelEncoder in Option 1 (which start at 0); use only one of the two options.
# df.replace({
#     'Gender': {
#         'Male': 1,
#         'Female': 2
#     },
#     'Customer Type': {
#       'First-time': 1,
#       'Returning': 2
#     },
#     'Type of Travel': {
#         'Business': 1,
#         'Personal': 2
#     },
#     'Class': {
#         'Business': 1,
#         'Economy': 2,
#         'Economy Plus': 3
#     }
# }, inplace=True)
# df.head()

In [None]:
# Column dtypes again, to check which columns are still text after the encoding step.
df.dtypes

## Additional Charts

In [None]:
# Pairwise correlations between all columns except the target, drawn as an annotated heatmap.
feature_correlations = df.drop(columns='Satisfaction').corr()
plt.figure(figsize=(16, 8))
sns.heatmap(feature_correlations, cmap='Greens', annot=True, fmt='.2f')
plt.show()

In [None]:
# Passenger counts per age, split by satisfaction.
# NOTE(review): `order` restricts the x axis to ages 7..72; passengers outside that
# range are not drawn - confirm this is intended.
sns.catplot(
    data=df,
    kind='count',
    x='Age',
    hue='Satisfaction',
    order=range(7, 73),
    height=4,
    aspect=4,
)
plt.show()

In [None]:
# Passenger counts per 'On-board Service' value, split by satisfaction.
sns.catplot(
    data=df,
    kind='count',
    x='On-board Service',
    hue='Satisfaction',
    height=4,
    aspect=4,
)
plt.show()

In [None]:
# Passenger counts per 'Gender' value (integer codes once the column is encoded), split by satisfaction.
sns.catplot(
    data=df,
    kind='count',
    x='Gender',
    hue='Satisfaction',
    height=4,
    aspect=4,
)
plt.show()

## Filtering Data

In [None]:
# Column selection by name: first five rows of a three-column subset.
df[['Gender', 'Age', 'Type of Travel']].head()

In [None]:
# Label-based slice: .loc includes both end labels, so this returns the rows labelled 2 through 5.
df.loc[2:5, ['Gender', 'Age', 'Flight Distance']]

In [None]:
# Boolean filter: passengers older than 50 (first five matches only).
df.loc[df['Age'] >50, ['Gender', 'Age', 'Flight Distance']].head()

In [None]:
# Boolean filter on the encoded 'Gender' column (first five matches only).
# NOTE(review): code 1 is presumably 'Male' under the encoding applied earlier - verify against the encoder mapping.
df.loc[df['Gender'] == 1, ['Gender', 'Age', 'Flight Distance']].head()

In [None]:
# Position-based slice: rows at positions 10-14 and columns at positions 1-6 (end positions are exclusive).
df.iloc[10:15, 1:7].head()

In [None]:
# All column labels of the frame.
df.columns

In [None]:
# Row index of the frame.
df.index

## Models

In [None]:
# Feature matrix: every column except the target 'Satisfaction'.
# NOTE(review): any identifier-like column present in the CSV is kept as a feature here - confirm that is intended.
X = df.drop(columns='Satisfaction')
X.head(3)

In [None]:
# Target vector: the satisfaction label of every passenger.
y = df['Satisfaction']
y.head(3)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fixed seeds make both the fitted tree and the train/test split - and therefore the
# accuracy computed from them - reproducible across "Restart & Run All".
model = DecisionTreeClassifier(random_state=42)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X.shape

In [None]:
# Size of the training part of the split as (rows, columns).
X_train.shape

In [None]:
# Size of the held-out test part of the split as (rows, columns).
X_test.shape

In [None]:
# Train the model on the training part of the split.
model.fit(X_train, y_train)

In [None]:
# Predicted satisfaction label for every row of the held-out test set.
predictions = model.predict(X_test)
predictions

In [337]:
# Fraction of test rows whose predicted label matches the true label.
# NOTE(review): the execution counts suggest this cell (In [337]) was last run right after the
# Logistic Regression cell (In [336]), so the stored output may not belong to the decision tree -
# re-run the notebook top to bottom to confirm.
model_score = accuracy_score(y_test, predictions)
model_score

0.821835540498922

### Random Forest

In [338]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Random forest trained on all feature columns.
# Fixed seeds make the forest and the 80/20 split deterministic, so the accuracy
# shown below can be reproduced on every run instead of varying with each execution.
model = RandomForestClassifier(random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

# Predictions for the held-out rows, scored against their true labels.
predictions = model.predict(X_test)

model_score = accuracy_score(y_test, predictions)
model_score

0.9663150600554358

### KNeighborsClassifier

In [None]:
# Disabled experiment: k-nearest-neighbours classifier evaluated with the same
# split / fit / predict / accuracy steps as the other models. Uncomment to run.
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# model = KNeighborsClassifier()

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# model.fit(X_train, y_train)

# predictions = model.predict(X_test)
# predictions

# model_score = accuracy_score(y_test, predictions)
# model_score

### Logistic Regression

In [336]:
# Disabled experiment: logistic regression (max_iter raised to 1000) evaluated with the
# same split / fit / predict / accuracy steps as the other models. Uncomment to run.
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# model = LogisticRegression(max_iter=1000)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# model.fit(X_train, y_train)

# predictions = model.predict(X_test)
# predictions

# model_score = accuracy_score(y_test, predictions)
# model_score

## Prediction without Voting columns

In [340]:
# Restrict the features to passenger and flight attributes, leaving out what the
# section title calls the "voting" columns (presumably the survey rating columns).
feature_columns = [
    'Gender',
    'Age',
    'Customer Type',
    'Type of Travel',
    'Class',
    'Flight Distance',
    'Departure Delay',
    'Arrival Delay',
]
X = df[feature_columns]
X.head()

Unnamed: 0,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay
0,1,48,0,0,0,821,2,5.0
1,0,35,1,0,0,821,26,39.0
2,1,41,1,0,0,853,0,0.0
3,1,50,1,0,0,1905,0,0.0
4,0,49,1,0,0,3470,0,1.0


In [342]:
# Target vector for the reduced-feature model: the satisfaction label, kept as text.
y = df['Satisfaction']
y.head()

0    Neutral or Dissatisfied
1                  Satisfied
2                  Satisfied
3                  Satisfied
4                  Satisfied
Name: Satisfaction, dtype: object

In [343]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Random forest trained on the reduced feature set currently held in X.
# Fixed seeds make the forest and the 80/20 split deterministic, so the accuracy
# shown below - and the model saved later - can be reproduced on every run.
model = RandomForestClassifier(random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

# Predictions for the held-out rows, scored against their true labels.
predictions = model.predict(X_test)

model_score = accuracy_score(y_test, predictions)
model_score

0.7704804434862951

In [344]:
# Zero-row slice converted to a dict: every feature name mapped to an empty dict,
# which serves as a template of the inputs the model expects.
X.iloc[:0].to_dict()

{'Gender': {},
 'Age': {},
 'Customer Type': {},
 'Type of Travel': {},
 'Class': {},
 'Flight Distance': {},
 'Departure Delay': {},
 'Arrival Delay': {}}

In [351]:
# Two hand-written passengers, one tuple per passenger, in model feature order.
# The categorical values are integer codes and must match the encoding used for training.
feature_names = [
    'Gender', 'Age', 'Customer Type', 'Type of Travel',
    'Class', 'Flight Distance', 'Departure Delay', 'Arrival Delay',
]
sample_passengers = [
    (1, 35, 0, 0, 1, 1200, 0, 0),
    (0, 25, 1, 0, 1, 600, 5, 5),
]

# Transpose the passenger rows into the column-oriented dict the DataFrame is built from.
test_inputs = {
    name: list(values)
    for name, values in zip(feature_names, zip(*sample_passengers))
}

test_df = pd.DataFrame(test_inputs)
test_df

Unnamed: 0,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay
0,1,35,0,0,1,1200,0,0
1,0,25,1,0,1,600,5,5


In [352]:
# Predicted satisfaction label for each of the hand-written passengers in test_df.
model.predict(test_df)

array(['Neutral or Dissatisfied', 'Neutral or Dissatisfied'], dtype=object)

## Saving Prediction Model

In [355]:
import joblib

# Persist the fitted model to disk; joblib.dump returns the list of files it wrote.
model_path = 'airline_passenger_satisfaction.joblib'
joblib.dump(model, model_path)

['airline_passenger_satisfaction.joblib']

In [357]:
# Two hand-written passengers, one tuple per passenger, in model feature order;
# rebuilt here so the reloaded model can be checked on known inputs.
# The categorical values are integer codes and must match the encoding used for training.
feature_names = [
    'Gender', 'Age', 'Customer Type', 'Type of Travel',
    'Class', 'Flight Distance', 'Departure Delay', 'Arrival Delay',
]
sample_passengers = [
    (1, 35, 0, 0, 1, 1200, 0, 0),
    (0, 25, 1, 0, 1, 600, 5, 5),
]

# Transpose the passenger rows into the column-oriented dict the DataFrame is built from.
test_inputs = {
    name: list(values)
    for name, values in zip(feature_names, zip(*sample_passengers))
}

test_df = pd.DataFrame(test_inputs)
test_df

Unnamed: 0,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay
0,1,35,0,0,1,1200,0,0
1,0,25,1,0,1,600,5,5


In [358]:
# Reload the saved model from disk and predict for the sample passengers in test_df.
# Only load joblib files you created or trust: unpickling can execute arbitrary code.
trained_model = joblib.load('airline_passenger_satisfaction.joblib')
reloaded_predictions = trained_model.predict(test_df)
reloaded_predictions

array(['Neutral or Dissatisfied', 'Neutral or Dissatisfied'], dtype=object)