Badanie satysfakcji klientów linii lotniczych.

W naszym zbiorze danych posiadamy informacje odnośnie do różnych czynników, które mogły mieć wpływ na zadowolenie klientów.

Dane, na których pracujemy, pochodzą stąd: https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
---


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the CSV files loaded below
# are read from the MyDrive folder under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the two comma-separated files from the mounted Drive folder.
# `low_memory=False` makes pandas parse each file in one pass instead of in chunks.
df = pd.read_csv(
    "/content/drive/MyDrive/train.csv",
    sep=",",
    low_memory=False,
)
df_test = pd.read_csv(
    "/content/drive/MyDrive/test_set.csv",
    sep=",",
    low_memory=False,
)

In [None]:
# Remove the `id` and `col_0` columns and keep the result: `DataFrame.drop` returns a
# new frame, so without the assignment `df` would be left unchanged.
# `errors="ignore"` keeps the cell safe to re-run once the columns are already gone.
df = df.drop(columns=["id", "col_0"], errors="ignore")

# Show a short preview instead of rendering the whole frame.
df.head()

Unnamed: 0,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,...,inflight_entertainment,on_board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25.0,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1.0,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0.0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11.0,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0.0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,Female,disloyal Customer,23,Business travel,Eco,192,2,1,2,3,...,2,3,1,4,2,3,2,3.0,0.0,neutral or dissatisfied
103900,Male,Loyal Customer,49,Business travel,Business,2347,4,4,4,4,...,5,5,5,5,5,5,4,0.0,0.0,satisfied
103901,Male,disloyal Customer,30,Business travel,Business,1995,1,1,1,3,...,4,3,2,4,5,5,4,7.0,14.0,neutral or dissatisfied
103902,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,1,5,...,1,4,5,1,5,4,1,0.0,0.0,neutral or dissatisfied


In [None]:
# Value used to fill missing arrival delays (presumably the column mean — TODO confirm).
ARRIVAL_DELAY_FILL_VALUE = 15.178
DELAY_COLUMNS = ["departure_delay_in_minutes", "arrival_delay_in_minutes"]

# Guard so that re-running the cell after the raw delay columns were dropped is a no-op.
if set(DELAY_COLUMNS).issubset(df.columns):
    arrival_delay = df["arrival_delay_in_minutes"].fillna(ARRIVAL_DELAY_FILL_VALUE)
    # Combine both delays into one feature, then drop the raw columns. The result is
    # assigned back because `DataFrame.drop` does not modify the frame in place.
    df = (
        df.assign(delay_sum=df["departure_delay_in_minutes"] + arrival_delay)
          .drop(columns=DELAY_COLUMNS)
    )

# Show a short preview instead of rendering the whole frame.
df.head()

Unnamed: 0,col_0,id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,...,seat_comfort,inflight_entertainment,on_board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,satisfaction,delay_sum
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,5,4,3,4,4,5,5,neutral or dissatisfied,43.0
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,1,5,3,1,4,1,neutral or dissatisfied,7.0
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,5,4,3,4,4,4,5,satisfied,0.0
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,2,5,3,1,4,2,neutral or dissatisfied,20.0
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,5,3,3,4,4,3,3,3,satisfied,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,...,2,2,3,1,4,2,3,2,neutral or dissatisfied,3.0
103900,103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,...,5,5,5,5,5,5,5,4,satisfied,0.0
103901,103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,...,5,4,3,2,4,5,5,4,neutral or dissatisfied,21.0
103902,103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,...,1,1,4,5,1,5,4,1,neutral or dissatisfied,0.0


In [None]:
# Columns whose values are added up row by row into a single `survey_sum` feature.
columns_to_sum = [
    'inflight_wifi_service',
    'departure_arrival_time_convenient',
    'ease_of_online_booking',
    'gate_location',
    'food_and_drink',
    'online_boarding',
    'seat_comfort',
    'inflight_entertainment',
    'on_board_service',
    'leg_room_service',
    'baggage_handling',
    'checkin_service',
    'inflight_service',
    'cleanliness',
]

# Row-wise total across the listed columns.
df['survey_sum'] = df.loc[:, columns_to_sum].sum(axis='columns')

In [None]:
# Keep only rows whose flight distance lies in [31, 3748] (both bounds inclusive).
df = df.loc[df['flight_distance'].between(31, 3748)]

# Keep only rows whose combined delay lies in [0, 144] (both bounds inclusive).
# Rows with a missing `delay_sum` fail the check and are removed as well.
df = df.loc[df['delay_sum'].between(0, 144)]

In [None]:
# Column-wise preprocessing: each entry is (step name, transformer, columns it applies to).
# Columns not listed in any entry are dropped (ColumnTransformer's default `remainder`).
transform = ColumnTransformer(
    [
        ("minMax", MinMaxScaler(), ["delay_sum", "survey_sum"]),
        ("standardScaler", StandardScaler(), ["age", "flight_distance"]),
        ("dummy", OneHotEncoder(), ["customer_type", "type_of_travel", "class", "gender"]),
    ]
)

In [None]:



def split(df):
    """Separate a frame into model inputs and the target column.

    Args:
        df: DataFrame containing the eight feature columns listed below and a
            ``satisfaction`` column.

    Returns:
        A ``(x, y)`` tuple: ``x`` is a DataFrame with the feature columns in the
        order listed, ``y`` is the ``satisfaction`` Series.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    feature_columns = [
        "customer_type",
        "type_of_travel",
        "class",
        "gender",
        "age",
        "flight_distance",
        "delay_sum",
        "survey_sum",
    ]
    features = df[feature_columns]
    target = df["satisfaction"]
    return features, target

In [None]:
# Separate the model inputs from the target column.
X, y = split(df)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
# Chain preprocessing and the classifier so both are fitted together as one object.
pipeline = Pipeline(
    steps=[
        ("preprocessor", transform),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# `fit` returns the fitted pipeline itself, so `pip` and `pipeline` are the same object.
pip = pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out rows.
y_pred = pip.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# Confusion matrix: rows are true labels, columns are predicted labels.
print("Macierz pomyłek:", conf_matrix, sep="\n")

# Print the classification report (per-class precision, recall and F1).
print(classification_report(y_test, y_pred))

Macierz pomyłek:
[[9881 1140]
 [1754 6447]]
                         precision    recall  f1-score   support

neutral or dissatisfied       0.85      0.90      0.87     11021
              satisfied       0.85      0.79      0.82      8201

               accuracy                           0.85     19222
              macro avg       0.85      0.84      0.84     19222
           weighted avg       0.85      0.85      0.85     19222



In [None]:
# Persist the fitted pipeline to disk; the path is relative to the current working directory.
# The call returns the list of written file names, which the cell displays.
joblib.dump(value=pip, filename='suml_model.pkl')

['suml_model.pkl']