Badanie satysfakcji klientow linii lotniczych.

W naszym zbiorze danych posiadamy informacje odnosnie roznych czynnikow, ktore mogly miec wplyw na zadowolenie klientow.

Dane, na ktorych pracujemy, pochodza stad: https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
---


In [84]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder


In [85]:
df = pd.read_csv('train.csv', delimiter=',', low_memory=False)
df_test = pd.read_csv('test.csv', delimiter=',', low_memory=False)
print(df.size)
df = df.iloc[:60000,:]
print(df.size)

2597600
1500000


In [86]:
df.rename(columns=lambda x: x.lower().replace(' ', '_'), inplace=True)

In [87]:
df.drop(["unnamed:_0", "id"], axis=1)

Unnamed: 0,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure/arrival_time_convenient,ease_of_online_booking,gate_location,...,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,Male,Loyal Customer,45,Business travel,Eco,1592,1,5,2,5,...,1,2,5,2,4,3,1,0,0.0,neutral or dissatisfied
59996,Male,Loyal Customer,26,Business travel,Business,2161,5,4,4,4,...,5,3,2,4,2,3,5,0,0.0,neutral or dissatisfied
59997,Female,Loyal Customer,26,Personal Travel,Eco,293,2,1,2,4,...,2,4,5,3,4,3,2,11,18.0,neutral or dissatisfied
59998,Male,Loyal Customer,17,Business travel,Eco Plus,405,5,5,5,5,...,5,4,3,1,3,3,5,0,0.0,satisfied


In [88]:
df['arrival_delay_in_minutes'] = df['arrival_delay_in_minutes'].fillna(15.178)
df['delay_sum'] = df['departure_delay_in_minutes'] + df['arrival_delay_in_minutes']
df.drop(["arrival_delay_in_minutes", "departure_delay_in_minutes"], axis=1)

Unnamed: 0,unnamed:_0,id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure/arrival_time_convenient,...,seat_comfort,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,satisfaction,delay_sum
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,5,4,3,4,4,5,5,neutral or dissatisfied,43.0
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,1,5,3,1,4,1,neutral or dissatisfied,7.0
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,5,4,3,4,4,4,5,satisfied,0.0
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,2,5,3,1,4,2,neutral or dissatisfied,20.0
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,5,3,3,4,4,3,3,3,satisfied,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,59995,70248,Male,Loyal Customer,45,Business travel,Eco,1592,1,5,...,1,1,2,5,2,4,3,1,neutral or dissatisfied,0.0
59996,59996,58710,Male,Loyal Customer,26,Business travel,Business,2161,5,4,...,5,5,3,2,4,2,3,5,neutral or dissatisfied,0.0
59997,59997,5295,Female,Loyal Customer,26,Personal Travel,Eco,293,2,1,...,2,2,4,5,3,4,3,2,neutral or dissatisfied,29.0
59998,59998,103751,Male,Loyal Customer,17,Business travel,Eco Plus,405,5,5,...,4,5,4,3,1,3,3,5,satisfied,0.0


In [89]:
columns_to_sum = [
    'inflight_wifi_service', 'departure/arrival_time_convenient', 'ease_of_online_booking',
    'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
    'inflight_entertainment', 'on-board_service', 'leg_room_service', 'baggage_handling',
    'checkin_service', 'inflight_service', 'cleanliness'
]

df['survey_sum'] = df[columns_to_sum].sum(axis=1)

In [90]:
df = df.loc[df['flight_distance'] >= 31]
df = df.loc[df['flight_distance'] <= 3748]

df = df.loc[df['delay_sum'] >= 0]
df = df.loc[df['delay_sum'] <= 144]

In [91]:
transform = ColumnTransformer(
    transformers= [
      ('minMax', MinMaxScaler(), ["delay_sum", "survey_sum"]),
      ('standardScaler', StandardScaler(), ["age", "flight_distance"]),
      ('dummy', OneHotEncoder(), ["customer_type", "type_of_travel", "class", "gender"])
    ]

)

In [92]:
def split(df):
  x = df[["customer_type", "type_of_travel", "class", "gender","age", "flight_distance", "delay_sum", "survey_sum"]]
  y = df['satisfaction']
  return x, y

In [93]:
X, y = split(df)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [94]:
pipeline = Pipeline(
    [
        ('preprocessor', transform),
        ('classifier', RandomForestClassifier(random_state=42))
    ]
)



pip = pipeline.fit(X_train, y_train)

In [95]:
y_pred = pip.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Macierz pomyłek:")
print(conf_matrix)
# Wyświetlenie raportu klasyfikacji
print(classification_report(y_test, y_pred))

Macierz pomyłek:
[[5677  635]
 [1077 3717]]
                         precision    recall  f1-score   support

neutral or dissatisfied       0.84      0.90      0.87      6312
              satisfied       0.85      0.78      0.81      4794

               accuracy                           0.85     11106
              macro avg       0.85      0.84      0.84     11106
           weighted avg       0.85      0.85      0.84     11106



In [96]:
joblib.dump(pip, 'suml_model.pkl')

['suml_model.pkl']