In [None]:
import pandas as pd
import numpy as np

df=pd.read_csv('train.csv')
df

In [None]:
df['satisfaction'].value_counts()

In [None]:
df['Customer Type'].value_counts()

In [None]:
dfSatisfied=df[df['satisfaction'] == 'satisfied']
dfDisatisfied=df[df['satisfaction'] == 'neutral or dissatisfied']

In [None]:
dfSatisfied.describe()

In [None]:
dfDisatisfied.describe()

In [None]:
dfSatisfied

In [None]:
def clean_airloops_data(df):
    # 1. Handle Missing Values: Fill 'Arrival Delay in Minutes' with the median
    median_delay = df['Arrival Delay in Minutes'].median()
    df['Arrival Delay in Minutes'].fillna(median_delay, inplace=True)

    # Note: If 'Inflight wifi service' has nulls (which it often does),
    # we can fill those with 0, assuming 'No Service' or 'Not Applicable'.
    if 'Inflight wifi service' in df.columns and df['Inflight wifi service'].isnull().any():
        df['Inflight wifi service'].fillna(0, inplace=True)

    # 2. Drop Unnecessary Columns
    cols_to_drop = ['Unnamed: 0']
    df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    # 3. Create the Target Variable (satisfaction_binary)
    # Using .map() is a simple, clear way to do binary encoding.
    satisfaction_map = {'satisfied': 1, 'neutral or dissatisfied': 0}
    df['satisfaction_binary'] = df['satisfaction'].map(satisfaction_map)
    df.drop(columns=['satisfaction'], inplace=True)

    # 4. Format Column Names (Lowercase and Underscore)
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    print("\nData cleaning complete. Ready for analysis!")
    return df

# Apply the function to your DataFrame
df_cleaned = clean_airloops_data(df.copy())

In [None]:
# 1. Check Class Distribution (How many passengers in each class?)
print("--- Class Distribution ---")
print(df_cleaned['class'].value_counts())
print("-" * 30)

# 2. Calculate Mean Satisfaction by Class
# The mean of 'satisfaction_binary' is the percentage of satisfied customers (1s)
satisfaction_by_class = df_cleaned.groupby('class')['satisfaction_binary'].mean().sort_values(ascending=False)

print("\n--- Average Satisfaction by Class ---")
print(satisfaction_by_class)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# The results you provided, stored in a DataFrame for plotting
satisfaction_data = {
    'Class': ['Business', 'Eco Plus', 'Eco'],
    'Average_Satisfaction': [0.694251, 0.246064, 0.186138]
}
plot_df = pd.DataFrame(satisfaction_data)

# Sort the data for a cleaner visual presentation (highest to lowest)
plot_df = plot_df.sort_values(by='Average_Satisfaction', ascending=False)

# Create the plot
plt.figure(figsize=(8, 5))
bars = plt.bar(plot_df['Class'], plot_df['Average_Satisfaction'], color=['#1f77b4', '#ff7f0e', '#2ca02c']) # Custom colors for visual separation

# Add labels and title
plt.title('Average Passenger Satisfaction by Class', fontsize=14)
plt.xlabel('Travel Class', fontsize=12)
plt.ylabel('Average Satisfaction (Percentage)', fontsize=12)
plt.yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], ['0%', '20%', '40%', '60%', '80%', '100%']) # Format y-axis as percentages

# Add the percentage values on top of the bars for clarity
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f'{yval:.1%}', ha='center', va='bottom', fontsize=10)

plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()

# Save the plot
plt.savefig('satisfaction_by_class_bar_chart.png')
print("Plot saved as 'satisfaction_by_class_bar_chart.png'")

In [None]:
#cleaned DataFrame is named 'df_cleaned'

# 1. Check Customer Type Distribution
print("--- Customer Type Distribution ---")
print(df_cleaned['customer_type'].value_counts())
print("-" * 30)

# 2. Calculate Mean Satisfaction by Customer Type
# The mean of 'satisfaction_binary' is the percentage of satisfied customers (1s)
satisfaction_by_customer_type = df_cleaned.groupby('customer_type')['satisfaction_binary'].mean().sort_values(ascending=False)

print("\n--- Average Satisfaction by Customer Type ---")
print(satisfaction_by_customer_type)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# The results you provided, stored in a DataFrame for plotting
satisfaction_data = {
    'Customer_Type': ['Loyal Customer', 'Disloyal Customer'],
    'Average_Satisfaction': [0.477291, 0.236658]
}
plot_df_h2 = pd.DataFrame(satisfaction_data)

# Sort the data for presentation
plot_df_h2 = plot_df_h2.sort_values(by='Average_Satisfaction', ascending=False)

# Create the plot
plt.figure(figsize=(8, 5))
bars = plt.bar(plot_df_h2['Customer_Type'], plot_df_h2['Average_Satisfaction'], color=['#3a5e8c', '#a63a50'])

# Add labels and title
plt.title('Average Passenger Satisfaction: Loyal vs. Disloyal', fontsize=14)
plt.xlabel('Customer Type', fontsize=12)
plt.ylabel('Average Satisfaction (Percentage)', fontsize=12)
plt.yticks([0.0, 0.1, 0.2, 0.3, 0.4, 0.5], ['0%', '10%', '20%', '30%', '40%', '50%'])

# Add the percentage values on top of the bars
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f'{yval:.1%}', ha='center', va='bottom', fontsize=10)

plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()

# Save the plot
plt.savefig('satisfaction_by_customer_type_bar_chart.png')
print("Plot saved as 'satisfaction_by_customer_type_bar_chart.png'")

In [None]:
# Assuming your cleaned DataFrame is named 'df_cleaned'

# 1. Identify Service Columns (The columns with rating scores or delay metrics)
# We include all rating columns and the delay columns.
service_columns = [
    'inflight_wifi_service',
    'departure/arrival_time_convenient',
    'ease_of_online_booking',
    'gate_location',
    'food_and_drink',
    'online_boarding',
    'seat_comfort',
    'inflight_entertainment',
    'on-board_service',
    'leg_room_service',
    'baggage_handling',
    'checkin_service',
    'inflight_service',
    'cleanliness',
    'departure_delay_in_minutes',
    'arrival_delay_in_minutes'
]

# 2. Calculate Correlation with satisfaction_binary and sort the results
# The Pearson correlation coefficient (r) is used here.
correlation_results = df_cleaned[service_columns + ['satisfaction_binary']].corr()['satisfaction_binary'].sort_values(ascending=False)

# Remove the correlation of the target variable with itself (which is always 1)
correlation_results = correlation_results.drop('satisfaction_binary')

print("--- Correlation of Service Features with Satisfaction (r value) ---")
print(correlation_results)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# The results you provided, stored in a Series for plotting
data = {
    'online_boarding': 0.503557,
    'inflight_entertainment': 0.398059,
    'seat_comfort': 0.349459,
    'on-board_service': 0.322383,
    'leg_room_service': 0.313131,
    'cleanliness': 0.305198,
    'inflight_wifi_service': 0.284245,
    'baggage_handling': 0.247749,
    'inflight_service': 0.244741,
    'checkin_service': 0.236174,
    'food_and_drink': 0.209936,
    'ease_of_online_booking': 0.171705,
    'gate_location': 0.000682,
    'departure_delay_in_minutes': -0.050494,
    'departure/arrival_time_convenient': -0.051601,
    'arrival_delay_in_minutes': -0.057435
}

correlation_results = pd.Series(data)

# Create the plot
plt.figure(figsize=(10, 7))

# Define colors: Green for positive correlation, Red for negative
colors = ['g' if x > 0 else 'r' for x in correlation_results]

# Plot the horizontal bar chart
correlation_results.plot(kind='barh', color=colors)

# Add labels and title
plt.title('Correlation (r) of Service Features with Passenger Satisfaction', fontsize=14)
plt.xlabel('Pearson Correlation Coefficient (r)', fontsize=12)
plt.ylabel('Service Feature', fontsize=12)

# Add a vertical line at r=0 for reference
plt.axvline(0, color='gray', linestyle='--', linewidth=0.8)

plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()

# Save the plot
plt.savefig('service_correlation_bar_chart.png')
print("Plot saved as 'service_correlation_bar_chart.png'")



In [None]:
def analyze_service_correlations_notebook(df):
    """
    Calcula el coeficiente de correlación de Pearson (r) entre
    las columnas de servicio y la variable objetivo satisfaction_binary.
    """
    # Lista de columnas de servicio que ya usaste en tu análisis H3
    service_columns = [
        'inflight_wifi_service',
        'departure/arrival_time_convenient',
        'ease_of_online_booking',
        'gate_location',
        'food_and_drink',
        'online_boarding',
        'seat_comfort',
        'inflight_entertainment',
        'on-board_service',
        'leg_room_service',
        'baggage_handling',
        'checkin_service',
        'inflight_service',
        'cleanliness',
        'departure_delay_in_minutes',
        'arrival_delay_in_minutes'
    ]

    # Calcular la correlación
    correlation_results = df[service_columns + ['satisfaction_binary']].corr()['satisfaction_binary'].sort_values(ascending=False)

    # Eliminar la correlación de la variable objetivo consigo misma
    correlation_results = correlation_results.drop('satisfaction_binary', errors='ignore')
    
    return correlation_results

In [None]:
# --- 1. Filtrar Data por Clase ---

# Filtrar para Clase Economy
df_eco = df_cleaned[df_cleaned['class'] == 'Eco'].copy()
print(f"Pasajeros Clase Eco para análisis: {len(df_eco)}")

# Filtrar para Clase Eco Plus
df_ecoplus = df_cleaned[df_cleaned['class'] == 'Eco Plus'].copy()
print(f"Pasajeros Clase Eco Plus para análisis: {len(df_ecoplus)}")

print("-" * 30)

# --- 2. Calcular Correlaciones Segmentadas ---

# Análisis de Correlación para CLASE ECO
corr_eco = analyze_service_correlations_notebook(df_eco)
print("\n--- TOP 5 IMPULSORES para CLASE ECO ---")
print(corr_eco.head(5))

print("-" * 30)

# Análisis de Correlación para CLASE ECO PLUS
corr_ecoplus = analyze_service_correlations_notebook(df_ecoplus)
print("\n--- TOP 5 IMPULSORES para CLASE ECO PLUS ---")
print(corr_ecoplus.head(5))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os # Import the os library

# --- Data Compilation ---
general_corr = {
    'online_boarding': 0.504,
    'inflight_entertainment': 0.398,
    'seat_comfort': 0.349,
    'inflight_wifi_service': 0.284, 
    'on-board_service': 0.322
}
eco_corr = {
    'inflight_wifi_service': 0.467,
    'online_boarding': 0.316,
    'ease_of_online_booking': 0.233,
    'inflight_entertainment': 0.182,
    'food_and_drink': 0.141
}
ecoplus_corr = {
    'inflight_wifi_service': 0.495,
    'online_boarding': 0.348,
    'inflight_entertainment': 0.328,
    'cleanliness': 0.256,
    'food_and_drink': 0.255
}

combined_df = pd.DataFrame({
    'General': general_corr,
    'Eco': eco_corr,
    'Eco Plus': ecoplus_corr
}).fillna(0).sort_values(by='Eco Plus', ascending=True)

# --- Plotting Setup ---
fig, ax = plt.subplots(figsize=(12, 8))

bar_height = 0.25
y_pos = np.arange(len(combined_df.index))

# Plot bars for the three segments
ax.barh(y_pos + bar_height, combined_df['General'], bar_height, label='General (All Classes)', color='#007AA3', alpha=0.7)
ax.barh(y_pos, combined_df['Eco'], bar_height, label='Eco', color='#FF7F0E', alpha=0.8)
ax.barh(y_pos - bar_height, combined_df['Eco Plus'], bar_height, label='Eco Plus', color='#2CA02C')

# Labels and titles
ax.set_yticks(y_pos)
ax.set_yticklabels(combined_df.index.str.replace('_', ' ').str.title())
ax.set_xlabel('Pearson Correlation Coefficient (r)', fontsize=12)
ax.set_title('Segmented Investment Strategy: Correlation of Drivers by Class', fontsize=16, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(axis='x', linestyle='--', alpha=0.5)

plt.tight_layout()

# --- FIX: Create the directory if it doesn't exist ---
output_dir = 'images'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Now save the file
plt.savefig(os.path.join(output_dir, 'segmented_investment_comparison.png'))
plt.show()

print("Segmented Investment Drivers Comparison chart generated and successfully saved to 'images/segmented_investment_comparison.png'.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# --- 1. Data Compilation ---
# Sample Sizes (Weights)
n_eco = 46745
n_ecoplus = 7494
n_total_eco_segment = n_eco + n_ecoplus

# 1. General Correlation (Top 5 from your H3 analysis)
general_corr = {
    'online_boarding': 0.504,
    'inflight_entertainment': 0.398,
    'seat_comfort': 0.349,
    'inflight_wifi_service': 0.284, 
    'on-board_service': 0.322
}

# 2. ECO and ECO PLUS RAW CORRELATIONS
eco_corr_raw = {
    'inflight_wifi_service': 0.467, 'online_boarding': 0.316, 
    'ease_of_online_booking': 0.233, 'inflight_entertainment': 0.182, 
    'food_and_drink': 0.141, 'cleanliness': 0.09 # Adding cleanliness for weighted average
}
ecoplus_corr_raw = {
    'inflight_wifi_service': 0.495, 'online_boarding': 0.348, 
    'inflight_entertainment': 0.328, 'cleanliness': 0.256, 
    'food_and_drink': 0.255, 'ease_of_online_booking': 0.18
}

# 3. CALCULATE THE WEIGHTED AVERAGE for the combined 'Economy Segment'
combined_drivers = {}
all_drivers_keys = set(eco_corr_raw.keys()) | set(ecoplus_corr_raw.keys())

for driver in all_drivers_keys:
    # Get correlation and fill NaN with 0 for drivers not in the segment
    r_eco = eco_corr_raw.get(driver, 0)
    r_ecoplus = ecoplus_corr_raw.get(driver, 0)
    
    # Weighted Average Formula: (r_eco * n_eco + r_ecoplus * n_ecoplus) / n_total
    weighted_r = (r_eco * n_eco + r_ecoplus * n_ecoplus) / n_total_eco_segment
    combined_drivers[driver] = weighted_r

# Select the top 5 for the final chart for clarity
combined_drivers_series = pd.Series(combined_drivers).sort_values(ascending=False).head(5).to_dict()

# --- 4. Final DataFrame for Plotting (General vs. Combined Economy) ---
final_combined_df = pd.DataFrame({
    'General (All Classes)': general_corr,
    'Economy Segment (Eco + Eco Plus)': combined_drivers_series
}).fillna(0).sort_values(by='Economy Segment (Eco + Eco Plus)', ascending=True)

# --- Plotting Setup ---
fig, ax = plt.subplots(figsize=(10, 7))

bar_height = 0.35
y_pos = np.arange(len(final_combined_df.index))

# Plot bars for the two segments
ax.barh(y_pos + bar_height/2, final_combined_df['General (All Classes)'], bar_height, label='General (All Classes)', color='#007AA3', alpha=0.7)
ax.barh(y_pos - bar_height/2, final_combined_df['Economy Segment (Eco + Eco Plus)'], bar_height, label='Economy Segment (Strategic Focus)', color='#DAA520')

# Labels and titles
ax.set_yticks(y_pos)
ax.set_yticklabels(final_combined_df.index.str.replace('_', ' ').str.title())
ax.set_xlabel('Pearson Correlation Coefficient (r)', fontsize=12)
ax.set_title('Investment Strategy: General vs. Combined Economy Segment Drivers', fontsize=14, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(axis='x', linestyle='--', alpha=0.5)

plt.tight_layout()

# --- SAVING THE FILE ---
output_dir = 'images'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
plt.savefig(os.path.join(output_dir, 'simplified_segmented_investment_comparison.png'))
plt.show()

print("Simplified Segmented Investment Drivers Comparison chart code generated.")
print("\n--- Top 5 Drivers for the Combined Economy Segment ---")
print(pd.Series(combined_drivers_series))