In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Read the training split into the working DataFrame used by every later cell.
TRAIN_CSV_PATH = 'train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Print a banner, then the summary that DataFrame.info() writes to stdout
# for the frame named `df`.
info_banner = "--- DataFrame Information (Column Names, Non-Null Count, and Dtype) ---"
print(info_banner)
df.info()

In [None]:
# Capture the column labels as they currently stand, then print them as a list.
column_names = df.columns.tolist()
print("\n--- List of Column Names (Original) ---")
print(column_names)

In [None]:
# Report the dtype pandas holds for each column.
column_dtypes = df.dtypes
print("\n--- Data Types (Dtypes) per Column ---")
print(column_dtypes)

In [None]:
# Normalise header names by removing leading/trailing whitespace.
df.columns = df.columns.str.strip()

# Encode the target as 1/0. Labels that are not keys of this mapping
# become NaN, because Series.map leaves unmatched values unmapped.
target_encoding = {'satisfied': 1, 'neutral or dissatisfied': 0}
df['is_satisfied'] = df['satisfaction'].map(target_encoding)

# Columns that are imputed here and scaled in a later cell.
numerical_cols = ['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']

# Median imputation: only columns that actually contain missing values are
# touched, and each imputation is reported.
print("--- Fixing Missing Values ---")
for col in numerical_cols:
    column = df[col]
    if not column.isnull().any():
        continue
    median_value = column.median()
    # Assign back rather than mutating in place.
    df[col] = column.fillna(median_value)
    print(f"Imputed {col} with the median ({median_value}).")

In [None]:
print("\n--- 3. Numerical Columns After Scaling (MinMaxScaler) ---")

# Fit a MinMaxScaler on the numerical columns and overwrite those columns
# with the scaled values. `scaler` is kept so the fitted parameters stay
# available to later cells.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

# Preview the first rows of the scaled columns.
scaled_preview = df[numerical_cols].head()
print(scaled_preview)

In [None]:
# A. Correlation with Satisfaction (Pandas)
print("\n--- 4. Correlation with satisfaction ---")

# Target first, followed by the features it is compared against.
feature_cols = [
    'Age', 'Flight Distance',
    'Inflight wifi service', 'Cleanliness', 'Departure Delay in Minutes',
]
correlation_cols = ['is_satisfied'] + feature_cols

# Pairwise correlation (DataFrame.corr defaults) over the selected columns.
correlation_matrix = df[correlation_cols].corr()

# Show only the target's column, strongest positive correlation first.
target_correlations = correlation_matrix['is_satisfied']
print(target_correlations.sort_values(ascending=False))

In [None]:
# B. T-test on Delays (SciPy Stats)
# Compares mean arrival delay between satisfied (is_satisfied == 1) and
# dissatisfied (is_satisfied == 0) passengers with an independent two-sample
# t-test that does not assume equal variances (equal_var=False).
ALPHA = 0.05  # significance level used for the printed conclusion

delay_col = 'Arrival Delay in Minutes'
satisfied_delay = df.loc[df['is_satisfied'] == 1, delay_col]
dissatisfied_delay = df.loc[df['is_satisfied'] == 0, delay_col]

# Perform independent two-sample T-test
t_stat, p_value = stats.ttest_ind(satisfied_delay, dissatisfied_delay, equal_var=False)

print("\n--- T-test (Satisfied vs. Dissatisfied Arrival Delay) ---")
print(f"T-Statistic: {t_stat:.2f}")
print(f"P-Value: {p_value:.5f}")

# The conclusion must follow from the computed p-value instead of being
# printed unconditionally.
if np.isnan(p_value):
    print("Conclusion: the test could not be evaluated (P-Value is NaN); check the inputs for missing or empty groups.")
elif p_value < ALPHA:
    print(f"Conclusion: P-Value < {ALPHA} suggests a statistically significant difference in mean delay.")
else:
    print(f"Conclusion: P-Value >= {ALPHA}; no statistically significant difference in mean delay was detected.")

In [None]:
# Configure plot style
# Only the seaborn style is set here. No figure is opened in this cell:
# under Jupyter's inline backend a figure created in one cell is rendered
# (empty) and closed when that cell finishes, so plots drawn in later cells
# would never land on it.
sns.set_style("whitegrid")

In [None]:
# Mean of the 0/1 target per customer type, i.e. the share of satisfied rows.
# This series was referenced below without being defined anywhere in the
# notebook, which fails on a fresh kernel.
# NOTE(review): assumes the dataset has a 'Customer Type' column (implied by
# the plot title and axis label) — confirm against df.columns.
satisfaction_by_loyalty = df.groupby('Customer Type')['is_satisfied'].mean()

# Open a dedicated figure so the plot does not depend on a figure created in
# another cell.
plt.figure(figsize=(8, 5))

# Create a bar plot; hue mirrors x (with the legend disabled) so that the
# palette is applied per bar.
sns.barplot(
    x=satisfaction_by_loyalty.index,
    y=satisfaction_by_loyalty.values * 100,
    hue=satisfaction_by_loyalty.index,
    palette="viridis",
    legend=False
)

# Set the title and labels
plt.title('Satisfaction Rate by Customer Type', fontsize=14)
plt.xlabel('Customer Type', fontsize=12)
plt.ylabel('Mean Satisfaction Rate (%)', fontsize=12)

plt.xticks(rotation=15, ha='right')
# Leave 10% headroom above the tallest bar.
plt.ylim(0, satisfaction_by_loyalty.values.max() * 100 * 1.1)

plt.tight_layout()
plt.show()

In [None]:
# B. Heatmap: Feature Correlation
# Draw on a dedicated figure. The previous plt.subplot(1, 3, 2) assumed a
# three-panel figure created in another cell; in a fresh figure it confines
# the annotated matrix to one third of the canvas.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Feature Correlation Heatmap')

plt.tight_layout()
plt.show()

In [None]:
# C. KDE Plot: Delay Distribution by Satisfaction
# Draw on a dedicated figure instead of plt.subplot(1, 3, 3), which assumed a
# three-panel figure created in another cell.
plt.figure(figsize=(8, 5))
sns.kdeplot(data=df, x='Departure Delay in Minutes', hue='satisfaction', fill=True, common_norm=False)
plt.title('Departure Delay Distribution by satisfaction (Scaled)')
# Zoom in on the low end of the scaled delay axis.
plt.xlim(-0.1, 0.5)

# The legend seaborn builds from `hue` (titled after the hue variable) is
# kept as is. Replacing it via plt.legend(labels=...) pairs labels with
# artists purely by order, which can attach the wrong class name to a curve.

plt.tight_layout()
plt.show()

In [None]:
# Departure delay as it currently sits in the frame. The bin edges below are
# written for the min-max scaled values produced by the scaling cell.
delay_scaled = df['Departure Delay in Minutes']
max_scaled_delay = delay_scaled.max()

# Three buckets on the scaled axis: (lowest .. 0.0001], (0.0001 .. 0.01],
# (0.01 .. max]. The slightly negative first edge together with
# include_lowest=True keeps exact zeros inside the first bucket.
delay_labels_fixed = ['No Delay', 'Minor Delay (Scaled)', 'Major Delay (Scaled)']
delay_bins_fixed = [-1e-5, 0.0001, 0.01, max_scaled_delay]

df['Departure Delay Category'] = pd.cut(
    delay_scaled,
    bins=delay_bins_fixed,
    labels=delay_labels_fixed,
    right=True,
    include_lowest=True,
)

# Mean of the 0/1 target per bucket, expressed as a percentage.
# observed=True restricts the result to categories that occur in the data.
category_means = df.groupby('Departure Delay Category', observed=True)['is_satisfied'].mean()
satisfaction_by_delay_category = category_means * 100

print("Calculated Satisfaction Rates by Delay Category:")
print(satisfaction_by_delay_category)

In [None]:
plt.figure(figsize=(8, 5))
sns.set_style("whitegrid")

# One bar per delay category; bar height is the satisfaction percentage.
# hue repeats the categories (legend disabled) so the palette colours each bar.
delay_categories = satisfaction_by_delay_category.index
delay_ax = sns.barplot(
    x=delay_categories,
    y=satisfaction_by_delay_category.values,
    hue=delay_categories,
    palette="magma",
    legend=False,
)

# Title and axis labels
delay_ax.set_title('Satisfaction Rate by Departure Delay Severity', fontsize=14)
delay_ax.set_xlabel('Departure Delay Category', fontsize=12)
delay_ax.set_ylabel('Mean Satisfaction Rate (%)', fontsize=12)

# Tilt the category labels slightly for readability
plt.xticks(rotation=15, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Data-cleaning obstacles paired with a difficulty score on a 1-5 scale.
# The scores are subjective and exist only for visualization.
obstacle_scores = [
    ('Handling Ordinal vs. Categorical Variables', 5),
    ('Missing Data Imputation Strategy', 4),
    ('Outliers in Delay Columns', 4),
    ('Inconsistent Column Names and Spacing', 3),
    ('Data Type Coercion and Errors', 3),
]

# Column-oriented form of the pairs above.
obstacles_data = {
    'Obstacle': [label for label, _ in obstacle_scores],
    'Difficulty_Score': [score for _, score in obstacle_scores],
}

df_obstacles = pd.DataFrame(obstacles_data)

# Only the obstacle names are printed, as a plain list.
print("--- List of Data Cleaning Obstacles ---")
obstacle_names = df_obstacles['Obstacle'].tolist()
print(obstacle_names)

In [None]:
# Obstacles paired with their subjective difficulty scores (1-5).
obstacle_scores = [
    ('Inconsistent Column Names and Spacing', 3),
    ('Missing Data Imputation Strategy', 4),
    ('Data Type Coercion and Errors', 3),
    ('Handling Ordinal vs. Categorical Variables', 5),
    ('Outliers in Delay Columns', 4),
]
data = {
    'Obstacle': [label for label, _ in obstacle_scores],
    'Difficulty Score (1-5)': [score for _, score in obstacle_scores],
}

# Highest score first, so the hardest obstacle is the top bar.
df_obstacles = pd.DataFrame(data).sort_values(by='Difficulty Score (1-5)', ascending=False)

# --- Visualization ---
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")

# Horizontal bars: score on x, obstacle name on y. hue repeats the obstacle
# (legend disabled) so the palette gives each bar its own colour.
ax = sns.barplot(
    data=df_obstacles,
    x='Difficulty Score (1-5)',
    y='Obstacle',
    hue='Obstacle',
    palette='Reds_d',
    legend=False,
)

# Title and axis labels; the y label is blank because the tick labels
# already carry the obstacle names.
ax.set_title('Perceived Difficulty of Data Cleaning Obstacles', fontsize=16)
ax.set_xlabel('Difficulty Score (1 = Low, 5 = High)', fontsize=12)
ax.set_ylabel('')

# Integer ticks across the 1-5 score range
plt.xticks(range(1, 6))

plt.tight_layout()

# Write the chart to disk and report the file name.
plt.savefig('cleaning_obstacles_barchart.png')
print("Bar chart saved as cleaning_obstacles_barchart.png")