In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Airlines Flights Dataset for Different Cities**

The Flights Booking Dataset covers various airlines and was scraped date-wise from a famous website into a structured format. It contains flight travel records between cities in India, with features such as source and destination city, departure and arrival time, duration, and ticket price.
The data is available as a CSV file, which we analyze here using a pandas DataFrame.
This analysis will be helpful for those working in the airline and travel domains.

## **These are the main Features/Columns available in the dataset :**

1) Airline: The name of the airline company is stored in the airline column. It is a categorical feature having 6 different airlines.

2) Flight: Flight stores information regarding the plane's flight code. It is a categorical feature.

3) Source City: City from which the flight takes off. It is a categorical feature having 6 unique cities.

4) Departure Time: This is a derived categorical feature created by grouping time periods into bins. It stores information about the departure time and has 6 unique time labels.

5) Stops: A categorical feature with 3 distinct values that stores the number of stops between the source and destination cities.

6) Arrival Time: This is a derived categorical feature created by grouping time intervals into bins. It has six distinct time labels and keeps information about the arrival time.

7) Destination City: City where the flight will land. It is a categorical feature having 6 unique cities.

8) Class: A categorical feature that contains information on seat class; it has two distinct values: Business and Economy.

9) Duration: A continuous feature that displays the overall amount of time it takes to travel between cities in hours.

10) Days Left: This is a derived feature calculated by subtracting the booking date from the trip date.

11) Price: The target variable; it stores the ticket price.

In [None]:
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Path to the flights CSV on the Kaggle input mount.
# NOTE: despite its name, `df` holds this path string, not a DataFrame; it is what gets passed to pd.read_csv.
df="/kaggle/input/airlines-flights-data/airlines_flights_data.csv"

In [None]:
# Load the flights CSV (its path is stored in `df`) into a DataFrame.
airline_df=pd.read_csv(df)
# Preview 7 random rows; the fixed seed keeps the preview identical across re-runs.
airline_df.sample(7,random_state=42)

In [None]:
# (rows, columns) of the loaded frame.
airline_df.shape

## **EDA and Analysis**

In [None]:
# Per-column dtype and non-null count, to spot missing values and mistyped columns.
airline_df.info()

## **Descriptive Statistics**

In [None]:
def descriptive_analysis(dataset:pd.DataFrame,n_cols):
    """Build an extended summary table for the selected numeric columns.

    The table contains the rows produced by ``describe`` (with the 25/50/75/95/99
    percentiles), followed by ``skew``, ``kurtosis`` and ``outlier_count``.

    Args:
        dataset: Frame holding the data.
        n_cols: Column labels to summarize.

    Returns:
        DataFrame with one column per selected label and one row per statistic.
    """
    numeric = dataset[n_cols]

    summary = numeric.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99])
    summary.loc["skew"] = numeric.skew()
    summary.loc["kurtosis"] = numeric.kurtosis()

    # IQR rule: a value is an outlier when it lies more than 1.5 * IQR
    # below the first quartile or above the third quartile.
    lower_quartile = numeric.quantile(0.25)
    upper_quartile = numeric.quantile(0.75)
    spread = upper_quartile - lower_quartile
    below = numeric < (lower_quartile - 1.5 * spread)
    above = numeric > (upper_quartile + 1.5 * spread)

    outlier_row = (below | above).sum().rename('outlier_count').to_frame().T
    return pd.concat([summary, outlier_row])
# Numeric columns to profile.
# NOTE(review): "index" is not among the documented features and looks like a row identifier — confirm it belongs here.
num_cols =["index","duration","days_left","price"]
num_stats=descriptive_analysis(airline_df,num_cols)
# Bare last expression: the notebook renders the frame as a table instead of plain printed text.
num_stats

## **Distribution & Outlier Visualization**

In [None]:
def plot_numerical_distribution(df,num_cols):
    """Show a histogram (with KDE) and a boxplot side by side for each column.

    Args:
        df: DataFrame holding the columns to plot.
        num_cols: Iterable of column labels; one two-panel figure is drawn per label.
    """
    for col in num_cols:
        values = df[col]
        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

        # Left panel: 50-bin histogram with a KDE overlay.
        sns.histplot(values, kde=True, bins=50, ax=hist_ax)
        hist_ax.set_title(f"Distribution of {col}")

        # Right panel: boxplot of the same values.
        sns.boxplot(x=values, ax=box_ax)
        box_ax.set_title(f"Boxplot  of {col}")

        fig.tight_layout()
        plt.show()
# One histogram + boxplot figure per column listed in num_cols.
plot_numerical_distribution(airline_df,num_cols)

## **Correlation Analysis**

In [None]:
def plot_Correlation_matrix(df,n_cols):
    """Draw an annotated heatmap of the Spearman correlations between columns.

    Args:
        df: DataFrame holding the data.
        n_cols: Column labels to correlate with each other.
    """
    spearman = df[n_cols].corr(method="spearman")

    _, ax = plt.subplots(figsize=(10, 8))
    # center=0 places the midpoint of the diverging colormap at zero correlation.
    sns.heatmap(spearman, annot=True, cmap="coolwarm", center=0, fmt='.2f', ax=ax)
    ax.set_title("Correlation Matrix")
    plt.show()
# Spearman correlation heatmap over the columns listed in num_cols.
plot_Correlation_matrix(airline_df,num_cols)

## **Frequency Analysis**

In [None]:
def Freq_analysis(df:pd.DataFrame,cat_columns,top_n:int=50):
    """Plot a bar chart of the most frequent values for each categorical column.

    Args:
        df: DataFrame holding the data.
        cat_columns: Iterable of column labels to chart.
        top_n: Maximum number of categories to show per column.
    """
    for col in cat_columns:
        freq=df[col].value_counts().nlargest(top_n)

        # A column with no countable values has nothing to draw; skip it
        # rather than fail inside the plotting call.
        if freq.empty:
            print(f"Skipping {col}: no values to plot")
            continue

        # Report the number of bars actually drawn: a column with fewer than
        # top_n distinct values must not be titled "Top 50".
        shown=len(freq)

        #Plot
        plt.figure(figsize=(10,5))
        freq.plot(kind="bar")
        plt.title(f"Top {shown} categories in {col}")
        plt.xlabel(col)
        plt.ylabel("count")
        plt.xticks(rotation=90)
        plt.show()
# Treat every object-dtype column as categorical and chart its most frequent values.
cat_cols=airline_df.select_dtypes(include=["object"]).columns
Freq_analysis(airline_df,cat_cols)

## **Cardinality & Rare Categories**

In [None]:
def analyze_cardinality(df,cat_cols,rare_threshold=0.01):
    """Report the cardinality and rare categories of each categorical column.

    Args:
        df: DataFrame holding the data.
        cat_cols: Iterable of column labels to inspect.
        rare_threshold: A category is "rare" when its relative frequency is
            strictly below this value.

    Returns:
        DataFrame indexed by column label with the columns ``N_unique``,
        ``Rare_categories`` and ``Sugesstion``.
    """
    def _summarize(column):
        # Relative frequency of every category in this column.
        shares = df[column].value_counts(normalize=True)
        rare = shares.loc[shares < rare_threshold].index.tolist()
        if rare:
            advice = "Drop or bin rare categories"
        else:
            advice = "OK"
        return {
            'N_unique': df[column].nunique(),
            "Rare_categories": rare,
            "Sugesstion": advice,
        }

    per_column = {column: _summarize(column) for column in cat_cols}
    # Transpose so that each inspected column becomes one row of the report.
    return pd.DataFrame(per_column).T
# Uses the cat_cols selection defined in the frequency-analysis cell.
cardinality_report=analyze_cardinality(airline_df,cat_cols)
# Bare last expression: the notebook renders the frame as a table instead of plain printed text.
cardinality_report

## **Chi-Square Test (Categorical Associations)**

In [None]:
def chi_square_test(df,cat1,cat2):
    """Chi-square test of independence between two categorical columns.

    Builds the contingency table of ``df[cat1]`` against ``df[cat2]``, runs
    ``scipy.stats.chi2_contingency`` on it and derives Cramér's V as an
    effect size.

    Args:
        df: DataFrame holding the data.
        cat1: Label of the first categorical column.
        cat2: Label of the second categorical column.

    Returns:
        dict with keys ``Chi2_stat``, ``p_value``, ``Cramers_v`` and
        ``interpretation`` ("Strong association" when V > 0.25, otherwise
        "Weak/moderate").
    """
    contingency_table=pd.crosstab(df[cat1],df[cat2])
    chi2,p,_,_=chi2_contingency(contingency_table)
    n=contingency_table.sum().sum()

    # Cramér's V = sqrt(chi2 / (n * (k - 1))) with k = min(rows, cols).
    # The product n * (k - 1) is the denominator as a whole; writing
    # chi2/n*(k-1) multiplies by (k - 1) instead and can push V above 1.
    denominator=n*(min(contingency_table.shape)-1)
    # If either variable has a single category the denominator is zero and
    # there is no association to measure; report 0 instead of dividing by zero.
    cramers_v=np.sqrt(chi2/denominator) if denominator>0 else 0.0

    return {
        "Chi2_stat":chi2,
        "p_value":p,
        "Cramers_v": cramers_v,
        "interpretation":"Strong association" if cramers_v >0.25 else "Weak/moderate"
    }
# Test the association between airline and source_city.
result=chi_square_test(airline_df,"airline","source_city")
print(result)

In [None]:
# Test the association between airline and flight.
result=chi_square_test(airline_df,"airline","flight")
print(result)

In [None]:
# Test the association between airline and departure_time.
result=chi_square_test(airline_df,"airline","departure_time")
print(result)

In [None]:
# Test the association between airline and stops.
result=chi_square_test(airline_df,"airline","stops")
print(result)

In [None]:
# Test the association between airline and arrival_time.
result=chi_square_test(airline_df,"airline","arrival_time")
print(result)

In [None]:
# Test the association between airline and destination_city.
result=chi_square_test(airline_df,"airline","destination_city")
print(result)

In [None]:
# Test the association between airline and class.
result=chi_square_test(airline_df,"airline","class")
print(result)

## **Combined Analysis**

In [None]:
def plot_grouped_stats(df,num_col,cat_col):
    """Summarize a numeric column per category and plot the per-category mean.

    Args:
        df: DataFrame holding the data.
        num_col: Label of the numeric column to aggregate.
        cat_col: Label of the column to group by.

    Returns:
        DataFrame indexed by category with the columns ``mean``, ``median``,
        ``std`` and ``count``.
    """
    summary = df.groupby(cat_col)[num_col].agg(['mean', "median", "std", "count"])

    _, ax = plt.subplots(figsize=(12, 6))
    # Only the mean is charted; the remaining statistics are returned to the caller.
    sns.barplot(x=summary.index, y=summary["mean"], ax=ax)
    ax.set_title(f"Mean {num_col} by {cat_col}")
    plt.xticks(rotation=90)
    plt.show()

    return summary
# Mean duration per airline; as the cell's last expression, the returned summary table is displayed as well.
plot_grouped_stats(airline_df,"duration","airline")

## **Not completed (work in progress)**