## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder


import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings("ignore")

### 1.1 Helper functions

#### For correlation between categorical features

In [None]:
def categorical_correlations(df, categorical_features, target):
    """
    Build a table of association metrics between each categorical feature
    and the target column.

    Parameters:
    - df: Pandas DataFrame containing the dataset.
    - categorical_features: List of column names to evaluate against the target.
    - target: Name of the column in df representing the target variable.

    Returns:
    - Pandas DataFrame with one row per feature and the columns
      'Feature', 'p_value', 'cramers_score' and 'mutual info'.
    """
    # One record per feature; each metric is delegated to its own helper
    rows = [
        {
            'Feature': feature,
            'p_value': p_value(df, target, feature),
            'cramers_score': cramers_score(df, target, feature),
            'mutual info': mutual_info(df, target, feature),
        }
        for feature in categorical_features
    ]

    return pd.DataFrame(rows)

In [None]:
from scipy.stats import chi2_contingency

def p_value(df, target, feature):
    """
    Return the p-value of the chi-square test of independence between
    the `target` and `feature` columns of `df`.
    """
    observed = pd.crosstab(df[target], df[feature])

    # chi2_contingency yields (statistic, p-value, dof, expected); keep the p-value
    return chi2_contingency(observed)[1]

In [None]:
from scipy.stats import chi2_contingency

def cramers_score(df, target, feature):
    """
    Compute Cramér's V between the `target` and `feature` columns of `df`.

    Parameters:
    - df: Pandas DataFrame containing both columns.
    - target: Name of the target column.
    - feature: Name of the categorical feature column.

    Returns:
    - Cramér's V as a float in [0, 1]. Returns 0.0 when either column has
      fewer than two observed levels, because the statistic is undefined
      (0/0) in that case.
    """
    contingency_table = pd.crosstab(df[target], df[feature])

    # min(rows, cols) - 1 is the normaliser; zero or less means a degenerate table
    min_dim = min(contingency_table.shape) - 1
    if min_dim < 1:
        return 0.0

    # Cramér's V is defined on the uncorrected statistic. The default Yates
    # correction would shrink chi2 on 2x2 tables, so a perfect association
    # would score below 1.
    chi2 = chi2_contingency(contingency_table, correction=False)[0]

    n = contingency_table.to_numpy().sum()
    return float(np.sqrt(chi2 / (n * min_dim)))

In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

def mutual_info(df, target, feature):
    """
    Estimate the mutual information between a categorical feature and the target.

    Parameters:
    - df: Pandas DataFrame containing both columns.
    - target: Name of the target column (used as the class labels).
    - feature: Name of the categorical feature column (used as the single input).

    Returns:
    - The mutual information estimate for the feature (first element of the
      array returned by mutual_info_classif).
    """
    # Encode each column to integer codes with its own encoder
    encoded_feature = LabelEncoder().fit_transform(df[feature])
    encoded_target = LabelEncoder().fit_transform(df[target])

    # The feature is the (single-column) X and the target is y. The input is
    # categorical, so mark it as discrete; otherwise a dense X is treated as
    # continuous and estimated with the nearest-neighbour method.
    mi = mutual_info_classif(
        encoded_feature.reshape(-1, 1),
        encoded_target,
        discrete_features=True,
    )

    return mi[0]

#### For bivariate analysis [categorical vs categorical]

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from statsmodels.graphics.mosaicplot import mosaic

# Bivariate Analysis: Categorical vs Categorical
def bivariate_categorical_analysis(df, cat_var1, cat_var2):
    """
    Print and plot the joint distribution of two categorical columns.

    Prints the cross-tabulation and the chi-square statistic / p-value, then
    shows a stacked bar chart and an annotated heatmap of the counts.

    Parameters:
    - df: Pandas DataFrame containing both columns.
    - cat_var1: Column used for the table rows (and the bar chart x-axis).
    - cat_var2: Column used for the table columns.
    """
    # Contingency table of the two columns
    counts = pd.crosstab(df[cat_var1], df[cat_var2])
    print(f"Crosstab between {cat_var1} and {cat_var2}:\n", counts)

    # Chi-square test of independence; dof and expected counts are not reported
    chi2_stat, p_val, *_ = chi2_contingency(counts)
    print(f"Chi-Square Test between {cat_var1} and {cat_var2}:\nChi2: {chi2_stat}, p-value: {p_val}")

    # Stacked bar chart: one bar per cat_var1 level, split by cat_var2
    bar_ax = counts.plot(kind='bar', stacked=True)
    bar_ax.set_title(f"Stacked Bar Chart between {cat_var1} and {cat_var2}")
    bar_ax.set_xlabel(cat_var1)
    bar_ax.set_ylabel("Count")
    plt.show()

    # Heatmap of the raw counts, annotated with integer values
    heat_ax = sns.heatmap(counts, annot=True, cmap='coolwarm', fmt='d')
    heat_ax.set_title(f"Heatmap of {cat_var1} vs {cat_var2}")
    heat_ax.set_xlabel(cat_var2)
    heat_ax.set_ylabel(cat_var1)
    plt.show()

    # Mosaic plot (currently disabled):
    #     plt.figure(figsize=(10, 6))
    #     mosaic(df, [cat_var1, cat_var2], title=f"Mosaic Plot of {cat_var1} vs {cat_var2}")
    #     plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

# Function to perform bivariate analysis on all pairs of categorical variables
def bivariate_categorical_analysis_all_pairs(df, categorical_features):
    """
    Run a chi-square test and draw a count heatmap for every unordered pair
    of columns in `categorical_features`.

    Parameters:
    - df: Pandas DataFrame containing the columns.
    - categorical_features: List of column names; each pair is visited once,
      in list order.
    """
    for position, cat_var1 in enumerate(categorical_features):
        # Pair each column only with the ones after it, so (a, b) is not repeated as (b, a)
        for cat_var2 in categorical_features[position + 1:]:
            # Contingency table of the pair
            pair_counts = pd.crosstab(df[cat_var1], df[cat_var2])
            print(f"Crosstab between {cat_var1} and {cat_var2}:\n", pair_counts)

            # Chi-square test of independence
            chi2_stat, p_val, *_ = chi2_contingency(pair_counts)
            print(f"Chi-Square Test between {cat_var1} and {cat_var2}:\nChi2: {chi2_stat}, p-value: {p_val}\n")

            # Heatmap of the raw counts, annotated with integer values
            heat_ax = sns.heatmap(pair_counts, annot=True, cmap='coolwarm', fmt='d')
            heat_ax.set_title(f"Heatmap of {cat_var1} vs {cat_var2}")
            heat_ax.set_xlabel(cat_var2)
            heat_ax.set_ylabel(cat_var1)
            plt.show()

            # Mosaic plot (currently disabled):
            #     plt.figure(figsize=(10, 6))
            #     mosaic(df, [cat_var1, cat_var2], title=f"Mosaic Plot of {cat_var1} vs {cat_var2}")
            #     plt.show()

#### For plotting countplots (for categorical variables)

In [None]:
def cat_countplot(df, feature):
    """Show a count plot of the `feature` column of `df` on a 10x7 figure."""
    plt.figure(figsize=(10, 7))
    sns.countplot(data=df, x=feature)
    plt.show()

## 2. Read Training and Test Data

In [None]:
# Load the train and test CSV files, using the 'id' column as the DataFrame index
train = pd.read_csv('/kaggle/input/playground-series-s4e6/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s4e6/test.csv', index_col='id')

In [None]:
# Column names of the test frame (presumably the model inputs without 'Target' — confirm)
features = test.columns.tolist()

In [None]:
train.head()

In [None]:
train.shape, test.shape

In [None]:
len(test) / (len(test) + len(train))

## Target Distribution

In [None]:
# Share of each 'Target' class in the training data, drawn as a pie chart
value_counts = train['Target'].value_counts()

fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(value_counts, labels=value_counts.index, autopct="%.0f%%")
ax.set_title('Target Distribution')
plt.show()

## Label Encode the Target Variable

In [None]:
# Fit the encoder on the target labels, then map them to integer codes
label_encoder = LabelEncoder().fit(train['Target'])
targets = label_encoder.transform(train['Target'])

In [None]:
features

In [None]:
train['Marital status']

In [None]:
# Columns converted to pandas categorical dtype below
categorical_features = [
    'Marital status',
    'Application mode',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

for feature in categorical_features:
    # Use the union of train and test values so both frames share one dtype.
    # Sort it so the category order (and the plot axis order that follows
    # from it) is deterministic instead of depending on set iteration order.
    data_type = pd.CategoricalDtype(
        categories=sorted(set(train[feature]) | set(test[feature]))
    )
    for df in (train, test):
        df[feature] = df[feature].astype(data_type)

In [None]:
train['Marital status']

In [None]:
train['Application order']

In [None]:
# Everything that is neither in categorical_features nor one of the three
# extra columns analysed alongside them is treated as numeric
numeric_features = [
    feature
    for feature in features
    if feature not in categorical_features
    and feature not in ('Application order', "Mother's occupation", "Father's occupation")
]
numeric_features

In [None]:
categorical_features

## Let's study the categorical variables first

In [None]:
categorical_features

### Cardinality

In [None]:
# Number of distinct values per column in the training data
for col in [*categorical_features, 'Application order', "Mother's occupation", "Father's occupation"]:
    print(f"{col} has {train[col].nunique()} unique values")

### Bivariate analysis

In [None]:
# Crosstab, chi-square test and plots of every listed column against 'Target'
for feature in [*categorical_features, 'Application order', "Mother's occupation", "Father's occupation"]:
    bivariate_categorical_analysis(train, feature, 'Target')

In [None]:
# Association metrics against 'Target', smallest chi-square p-value first
categorical_correlations(
    train,
    categorical_features + ['Application order', "Mother's occupation", "Father's occupation"],
    'Target',
).sort_values(by='p_value')

'International' and 'Educational special needs' have p-values of 0.73 and 0.89, i.e. the chi-square test finds no significant association between these features and the target.

In [None]:
# Association metrics against 'Target', largest Cramér's V first
categorical_correlations(
    train,
    categorical_features + ['Application order', "Mother's occupation", "Father's occupation"],
    'Target',
).sort_values(by='cramers_score', ascending=False)

'Nacionality', 'International' and 'Educational special needs' are again at the bottom.

In [None]:
# Chi-square test and heatmap for every pair of the listed columns
bivariate_categorical_analysis_all_pairs(
    train,
    categorical_features + ['Application order', "Mother's occupation", "Father's occupation"],
)

In [None]:
# Association metrics against 'Target', largest mutual information first
categorical_correlations(
    train,
    categorical_features + ['Application order', "Mother's occupation", "Father's occupation"],
    'Target',
).sort_values(by='mutual info', ascending=False)

- Again the same three features ('Nacionality', 'International', 'Educational special needs') are at the bottom
- Course, Tuition Fees, Scholarship holder appear to be strong indicators

### 1. Marital status

In [None]:
cat_countplot(train, 'Marital status')

In [None]:
cat_countplot(test, 'Marital status')

### 2. Application mode

In [None]:
cat_countplot(train, 'Application mode')

In [None]:
cat_countplot(test, 'Application mode')

### 3. Course

In [None]:
cat_countplot(train, 'Course')

In [None]:
cat_countplot(test, 'Course')

### 4. Daytime/evening attendance

In [None]:
cat_countplot(train, 'Daytime/evening attendance')

In [None]:
cat_countplot(test, 'Daytime/evening attendance')

Category 0 of 'Daytime/evening attendance' is under-represented.

### 5. Previous qualification

In [None]:
cat_countplot(train, 'Previous qualification')

In [None]:
cat_countplot(test, 'Previous qualification')

### 6. Nacionality

In [None]:
cat_countplot(train, 'Nacionality')

In [None]:
cat_countplot(test, 'Nacionality')

In [None]:
train['Nacionality'].value_counts()

In [None]:
test['Nacionality'].value_counts()

### 7. Mother's qualification

In [None]:
cat_countplot(train, "Mother's qualification")

In [None]:
cat_countplot(test, "Mother's qualification")

### 8. Displaced

In [None]:
cat_countplot(train, 'Displaced')

In [None]:
cat_countplot(test, 'Displaced')

### 9. Educational special needs

In [None]:
cat_countplot(train, 'Educational special needs')

In [None]:
cat_countplot(test, 'Educational special needs')

In [None]:
train['Educational special needs'].value_counts()

In [None]:
test['Educational special needs'].value_counts()

### 10. Debtor

In [None]:
cat_countplot(train, 'Debtor')

In [None]:
cat_countplot(test, 'Debtor')

### 11. Tuition fees up to date

In [None]:
cat_countplot(train, 'Tuition fees up to date')

In [None]:
cat_countplot(test, 'Tuition fees up to date')

### 12. Gender

In [None]:
cat_countplot(train, 'Gender')

In [None]:
cat_countplot(test, 'Gender')

### 13. Scholarship holder

In [None]:
cat_countplot(train, 'Scholarship holder')

In [None]:
cat_countplot(test, 'Scholarship holder')

### 14. International

In [None]:
cat_countplot(train, 'International')

In [None]:
cat_countplot(test, 'International')

In [None]:
train['International'].value_counts()

In [None]:
test['International'].value_counts()

### 15. Application order

In [None]:
cat_countplot(train,'Application order')

In [None]:
cat_countplot(test, 'Application order')

### 16. Mother's occupation

In [None]:
cat_countplot(train, "Mother's occupation")

In [None]:
cat_countplot(test, "Mother's occupation")

### 17. Father's occupation

In [None]:
cat_countplot(train, "Father's occupation")

In [None]:
cat_countplot(test, "Father's occupation")

In [None]:
['Application order', "Mother's occupation", "Father's occupation"]