# 1. Data
This section covers data selection, data distribution, and correlation analysis, as required by the project ReadMe.

## 1.a Data Selection Approach
The dataset consists of academic performance records. For FIS modeling, 20 records per class (100 total) are sampled. For ANFIS, 10,000 records per class (50,000 total) are used.

## 1.b Data Distribution of the Employed Set

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- DATA LOADING AND CLEANING ---
# The sheet may carry extra rows above the real header, so the header row is
# located first and the workbook is then re-read using that row as the header.
data_path = 'academicPerformanceData.xlsx'

# 1. Read without a header so every row (including the header) is plain data
df_raw = pd.read_excel(data_path, header=None)

# 2. Search the first 50 rows for the real header row: the first row that
#    contains a 'remarks' or 'x7' cell. Cells are stripped and lower-cased the
#    same way the column names are in step 4, so a padded header cell such as
#    ' Remarks ' is still recognised.
header_idx = None
for i, row in df_raw.head(50).iterrows():
    row_str = row.astype(str).str.strip().str.lower().to_list()
    if 'remarks' in row_str or 'x7' in row_str:
        header_idx = i
        break

# Fall back to the first row when no header row was recognised
if header_idx is None:
    header_idx = 0

# 3. Reload with the detected header row
df = pd.read_excel(data_path, header=header_idx)

# 4. Clean column names (string, no surrounding whitespace, lower case)
df.columns = df.columns.astype(str).str.strip().str.lower()

# 5. Keep ONLY the expected columns that are actually present.
#    .copy() makes the result an independent frame, so the column assignments
#    in step 6 do not trigger pandas' SettingWithCopyWarning.
expected_cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'remarks']
cols_to_keep = [c for c in expected_cols if c in df.columns]
df = df[cols_to_keep].copy()

# 6. Force numeric coercion: anything that cannot be parsed becomes NaN.
#    NOTE(review): this also applies to 'remarks'; if that column holds text
#    labels rather than numbers, every row is dropped in step 7 - confirm.
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 7. Drop rows whose 'remarks' value is missing (or was not numeric)
df.dropna(subset=['remarks'], inplace=True)
df.reset_index(drop=True, inplace=True)

print("Data Loaded & Cleaned.")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.head())

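## 1.b.iii Visualization of Data Distributions
Boxplots and violin plots showing feature spread across classes. The cell directly below first prints summary statistics for each column; the boxplots and violin plots are drawn in the later "Feature Distributions by Class" cell.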

In [None]:
# Summary statistics for every column; transposed so that each column of the
# data becomes one row of the table.
desc = df.describe().transpose()

# Show only the central-tendency / spread fields of interest.
summary_fields = ['mean', 'std', 'min', '50%', 'max']
print(desc.loc[:, summary_fields])

## 2. Class Distribution

In [None]:
# Bar chart of the number of records per 'remarks' class, with the count
# written above each bar.
if 'remarks' in df.columns:
    # The figure is created only when there is something to draw, so a missing
    # 'remarks' column does not leave an empty figure behind.
    plt.figure(figsize=(8, 5))
    ax = sns.countplot(x='remarks', data=df, hue='remarks', palette='viridis', legend=False)
    plt.title('Distribution of Student Remarks')
    for p in ax.patches:
        # Bar heights may be reported as floats; format them as whole numbers
        # so a label reads "20" rather than "20.0".
        ax.annotate(
            f'{p.get_height():.0f}',
            (p.get_x() + p.get_width() / 2., p.get_height()),
            ha='center',
            va='bottom',
        )
    plt.show()

## 1.b.i & 1.b.ii Correlation Coefficients (Input/Output & Inter-input)

In [None]:
# Pairwise correlation coefficients between all numeric columns.
corr = df.corr(numeric_only=True)

# Heatmap of the full matrix (input/output and inter-input correlations),
# each cell annotated with its coefficient to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()

# Correlation of each input with the target, sorted ascending; the target's
# correlation with itself is removed first.
target_corr = corr['remarks'].drop('remarks')
target_corr = target_corr.sort_values()

plt.figure(figsize=(8, 5))
target_corr.plot(kind='barh', color='teal')
plt.title('Correlation with Remarks (Target)')
plt.xlabel('Correlation Coefficient')
plt.show()

## 4. Feature Distributions by Class (Boxplots & Violin Plots)
Visualizing how each input differentiates between classes.

In [None]:
# One grid row per input feature: a boxplot on the left and a violin plot on
# the right, both grouped by the 'remarks' class.
inputs = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']
plt.figure(figsize=(15, 20))

# Position of the next panel in the 7 x 2 grid (1-based, filled row by row).
plot_idx = 1

# Inputs missing from the frame are skipped; they take up no grid position.
for col in (name for name in inputs if name in df.columns):
    panels = (
        (sns.boxplot, f'{col} Boxplot by Class'),
        (sns.violinplot, f'{col} Distribution by Class'),
    )
    for draw, panel_title in panels:
        plt.subplot(7, 2, plot_idx)
        draw(x='remarks', y=col, data=df, hue='remarks', palette='Set2', legend=False)
        plt.title(panel_title)
        plot_idx += 1

plt.tight_layout()
plt.show()

## 5. Pairplot Analysis

In [None]:
# Pairwise scatter/density grid for a subset of inputs, coloured by class.
if 'remarks' in df.columns:
    # Cap the sample at 1000 rows; the fixed seed makes the sample repeatable.
    n_sample = min(1000, len(df))
    sample_df = df.sample(n_sample, random_state=42)

    # Plot only those of the chosen inputs that exist in the frame.
    vars_to_plot = [name for name in ('x1', 'x2', 'x3', 'x7') if name in df.columns]

    sns.pairplot(sample_df, hue='remarks', vars=vars_to_plot, palette='viridis')
    plt.show()