# MSCS 634 Project Deliverable 3

## Objective
The purpose of this deliverable is to build classification and clustering models, apply association rule mining, and perform hyperparameter tuning to improve model performance.
Here is the URL for the dataset: https://www.kaggle.com/datasets/anassarfraz13/student-success-factors-and-insights

# Re-using the Data Collection and Cleaning logic from Project Deliverable-1

This below logic:

 * Loads and cleans the dataset by handling missing values (mode/median), removing duplicates, and checking categorical data for inconsistencies.

 * Analyzes numerical columns to detect outliers using the IQR method and reports how many rows are affected.

 * Prepares the dataset for further analysis or modeling by ensuring data quality and identifying potential issues.

 * Even though outliers exist, we keep them because values like exam scores and hours studied are valid and useful for understanding differences between high and low performance.

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

df = pd.read_csv('StudentPerformanceFactors.csv')
print(df.head)
df.info()

df.describe()

#Hnadling missing values
print("Missing values in each column:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")


# Fixing the missing values
print("Before handling missing values:")
print(f"Shape: {df.shape}")

# Check which columns have missing values
cols_with_missing = df.columns[df.isnull().any()].tolist()
print(f"\nColumns with missing values: {cols_with_missing}")

# For categorical columns with missing values, filling with mode
# For numerical columns with missing values, filling with median
for col in cols_with_missing:
    if df[col].dtype in ['object']:
        mode_value = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
        df[col].fillna(mode_value, inplace=True)
        print(f"Filled {col} with mode: {mode_value}")
    else:
        median_value = df[col].median()
        df[col].fillna(median_value, inplace=True)
        print(f"Filled {col} with median: {median_value}")

print(f"\nAfter handling missing values:")
print(f"Shape: {df.shape}")
print(f"Remaining missing values: {df.isnull().sum().sum()}")

#Removing duplicates
print("Before removing duplicates:")
print(f"Shape: {df.shape}")

duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

if duplicate_count > 0:
    df = df.drop_duplicates()
    print("\nAfter removing duplicates:")
    print(f"Shape: {df.shape}")
    print(f"Rows removed: {duplicate_count}")
else:
    print("\nNo duplicates found in the dataset. Therefore not removing any date")


# Checking for inconsistent data in categorical columns
print("Unique values in categorical columns:\n")
categorical_cols = df.select_dtypes(include=['object']).columns

# Check for any inconsistencies in categorical data
inconsistencies_found = False
print("Checking for inconsistent categorical values...")
for col in categorical_cols:
    unique_vals = df[col].unique()
    # Check for leading/trailing spaces
    stripped_vals = [str(val).strip() if pd.notna(val) else val for val in unique_vals]
    original_vals = [str(val) if pd.notna(val) else val for val in unique_vals]
    
    if stripped_vals != original_vals:
        print(f"  WARNING: {col} has values with whitespace issues!")
        inconsistencies_found = True
    print(f"{col}: {unique_vals}\n")

if not inconsistencies_found:
    print("No inconsistencies found in categorical columns (no extra whitespace, all consistent)\n")


# Check for outliers using IQR method for numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Outlier detection using IQR method:\n")

# Create mask with DataFrame index to ensure alignment
outlier_mask = pd.Series(False, index=df.index)

for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"{col}:")
    print(f"  Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    print(f"  Number of outliers: {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")
    if len(outliers) > 0:
        print(f"  Outlier range: [{outliers[col].min():.2f}, {outliers[col].max():.2f}]")
        # Add to outlier mask
        outlier_mask |= (df[col] < lower_bound) | (df[col] > upper_bound)
    print()

# Summary of outliers found
total_outliers = outlier_mask.sum()
print(f"Total rows with at least one outlier: {total_outliers}")
print(f"Percentage of data with outliers: {total_outliers/len(df)*100:.2f}%")

# It doesnot make sense to remove most of the outliers in above case.
# This is because the rows like exam_scores and hours studied are all in valid range.
# It makes sense to keep those scores as they will help understand the factors responsible for excellent exam scores
# vs the poorest exam performances.


#We might have to rethink whether to remove the outliers or not based the analysis we need to perform for future deliverables.
#For now we will create another df_cleaned with the outliers removed
# df_cleaned = df[~outlier_mask].reset_index(drop=True)

# print(f"\nAfter outlier removal:")
# print(f"  Rows before: {len(df)}")
# # print(f"  Rows after: {len(df_cleaned)}")
# print(f"  Removed: {total_outliers} rows ({total_outliers/len(df)*100:.2f}%)")

# df.info()

    

<bound method NDFrame.head of       Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0                23          84                  Low                High   
1                19          64                  Low              Medium   
2                24          98               Medium              Medium   
3                29          89                  Low              Medium   
4                19          92               Medium              Medium   
...             ...         ...                  ...                 ...   
6602             25          69                 High              Medium   
6603             23          76                 High              Medium   
6604             20          90               Medium                 Low   
6605             10          86                 High                High   
6606             15          67               Medium                 Low   

     Extracurricular_Activities  Sleep_Hours  Previous_Sc