In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for entry in filenames:
        full_path = os.path.join(dirname, entry)
        print(full_path)

In [None]:
# Read the student exam CSV into a DataFrame and preview its first rows
csv_path = '/kaggle/input/student-exam-percentages/student_exam_percentage.csv'
df = pd.read_csv(csv_path)
df.head()


In [None]:
# Summary statistics, transposed so each described column becomes a row
df.describe().transpose()

In [None]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Count the rows that are exact duplicates of an earlier row
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

In [None]:
# Box plot of df to inspect the spread of each variable and spot outliers
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
ax.set_title("Variable Distribution and Outlier Checking")
# Tilt the x tick labels so long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Set the visual style for the plots
sns.set(style="whitegrid")

# Size the subplot grid from the number of columns instead of a fixed 3x2
# layout, which raises as soon as df has more than six columns
n_grid_cols = 2
n_grid_rows = max(1, (len(df.columns) + n_grid_cols - 1) // n_grid_cols)

# Keep the original per-row height: a 3-row grid is still 15x10
plt.figure(figsize=(15, 10 * n_grid_rows / 3))

# Plot the distribution of each column of df in its own subplot
for i, column in enumerate(df.columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    # kde=True adds a kernel density estimate line to see the shape of the distribution
    sns.histplot(df[column], kde=True, color='skyblue')
    plt.title(f'Distribution of {column}')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the columns of df
correlation_matrix = df.corr()
print(correlation_matrix)

# Render the matrix as an annotated heatmap
heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title("Correlation Analysis of Student Performance Features")
plt.show()

# Rank every column by its correlation with the Final Percentage
target_correlations = correlation_matrix["Final_Percentage"]
print("--- Correlation with Final Percentage ---")
print(target_correlations.sort_values(ascending=False))

In [None]:
# Correlations among the predictors only: the target 'Final_Percentage' is
# dropped so the matrix shows how the independent variables relate to each other
independent_variables = df.drop('Final_Percentage', axis=1)
independent_corr = independent_variables.corr()
independent_corr

In [None]:
# Heatmap of the feature-to-feature correlations
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(independent_corr, annot=True, cmap='YlGnBu', fmt='.2f', ax=ax)
ax.set_title("Correlation Between Independent Variables (Check for Multicollinearity)")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the predictors (X) from the value to predict (y)
target_column = 'Final_Percentage'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the remaining 80% train the model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the training data only
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the same fitted scaler to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# DataFrame copy of the scaled training data, kept only for inspection
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X.columns)

In [None]:
# Preview only the first rows of the scaled array rather than dumping all of it
X_train_scaled[:5]

In [None]:
# Preview only the first rows of the scaled DataFrame rather than the whole frame
X_train_scaled_df.head()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a LinearRegression model on the scaled training data
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

# Report what the model learned
print("Model training on scaled data is complete.")
print(f"Intercept (b0): {regressor.intercept_:.4f}")

# Label each coefficient with its feature name and list them largest first
coefficients = pd.Series(data=regressor.coef_, index=X.columns)
ranked_coefficients = coefficients.sort_values(ascending=False)
print("\n--- Model Coefficients (Importance) ---")
print(ranked_coefficients)

In [None]:
# Predict the target for the held-out test rows with the fitted model
y_pred = regressor.predict(X_test_scaled)

# Put the first 5 actual values next to their predictions
preview_rows = 5
comparison_df = pd.DataFrame({
    'Actual': y_test.values[:preview_rows],
    'Predicted': y_pred[:preview_rows],
})

print("--- Actual vs Predicted Values (First 5) ---")
print(comparison_df)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Score the test-set predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test, y_pred)

# Each metric is printed as "<label>: <value>" with its own precision
metric_report = (
    ("Mean Squared Error (MSE)", mse, ".2f"),
    ("Root Mean Squared Error (RMSE)", rmse, ".2f"),
    ("R-squared Score (R2)", r2, ".4f"),
)
for label, value, spec in metric_report:
    print(f"{label}: {format(value, spec)}")


In [None]:
# Actual vs predicted values on the test set
plt.figure(figsize=(8, 6))

# sns.regplot draws the scatter points itself (actual on X, predicted on Y)
# and overlays a fitted regression line, so a separate sns.scatterplot call
# would only draw every point a second time on the same axes
sns.regplot(x=y_test, y=y_pred, scatter_kws={'alpha':0.5, 'color':'blue'}, line_kws={'color':'red', 'lw':2})

plt.xlabel('Actual Final Percentage')
plt.ylabel('Predicted Final Percentage')
plt.title('Actual vs Predicted - Linear Regression')
plt.show()

In [None]:
# Ridge Regression (L2 Regularization)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression; alpha is the penalty strength (default is 1.0)
ridge_model = Ridge(alpha=1.0)
# Fit on the scaled training data; the fitted estimator is the cell's output
ridge_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the test-set targets with the fitted Ridge model
y_pred_ridge = ridge_model.predict(X=X_test_scaled)

In [None]:
# Score the Ridge predictions on the test set
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(ridge_mse)
ridge_r2 = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression R2 Score: {ridge_r2:.4f}")
print(f"Ridge Regression RMSE: {ridge_rmse:.2f}")

In [None]:
# Intercept and per-feature coefficients of the fitted Ridge model
ridge_intercept = ridge_model.intercept_
ridge_coefficients = pd.Series(data=ridge_model.coef_, index=X.columns)

print(f"Ridge Intercept (b0): {ridge_intercept:.4f}")
print("\n--- Ridge Coefficients ---")
# Largest coefficient first
ridge_ranked = ridge_coefficients.sort_values(ascending=False)
print(ridge_ranked)

In [None]:
from sklearn.linear_model import RidgeCV
# Candidate penalty strengths, from very small to large
alphas = [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 50.0]
# RidgeCV with cv=5 chooses among the candidates
ridge_cv_model = RidgeCV(alphas=alphas, cv=5)
# Fit on the scaled training data; the fitted estimator is the cell's output
ridge_cv_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Report the alpha RidgeCV selected and the model's R2 on the test set
ridge_cv_best_alpha = ridge_cv_model.alpha_
ridge_cv_r2 = ridge_cv_model.score(X_test_scaled, y_test)
print(f"The best alpha found by RidgeCV: {ridge_cv_best_alpha}")
print(f"R2 Score with best alpha: {ridge_cv_r2:.4f}")

In [None]:
# Coefficients of the RidgeCV model, labeled by feature and listed largest first
ridge_cv_coeffs = pd.Series(data=ridge_cv_model.coef_, index=X.columns)
ridge_cv_ranked = ridge_cv_coeffs.sort_values(ascending=False)
print("\n--- Coefficients of the Best Ridge Model ---")
print(ridge_cv_ranked)

In [None]:
# Lasso (Least Absolute Shrinkage and Selection Operator): L1 Regularization

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso with a hand-picked penalty strength, fitted on the scaled training data
manual_lasso = Lasso(alpha=0.5)
manual_lasso.fit(X_train_scaled, y_train)

# Fitted coefficients labeled by feature, and R2 on the test set
lasso_coefs = pd.Series(data=manual_lasso.coef_, index=X.columns)
lasso_score = manual_lasso.score(X_test_scaled, y_test)

print(f"Lasso (alpha=0.5) R2 Score: {lasso_score:.4f}")
print("\n--- Lasso Coefficients ---")
print(lasso_coefs)

In [None]:
from sklearn.linear_model import LassoCV

# 1. Initialize LassoCV
# The alpha grid is left at its default: passing alphas=None explicitly is
# deprecated in scikit-learn 1.7+ and emits a FutureWarning.
# max_iter is raised because Lasso sometimes needs more iterations to converge
lasso_cv = LassoCV(cv=5, max_iter=10000)

# 2. Fit the model to scaled data
lasso_cv.fit(X_train_scaled, y_train)

# 3. Best alpha and score on the test set
print(f"Optimal Alpha found: {lasso_cv.alpha_:.6f}")
print(f"LassoCV R2 Score: {lasso_cv.score(X_test_scaled, y_test):.4f}")

# 4. Final coefficients, labeled by feature and listed largest first
lasso_cv_results = pd.Series(lasso_cv.coef_, index=X.columns)
print("\n--- Final LassoCV Coefficients ---")
print(lasso_cv_results.sort_values(ascending=False))

In [None]:
# ElasticNet: L1 (Lasso) + L2 (Ridge) Regularization

In [None]:
from sklearn.linear_model import ElasticNetCV

# 1. Initialize ElasticNetCV
# l1_ratio: candidate mixes from Ridge-heavy (0.1) up to pure Lasso (1)
# The alpha grid is left at its default: passing alphas=None explicitly is
# deprecated in scikit-learn 1.7+ and emits a FutureWarning.
en_cv_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                           cv=5,
                           max_iter=10000)

# 2. Fit the model on the scaled training data
en_cv_model.fit(X_train_scaled, y_train)

# 3. Best parameters and score on the test set
print(f"Optimal L1 Ratio: {en_cv_model.l1_ratio_}")
print(f"Optimal Alpha: {en_cv_model.alpha_:.6f}")
print(f"ElasticNetCV R2 Score: {en_cv_model.score(X_test_scaled, y_test):.4f}")

# 4. Final coefficients, labeled by feature and listed largest first
en_coeffs = pd.Series(en_cv_model.coef_, index=X.columns)
print("\n--- Final ElasticNet Coefficients ---")
print(en_coeffs.sort_values(ascending=False))