In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
df["female"] = (df["gender"] == "female").astype(int)
df = df.drop("gender", axis=1)
df.head()

In [None]:
df.info()

In [None]:
df["lunch"] = (df["lunch"] == "standard").astype(int)

In [None]:
df.info()

In [None]:
df["test preparation course"] = (df["test preparation course"] == "completed").astype(int)

In [None]:
# Ordinal codes for parental education (0 = "some high school" ... 5 = "master's degree").
edu_order = {
    "some high school": 0,
    "high school": 1,
    "some college": 2,
    "associate's degree": 3,
    "bachelor's degree": 4,
    "master's degree": 5
}

# Series.map turns every value missing from the mapping into NaN without any
# warning. Check first and fail loudly instead; this also catches an accidental
# second run of this cell, when the column already holds the integer codes.
edu_col = "parental level of education"
unmapped = set(df[edu_col].dropna().unique()) - set(edu_order)
if unmapped:
    raise ValueError(
        f"Unmapped values in '{edu_col}': {sorted(unmapped, key=str)}"
    )
df[edu_col] = df[edu_col].map(edu_order)


In [None]:
df = pd.get_dummies(df, columns=["race/ethnicity"], drop_first=True)
df[df.select_dtypes("bool").columns] = df.select_dtypes("bool").astype(int)


In [None]:
df.head()

In [None]:
df.info()

In [None]:
numeric_columns = df.select_dtypes(include=["int64", "float64"]).columns

plt.figure(figsize=(15, 6))
sns.boxplot(data=df[numeric_columns], palette="flare")
plt.xticks(rotation=45)
plt.title("Boxplot of Numerical Features", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
correlation_matrix = df[numeric_columns].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True,cmap="flare", fmt=".2f", linewidths=0.5, linecolor="white")
plt.title("Correlation Heatmap of Numerical Features", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
import warnings

# Suppress every UserWarning from here on (process-wide filter).
warnings.filterwarnings("ignore", category=UserWarning)

numeric_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)

# Lower-triangle scatter grid with histograms on the diagonal.
pairplot_options = dict(
    corner=True,
    diag_kind="hist",
    plot_kws={"color": "#ff69b4"},
)
sns.pairplot(df[numeric_cols], **pairplot_options)
plt.suptitle("Pairplot of Numerical Features", y=1.02)
plt.show()


In [None]:
plt.figure(figsize=(6, 4))
sns.barplot(x="test preparation course", y="math score", data=df, palette="flare")
plt.title("Math Score by Test Preparation Course")
plt.tight_layout()
plt.show()

In [None]:
pink_palette = ["#ffb6c1", "#ff69b4", "#ffc0cb", "#f4a7b9", "#f7c6d9"]
plt.figure(figsize=(6, 4))
sns.violinplot(x="lunch", y="math score", data=df, palette=pink_palette)
plt.title("Math Score Distribution by Lunch Type")
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(6, 4))
sns.regplot(x="reading score", y="writing score", data=df, scatter_kws={"color": "#ff69b4"}, line_kws={"color": "#c71585"})
plt.title("Reading vs Writing Scores")
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(6, 4))
sns.scatterplot(x="math score", y="writing score", data=df, color="#ff69b4", alpha=0.7)
plt.title("Math vs Writing Scores")
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(6, 4))
sns.countplot(x="lunch", hue="female", data=df, palette=pink_palette)
plt.title("Lunch Types by Gender")
plt.tight_layout()
plt.show()

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
plt.figure(figsize=(6, 4))
sns.histplot(df["math score"], bins=20, kde=True, color="#ff69b4")
plt.title("Distribution of Math Scores")
plt.tight_layout()
plt.show()


In [None]:
y = df["math score"]
X = df.drop("math score", axis=1)

In [None]:
X = pd.get_dummies(X, drop_first=True)

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop("math score", axis=1)
y = df["math score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape

In [None]:
def correlation_for_dropping(df, threshold):
    """Return the columns of ``df`` that are highly correlated with an earlier column.

    Only numeric columns are considered. A column is flagged when the absolute
    value of its correlation (``DataFrame.corr`` default) with at least one
    numeric column located before it exceeds ``threshold``; the earlier column
    of such a pair is never flagged because of that pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to inspect; it is not modified.
    threshold : float
        Absolute-correlation cut-off (strictly greater than).

    Returns
    -------
    set
        Labels of the columns to drop (empty when nothing exceeds the cut-off).
    """
    # "number" covers every numeric width; the previous int64/float64 filter
    # silently skipped columns stored as e.g. int32, uint8 or float32.
    numeric_df = df.select_dtypes(include="number")
    abs_corr = numeric_df.corr().abs()

    columns_to_drop = set()
    for i, column in enumerate(abs_corr.columns):
        # Row i restricted to columns 0..i-1 is the strictly lower triangle.
        # NaN correlations (e.g. constant columns) compare False and are ignored.
        if (abs_corr.iloc[i, :i] > threshold).any():
            columns_to_drop.add(column)
    return columns_to_drop

In [None]:
cols_to_drop = correlation_for_dropping(X, threshold=0.90)
X = X.drop(columns=cols_to_drop)

In [None]:
cols_to_drop

In [None]:
columns_dropping = correlation_for_dropping(X_train, threshold=0.90)

X_train.drop(columns_dropping, axis=1, inplace=True)
X_test.drop(columns_dropping, axis=1, inplace=True)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both the training and the test split.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

linear = LinearRegression()
linear.fit(X_train_scaled, y_train)
y_pred = linear.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", score)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color="orchid", s=60)
sns.lineplot(x=y_test, y=y_test, color="black", linestyle="--")  # Doğru tahmin çizgisi
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Linear Regression - Actual vs Predicted")
plt.tight_layout()
plt.show()


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

lasso = Lasso() 
lasso.fit(X_train_scaled, y_train)
y_pred = lasso.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", score)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color="orchid", s=60)
sns.lineplot(x=y_test, y=y_test, color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Lasso Regression - Actual vs Predicted")
plt.tight_layout()
plt.show()


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
y_pred = ridge.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", score)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color="orchid", s=60)
sns.lineplot(x=y_test, y=y_test, color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Ridge Regression - Actual vs Predicted")
plt.tight_layout()
plt.show()

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

elastic = ElasticNet()
elastic.fit(X_train_scaled, y_train)
y_pred = elastic.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", score)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color="orchid", s=60)
sns.lineplot(x=y_test, y=y_test, color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Elastic Net Regression - Actual vs Predicted")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

lasso_cv = LassoCV(cv=5, random_state=42)
lasso_cv.fit(X_train_scaled, y_train)
y_pred = lasso_cv.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

print("Best Alpha:", lasso_cv.alpha_)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", score)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color="orchid", s=60)
sns.lineplot(x=y_test, y=y_test, color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("LassoCV Regression - Actual vs Predicted")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ridge_cv = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)
ridge_cv.fit(X_train_scaled, y_train)
y_pred = ridge_cv.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

print("Best Alpha:", ridge_cv.alpha_)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", score)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color="orchid", s=60)
sns.lineplot(x=y_test, y=y_test, color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("LassoCV Regression - Actual vs Predicted")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

elastic_cv = ElasticNetCV(alphas=[0.01, 0.1, 1.0, 10.0], l1_ratio=[0.1, 0.5, 0.9], cv=5)
elastic_cv.fit(X_train_scaled, y_train)
y_pred = elastic_cv.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

print("Best Alpha:", elastic_cv.alpha_)
print("Best L1 Ratio:", elastic_cv.l1_ratio_)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", score)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color="orchid", s=60)
sns.lineplot(x=y_test, y=y_test, color="black", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("LassoCV Regression - Actual vs Predicted")
plt.tight_layout()
plt.show()

In [None]:
def _test_scores(model):
    """Return ``(mse, r2)`` of an already fitted ``model`` on the scaled test split."""
    predictions = model.predict(X_test_scaled)
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)


# Test-set scores for the baseline and the three cross-validated models.
# The lasso_/ridge_/elastic_ names hold the scores of the *CV* estimators.
linear_mse, linear_r2 = _test_scores(linear)

lasso_mse, lasso_r2 = _test_scores(lasso_cv)

ridge_mse, ridge_r2 = _test_scores(ridge_cv)

elastic_mse, elastic_r2 = _test_scores(elastic_cv)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Parallel lists: one entry per compared model, in the same order.
models = ["Linear", "LassoCV", "RidgeCV", "ElasticNetCV"]
mse_scores = [linear_mse, lasso_mse, ridge_mse, elastic_mse]
r2_scores = [linear_r2, lasso_r2, ridge_r2, elastic_r2]
colors = ["orchid", "hotpink", "plum", "violet"]

# Two side-by-side panels sharing one figure.
fig, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: mean squared error per model.
sns.barplot(x=models, y=mse_scores, palette=colors, ax=ax_mse)
ax_mse.set_title("MSE Comparison")
ax_mse.set_ylabel("Mean Squared Error")

# Right panel: R² per model.
sns.barplot(x=models, y=r2_scores, palette=colors, ax=ax_r2)
ax_r2.set_title("R² Score Comparison")
ax_r2.set_ylabel("R² Score")

fig.tight_layout()
plt.show()