In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the survey dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='Final_Dataset.csv')

# Preview the first ten rows; as the cell's last expression this renders
# as a rich table.
df.head(n=10)


Unnamed: 0.1,Unnamed: 0,Gender,Area,Living Situation,Higher Secondary Educaion,Higher Secondary Subjects,Higher Secondary Marks Percentage (%),NET Marks,Parents Education Level,Family Income,...,Study Consistency Level,Participation in Extra-Curricular Activities,Available Emotional Support,Sleeping Level,Attendance Level,Access to Resources,Understanding of Subject,Interest in Degree,SGPA,Personal Space (Sqft)
0,0,Male,Urban,Day Scholar,FSC,Pre Engineering,93,165,4,300000.0,...,4,3,5,4,5,5,4,5,3.7,1875.0
1,1,Male,Urban,Hostelite,FSC,Pre Engineering,95,145,5,200000.0,...,4,4,4,5,4,4,4,4,3.4,1500.0
2,2,Male,Urban,Hostelite,FSC,Computer Science,93,143,5,300000.0,...,2,3,3,3,5,5,2,3,3.3,350.0
3,3,Male,Urban,Hostelite,A Levels,Pre Engineering,88,150,5,300000.0,...,1,3,3,3,4,5,5,5,3.4,437.5
4,4,Male,Urban,Hostelite,FSC,Pre Engineering,89,143,5,200000.0,...,3,5,5,1,5,5,1,5,3.3,175.0
5,5,Female,Urban,Hostelite,FSC,Pre Medicine,90,144,4,100000.0,...,2,3,4,3,5,5,5,4,2.7,250.0
6,6,Male,Urban,Day Scholar,A Levels,Pre Engineering,94,145,4,300000.0,...,2,2,5,3,4,5,3,4,3.8,350.0
7,7,Female,Urban,Hostelite,FSC,Pre Engineering,92,140,4,200000.0,...,5,4,4,4,5,4,3,3,3.1,437.5
8,8,Male,Urban,Hostelite,FSC,Pre Engineering,85,150,4,200000.0,...,2,3,4,3,4,3,2,2,2.8,625.0
9,9,Male,Urban,Hostelite,FSC,Pre Engineering,89,143,3,100000.0,...,4,4,4,4,4,2,2,2,3.3,350.0


In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Final_Dataset.csv")

# Drop the leftover index columns if present. The preview of this file shows
# both "Unnamed: 0" and "Unnamed: 0.1" mirroring the row number (presumably
# saved indexes from earlier exports), so neither carries information.
# errors="ignore" makes the drop a no-op for any column that is absent, which
# keeps this cell re-runnable on a cleaned copy of the file.
df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")

# List of categorical columns to one-hot encode
categorical_columns = [
    "Gender",
    "Area",
    "Living Situation",
    "Higher Secondary Educaion",  # spelling matches the column name in the CSV
    "Higher Secondary Subjects",
]

# Apply one-hot encoding: each categorical column is replaced by one
# indicator column per category; all other columns are kept as-is.
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Optional: persist the encoded frame with df_encoded.to_csv(...) if a copy
# on disk is needed.


In [None]:
# Define feature columns and target variable
features = [
    'Gender',
    'Area',
    'Living Situation',
    'Higher Secondary Educaion',
    'Higher Secondary Subjects',
    'Higher Secondary Marks Percentage (%)',
    'NET Marks',
    'Parents Education Level',
    'Family Income',
    'Personal Space (Sqft)',
    'Family Size',
    'Stress Level',
    # 'Self-Confidence Level' is intentionally omitted: the column is not
    # present in the dataset.
    'Study Consistency Level',
    'Participation in Extra-Curricular Activities',
    'Available Emotional Support',
    'Sleeping Level',
    'Attendance Level',
    'Access to Resources',
    'Understanding of Subject',
    'Interest in Degree'
]

target = 'SGPA'

# Drop rows with missing target values
df = df.dropna(subset=[target])

# Split data into features (X) and target (y)
X = df[features]
y = df[target].astype(float)  # Ensure target is float

# Identify numerical and categorical columns.
# Select *every* numeric dtype, not just float64: integer-valued columns
# (marks, NET marks, the 1-5 rating columns) would otherwise appear in neither
# list, and ColumnTransformer drops unlisted columns by default, so they would
# silently never reach the models.
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
# Everything that is not numeric is treated as categorical, so the two lists
# together always cover all feature columns.
categorical_cols = X.select_dtypes(exclude=['number']).columns.tolist()

# Preprocessing pipelines: standardize numeric features, one-hot encode
# categorical ones (dropping the first level of each to avoid redundancy).
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split data into training and testing sets (80/20, fixed seed for
# reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models; hyperparameters are fixed here, not tuned
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Elastic Net Regression': ElasticNet(alpha=0.1, l1_ratio=0.5),
    'Support Vector Regression': SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1),
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=42)
}

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place by this call.
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    The mean squared error between ``y_test`` and the model's predictions
    for ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)

# Score every candidate regressor: wrap it with the shared preprocessing step
# so scaling/encoding is fitted on the training split only, then report the
# test-set MSE.
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])
    mse = evaluate_model(pipeline, X_train, X_test, y_train, y_test)
    print(f'{name} Mean Squared Error: {mse:.4f}')


Linear Regression Mean Squared Error: 0.3113
Ridge Regression Mean Squared Error: 0.3147
Lasso Regression Mean Squared Error: 0.2711
Elastic Net Regression Mean Squared Error: 0.2859
Support Vector Regression Mean Squared Error: 0.6662
Decision Tree Regression Mean Squared Error: 0.5042
Random Forest Regression Mean Squared Error: 0.3342


Best Performing Model: Lasso Regression
Lasso Regression takes the lead with the lowest Mean Squared Error (MSE) on the test set, 0.2711. Its L1 penalty can shrink the coefficients of less relevant features to exactly zero, which acts as built-in feature selection, reduces noise, and improves predictive accuracy. This combination of regularization and feature selection, which is especially useful when there are many features, makes it the strongest of the models tested on this dataset.

Elastic Net Regression's Strong Performance
Elastic Net Regression follows closely with an MSE of 0.2859. This model combines both Lasso (L1) and Ridge (L2) penalties, balancing feature selection and regularization. It is particularly effective when dealing with correlated features, as it helps reduce their influence on the model, ensuring improved generalization and robustness.

Random Forest's Moderate Performance
Random Forest Regression, with an MSE of 0.3342, performs moderately: it lags behind Lasso and Elastic Net and also scores slightly worse than Linear Regression (0.3113) and Ridge Regression (0.3147). Its ensemble nature helps reduce variance and lets it capture complex interactions between features, but that flexibility does not translate into better accuracy in this case.

Linear and Ridge Regression
Linear Regression (MSE: 0.3113) and Ridge Regression (MSE: 0.3147) both show moderate performance. Ridge reduces overfitting by applying an L2 penalty, but because that penalty only shrinks coefficients and never sets them to zero, it cannot perform feature selection, and it does not perform as well as Lasso and Elastic Net here.

Decision Tree Regression
Decision Tree Regression has an MSE of 0.5042, showing that it struggles with overfitting on the dataset. Although it can capture complex patterns, it tends to fit noise in smaller datasets, leading to higher errors.

Support Vector Regression's Underperformance
Support Vector Regression (MSE: 0.6662) has the highest error among the models, indicating that it does not generalize well to this particular dataset. It was run with fixed, untuned hyperparameters (C=100, gamma=0.1, epsilon=0.1), so fine-tuning them might be necessary for better results; in its current form, it fits this data poorly.