In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file under the Kaggle input mount and print its full path.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the Kaggle dataset; adjust it if the file is
# named differently in the dataset.
file_path = "Mental_Health_and_Social_Media_Balance_Dataset.csv"

# Load the latest version of the dataset into a pandas DataFrame.
# dataset_load() is used here instead of the older load_dataset() name.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "prince7489/mental-health-and-social-media-balance-dataset",
    file_path,
)

# Show the first 5 records. A bare expression as the last line of the cell uses
# the notebook's rich table display; print("...\n", df.head()) rendered plain
# text and shifted the header row by one space because of print's separator.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
# Make every feature numeric: Gender becomes integer codes from LabelEncoder,
# Social_Media_Platform becomes one-hot columns with the first level dropped.
label_encoder = LabelEncoder()
gender_codes = label_encoder.fit_transform(df['Gender'])
df = pd.get_dummies(
    df.assign(Gender=gender_codes),
    columns=['Social_Media_Platform'],
    drop_first=True,
)

# The happiness score is the target; it and User_ID (presumably just an
# identifier — confirm) are excluded from the feature matrix.
y = df['Happiness_Index(1-10)']
x = df.drop(columns=['User_ID', 'Happiness_Index(1-10)'])

In [None]:
# Preview the encoded frame. A bare expression gets the notebook's rich table
# display, which print() would flatten to plain text.
df.head()

In [None]:
# Display the feature matrix (df without User_ID and the target column).
x

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: row counts of each piece of the train/test split.
split_sizes = (len(x_train), len(x_test), len(y_train), len(y_test))
print('x_train,x_test,y_train,y_test :', *split_sizes)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same statistics are applied to the test rows, so nothing from the test
# set influences the preprocessing.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Seed the forest (same seed as the train/test split) so the fitted model and
# the metrics computed from it are reproducible across Restart & Run All.
# An unseeded RandomForestRegressor() gives different scores on every run.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(x_train_scaled, y_train)

In [None]:
# Predict the happiness score for the held-out rows from the scaled test features.
y_pred = random_forest.predict(x_test_scaled)

In [None]:
from sklearn.metrics import mean_squared_error , r2_score
# Mean squared error of the forest's predictions on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'mse is : {mse}')

In [None]:
# The Happiness Index ranges from 1 to 10, so a single squared error can be
# at most (10 - 1)^2 = 81.
# An MSE of about 0.8 means a rough error (RMSE = sqrt(MSE)) of about 0.9 points, which is not that bad,
# though it can be improved.

In [None]:
# Improvement: one-hot encode Gender as well.
# df['Gender'] = label_encoder.fit_transform(df['Gender']) works because Gender
# has few categories, but LabelEncoder replaces them with integer codes
# (e.g. Female=0, Male=1, Other=2), which can accidentally suggest a false
# "order" between the categories. One-hot encoding avoids that.

# Reload the raw data: the earlier preprocessing cell overwrote `df` with the
# label-encoded frame, so the original Gender values are no longer in it.
df = kagglehub.dataset_load(  # Use dataset_load() instead of load_dataset()
    KaggleDatasetAdapter.PANDAS,
    "prince7489/mental-health-and-social-media-balance-dataset",
    file_path,
)

# One-hot encode both categorical columns (first level of each dropped).
df = pd.get_dummies(df, columns=['Social_Media_Platform', 'Gender'], drop_first=True)

x = df.drop(columns=['User_ID', 'Happiness_Index(1-10)'])
y = df['Happiness_Index(1-10)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Random forests do not care about feature scaling, so no scaler is used here.
# Build a fresh, seeded model rather than re-fitting the estimator left over
# from the earlier cell: this cell then runs on its own and gives the same
# metrics on every run.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)


# Evaluation: MSE, RMSE (same units as the target) and R² on the held-out rows.
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")





In [None]:
# Hyperparameter tuning
# By default RandomForestRegressor() builds 100 trees, grows each one without a
# depth limit (max_depth=None) and considers every feature at each split
# (max_features=1.0). Those defaults are not necessarily optimal for this data,
# so RandomizedSearchCV samples 25 alternative settings and scores each with
# 5-fold cross-validation on the training rows.

from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestRegressor(random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (all features) is the regressor's default. Without it the search
    # could never select the baseline configuration it is compared against.
    'max_features': ['sqrt', 'log2', 1.0],
}

search = RandomizedSearchCV(rf, param_distributions=param_dist,
                             n_iter=25, cv=5, scoring='neg_mean_squared_error',
                             n_jobs=-1, random_state=42)
search.fit(x_train, y_train)

# best_estimator_ is the winning configuration refitted on all training rows.
best_rf = search.best_estimator_
print(f"Best parameters: {search.best_params_}")
y_pred = best_rf.predict(x_test)

# Evaluation: MSE, RMSE (same units as the target) and R² on the held-out rows.
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

In [None]:
## In this run the tuned parameters actually increased the error, so we will stick with the default RandomForestRegressor
## without any hyperparameter tuning.

## Learnt something new
## What are the parameters used here 
# n_estimators : Number of trees in the forest.
# max_depth : Maximum depth (levels) of each decision tree ; Controls how “deep” a tree can go before stopping.
# min_samples_split : Minimum number of samples needed to split a node.
# min_samples_leaf : Minimum number of samples required in a leaf (final node).

# max_features : How many features to consider when looking for the best split.
 # --> Controls the randomness between trees.
 # --> Fewer features = more diversity among trees → often better generalization.
 # --> Common options:
 # --> 'sqrt' → use √(total features) per split (the default for RandomForestClassifier)
 # --> 'log2' → use log₂(total features)
 # --> 1.0 → use all features at every split (the default for RandomForestRegressor)