In [1]:
# Multiple Linear Regression - Step 1: Import Necessary Libraries

import os  # For reading an optional dataset-path override from the environment

import matplotlib.pyplot as plt  # For visualization
import pandas as pd  # For handling datasets
from sklearn.linear_model import LinearRegression  # For fitting the regression model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # For model evaluation
from sklearn.model_selection import train_test_split  # For splitting data into train/test sets

# Placeholder: Load your dataset (Update the file path when needed)
# df = pd.read_csv("path_to_your_dataset.csv")

# Display the first few rows of the dataset (Uncomment after loading the dataset)
# print(df.head())

In [2]:
# Step 1 (cont.): Load the dataset
#
# The CSV location can be overridden without editing the notebook by setting the
# HAPPYSCORE_CSV environment variable. When it is not set, the original
# lab-machine path is used, so existing runs behave exactly as before.
DEFAULT_DATASET_PATH = 'C:/Users/dbda.STUDENTSDC/Music/LabPractice/Notebooks/Datasets/happyscore_income.csv'
dataset_path = os.environ.get('HAPPYSCORE_CSV', DEFAULT_DATASET_PATH)

# Read the CSV into a DataFrame; pd.read_csv raises FileNotFoundError if the
# resolved path does not exist.
df = pd.read_csv(dataset_path)

# Display the first row to verify the dataset structure
print(df.head(1))

   country  adjusted_satisfaction  avg_satisfaction  std_satisfaction  \
0  Armenia                   37.0               4.9              2.42   

   avg_income  median_income  income_inequality                        region  \
0     2096.76    1731.506667          31.445556  'Central and Eastern Europe'   

   happyScore      GDP country.1  
0        4.35  0.76821   Armenia  


In [3]:
# Step 2: Choose the predictor columns (X) and the response column (y)

# Predictor columns, in the order they are handed to the model
feature_columns = ['adjusted_satisfaction', 'avg_income', 'median_income', 'income_inequality']
X = df[feature_columns]

# Response column the regression is fitted against
y = df['happyScore']

# Show the leading rows of both selections as a sanity check
print("Feature variables:\n", X.head())
print("\nTarget variable:\n", y.head())

Feature variables:
    adjusted_satisfaction  avg_income  median_income  income_inequality
0                   37.0     2096.76    1731.506667          31.445556
1                   26.0     1448.88    1044.240000          42.720000
2                   60.0     7101.12    5109.400000          45.475556
3                   59.0    19457.04   16879.620000          30.296250
4                   65.0    19917.00   15846.060000          35.285000

Target variable:
 0    4.350
1    4.033
2    6.574
3    7.200
4    7.284
Name: happyScore, dtype: float64


In [4]:
# Step 2 (cont.): Hold out part of the data for evaluation

# test_size=0.20 reserves 20% of the rows for testing; random_state=42 fixes the
# shuffle so the same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows landed in each partition
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training set size: {n_train_rows} samples")
print(f"Testing set size: {n_test_rows} samples")

Training set size: 88 samples
Testing set size: 23 samples


In [5]:
# Step 3: Fit Multiple Linear Regression to Training Data

# fit() returns the estimator itself, so construction and training are chained
model = LinearRegression().fit(X_train, y_train)

# Report the fitted parameters. Each coefficient is the slope for one feature, in
# the column order of X_train and expressed in that feature's own units, so the
# raw magnitudes are not directly comparable across features.
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)  # Predicted value when every predictor is zero

Model Coefficients: [ 0.06388931  0.00029153 -0.0002913  -0.02361837]
Model Intercept: 2.830949004700013


In [6]:
# Step 4: Make Predictions on the Test Set

# Run the fitted model over the held-out feature rows
y_pred = model.predict(X=X_test)

# Show the raw predictions
print("Predicted Happy Score:\n", y_pred)

Predicted Happy Score:
 [5.20189105 3.68000755 7.34074365 5.6065443  4.89762814 4.39199852
 5.65704152 6.44224327 4.45134432 3.17970002 6.45096768 5.41473858
 4.55903262 5.3793141  6.04159228 6.52080114 5.57365029 3.99271385
 4.02334895 5.06458743 5.40722016 6.70751152 4.02392112]


In [7]:
# Side-by-side view of observed and predicted target values for the test set

print("--- Actual vs. Predicted (Test Set) ---")

# Assemble the two columns first, then build the table from them. The row labels
# shown in the output come from y_test's index.
comparison_columns = {
    'Actual Final values': y_test,
    'Predicted Final values': y_pred,
}
comparison_df = pd.DataFrame(comparison_columns)

# Only the first 10 rows are shown to keep the output short
print(comparison_df.head(10))
print("-------------------------------------------------------------------------------------------\n")

--- Actual vs. Predicted (Test Set) ---
    Actual Final values  Predicted Final values
78                4.514                5.201891
10                2.905                3.680008
4                 7.284                7.340744
84                5.102                5.606544
64                5.889                4.897628
68                4.436                4.391999
30                5.429                5.657042
45                7.278                6.442243
96                3.667                4.451344
11                3.340                3.179700
-------------------------------------------------------------------------------------------



In [8]:
# Step 5: Model Evaluation
# Score the test-set predictions with MAE, MSE and R-squared (R2).

# Each metric takes (y_true, y_pred); evaluate them in one pass and unpack
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print each score next to its label
for label, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("R-squared (R2 Score):", r2),
):
    print(label, value)

Mean Absolute Error (MAE): 0.506678863312916
Mean Squared Error (MSE): 0.3511336640317997
R-squared (R2 Score): 0.780102201136714


In [9]:
# Step 6: Make new predictions with custom input

print("--- Making New Predictions ---")

# New observations to score. The column names and their order are the same as the
# feature columns the model was trained on.
new_data_for_prediction = pd.DataFrame({
    'adjusted_satisfaction': [35, 42, 61],  # Example values for satisfaction
    'avg_income': [5000, 2500, 3400],  # Example average income
    'median_income': [3800, 2500, 5500],  # Example median income
    'income_inequality': [32, 43, 51]  # Example income inequality
})

# Score the new rows with a single predict() call. Both result names used by this
# cell are kept; new_predictions is an independent copy so that modifying one
# array never affects the other.
new_y_pred = model.predict(new_data_for_prediction)
new_predictions = new_y_pred.copy()

# Print the predicted happy scores
print("\nPredicted Happy Scores for New Input Values:\n", new_y_pred)

# Print the new data for reference
print("\nNew Data for Prediction:\n", new_data_for_prediction)

# Print the predicted happiness scores
print("\nPredicted Final Happiness Score:\n", new_predictions)

print("-------------------------------------------------------\n")

--- Making New Predictions ---

Predicted Happy Scores for New Input Values:
 [4.66197652 4.49927466 4.91269696]

New Data for Prediction:
    adjusted_satisfaction  avg_income  median_income  income_inequality
0                     35        5000           3800                 32
1                     42        2500           2500                 43
2                     61        3400           5500                 51

Predicted Final Happiness Score:
 [4.66197652 4.49927466 4.91269696]
-------------------------------------------------------

