1. Scale numerical features

2. Retrain regression and classification models

3. Observe changes in model performance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the per-country GDP table: one row per country, one column per year (2020-2025).
# NOTE(review): the currency/units of the GDP figures are not stated here — confirm against the data source.
DATA_PATH = "/content/2020-2025.csv"  # presumably a Colab upload path; adjust when running elsewhere
dataset = pd.read_csv(DATA_PATH)
dataset.head()

Unnamed: 0,Country,2020,2021,2022,2023,2024,2025
0,Afghanistan,20136,14278.0,14501.0,17248.0,,
1,Albania,15271,18086.0,19185.0,23388.0,27259.0,28372.0
2,Algeria,164774,185850.0,225709.0,247789.0,264913.0,268885.0
3,Andorra,2885,3325.0,3376.0,3786.0,4038.0,4035.0
4,Angola,66521,84375.0,142442.0,109764.0,115946.0,113343.0


In [None]:
# '2025' is the prediction target, so rows without it cannot be used for training or scoring.
has_target = dataset['2025'].notna()
cleaned_dataset = dataset[has_target]

# Report how many rows the filter removed.
print(f"Original dataset shape: {dataset.shape}")
print(f"Cleaned dataset shape after dropping rows with missing '2025': {cleaned_dataset.shape}")

Original dataset shape: (196, 7)
Cleaned dataset shape after dropping rows with missing '2025': (189, 7)



Next, we need to handle any remaining missing values in the feature columns (2020, 2021, 2022, 2023, 2024) of `cleaned_dataset` before defining the features and target. We first check how many missing values exist in these columns.



In [None]:
# Year columns used as model inputs: '2020' through '2024' (2025 is kept aside as the target).
feature_columns = [str(year) for year in range(2020, 2025)]

# Count NaNs per feature column to see whether any imputation or row removal is needed.
missing_values_features = cleaned_dataset[feature_columns].isna().sum()
print("Missing values in feature columns:")
print(missing_values_features)

Missing values in feature columns:
2020    0
2021    0
2022    0
2023    0
2024    0
dtype: int64


In [None]:
# Preview the first rows that remain after dropping countries without a 2025 value.
cleaned_dataset.head()

Unnamed: 0,Country,2020,2021,2022,2023,2024,2025
1,Albania,15271,18086.0,19185.0,23388.0,27259.0,28372.0
2,Algeria,164774,185850.0,225709.0,247789.0,264913.0,268885.0
3,Andorra,2885,3325.0,3376.0,3786.0,4038.0,4035.0
4,Angola,66521,84375.0,142442.0,109764.0,115946.0,113343.0
5,Antigua and Barbuda,1412,1602.0,1867.0,2006.0,2225.0,2373.0


In [None]:
# Check column dtypes and non-null counts before building the feature matrix.
cleaned_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, 1 to 195
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  189 non-null    object 
 1   2020     189 non-null    int64  
 2   2021     189 non-null    float64
 3   2022     189 non-null    float64
 4   2023     189 non-null    float64
 5   2024     189 non-null    float64
 6   2025     189 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 11.8+ KB


In [None]:
# Separate the regression target (2025) from the model inputs (the earlier year columns).
y = cleaned_dataset['2025']
X = cleaned_dataset.loc[:, feature_columns]

# Sanity-check that both parts have the same number of rows.
for label, part in (("X", X), ("y", y)):
    print(f"Shape of {label}: {part.shape}")

Shape of X: (189, 5)
Shape of y: (189,)


In [None]:
# Preview the feature matrix; .head() keeps the cell output short instead of rendering the whole frame.
X.head()

Unnamed: 0,2020,2021,2022,2023,2024
1,15271,18086.0,19185.0,23388.0,27259.0
2,164774,185850.0,225709.0,247789.0,264913.0
3,2885,3325.0,3376.0,3786.0,4038.0
4,66521,84375.0,142442.0,109764.0,115946.0
5,1412,1602.0,1867.0,2006.0,2225.0
...,...,...,...,...,...
190,42838,56615.0,89013.0,102377.0,119808.0
191,346310,370076.0,411068.0,433008.0,459472.0
193,20220,19394.0,23534.0,19412.0,19101.0
194,18138,22096.0,29164.0,27578.0,26326.0


## Split Data into Training and Test Sets

### Subtask:
Divide the preprocessed data into training and testing sets to evaluate the model's performance on unseen data.


**Reasoning**:
To divide the data into training and testing sets, I need to import the `train_test_split` function from `sklearn.model_selection`.



In [None]:
# train_test_split is used in the following cell to hold out a test set.
from sklearn.model_selection import train_test_split

print("Imported train_test_split from sklearn.model_selection")

Imported train_test_split from sklearn.model_selection


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
for name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {name}: {part.shape}")

Shape of X_train: (151, 5)
Shape of X_test: (38, 5)
Shape of y_train: (151,)
Shape of y_test: (38,)


In [None]:
# StandardScaler is used in the following cell to standardise the year columns.
from sklearn.preprocessing import StandardScaler

print("Imported StandardScaler from sklearn.preprocessing")

Imported StandardScaler from sklearn.preprocessing


In [None]:
# Fit the scaler on the training rows only, then apply that same transform to both
# partitions, so the test rows do not influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully.")
print(f"Shape of scaled X_train: {X_train_scaled.shape}")
print(f"Shape of scaled X_test: {X_test_scaled.shape}")

Features scaled successfully.
Shape of scaled X_train: (151, 5)
Shape of scaled X_test: (38, 5)


In [None]:
# LinearRegression is the estimator fitted in the following cell.
from sklearn.linear_model import LinearRegression

print("Imported LinearRegression from sklearn.linear_model")

Imported LinearRegression from sklearn.linear_model


In [None]:
# Fit a linear regression on the scaled training features; fit() returns the
# estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train_scaled, y_train)

print("Linear Regression model trained successfully.")

Linear Regression model trained successfully.


In [None]:
# r2_score and mean_squared_error are used in the following cell to score the test predictions.
from sklearn.metrics import r2_score, mean_squared_error

print("Imported r2_score and mean_squared_error from sklearn.metrics")

Imported r2_score and mean_squared_error from sklearn.metrics


In [None]:
# Score the model on the held-out rows, using the test features scaled with the
# training-set statistics.
y_pred = model.predict(X_test_scaled)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# MSE is expressed in squared target units, which makes the raw number hard to read;
# its square root is on the same scale as the 2025 values being predicted.
rmse = mse ** 0.5

print(f"R-squared score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

R-squared score: 0.9985
Mean Squared Error: 236853877.17
