In [1]:
pip install pandas numpy scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA


In [3]:
df = pd.read_csv(r'C:\Users\user\Desktop\pi\WHI_Inflation.csv', na_values=["", " ", "  ", "   ", "\t", "\n", "\r", "\xa0"])
print("Initial Data Shape:", df.shape)
print(df.head())

Initial Data Shape: (1232, 16)
       Country  Year  Headline Consumer Price Inflation  \
0  Afghanistan  2015                             -0.660   
1  Afghanistan  2016                              4.380   
2  Afghanistan  2017                              4.976   
3  Afghanistan  2018                              0.630   
4  Afghanistan  2019                              2.302   

   Energy Consumer Price Inflation  Food Consumer Price Inflation  \
0                        -4.250000                      -0.840000   
1                         2.070000                       5.670000   
2                         4.440000                       6.940000   
3                         1.474185                      -1.045952   
4                        -2.494359                       3.794770   

   Official Core Consumer Price Inflation  Producer Price Inflation  \
0                                0.219999                       NaN   
1                                5.192760                

In [4]:
num_duplicates = df.duplicated().sum()
print(f"\n🔍 Number of duplicate rows: {num_duplicates}")


🔍 Number of duplicate rows: 0


In [5]:
print("\n🔍 Missing values per column:")
print(df.isnull().sum())


🔍 Missing values per column:
Country                                     0
Year                                        0
Headline Consumer Price Inflation          32
Energy Consumer Price Inflation           142
Food Consumer Price Inflation             102
Official Core Consumer Price Inflation    498
Producer Price Inflation                  463
GDP deflator Index growth rate             21
Continent/Region                            0
Score                                       0
GDP per Capita                              0
Social support                              0
Healthy life expectancy at birth            0
Freedom to make life choices                0
Generosity                                  0
Perceptions of corruption                   1
dtype: int64


In [6]:
df.fillna(df.mean(numeric_only=True), inplace=True)

In [7]:
## 2.Dropping any remaining NaNs
df.dropna(inplace=True)
print("\n🔍 Missing values per column after cleaning:")
print(df.isnull().sum())
print(f"✅ Missing values handled. New shape: {df.shape}")


🔍 Missing values per column after cleaning:
Country                                   0
Year                                      0
Headline Consumer Price Inflation         0
Energy Consumer Price Inflation           0
Food Consumer Price Inflation             0
Official Core Consumer Price Inflation    0
Producer Price Inflation                  0
GDP deflator Index growth rate            0
Continent/Region                          0
Score                                     0
GDP per Capita                            0
Social support                            0
Healthy life expectancy at birth          0
Freedom to make life choices              0
Generosity                                0
Perceptions of corruption                 0
dtype: int64
✅ Missing values handled. New shape: (1232, 16)


In [8]:
df

Unnamed: 0,Country,Year,Headline Consumer Price Inflation,Energy Consumer Price Inflation,Food Consumer Price Inflation,Official Core Consumer Price Inflation,Producer Price Inflation,GDP deflator Index growth rate,Continent/Region,Score,GDP per Capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption
0,Afghanistan,2015,-0.660000,-4.250000,-0.840000,0.219999,5.841887,2.665090,South Asia,3.5750,0.319820,0.302850,0.303350,0.234140,0.365100,0.097190
1,Afghanistan,2016,4.380000,2.070000,5.670000,5.192760,5.841887,-2.409509,South Asia,3.3600,0.382270,0.110370,0.173440,0.164300,0.312680,0.071120
2,Afghanistan,2017,4.976000,4.440000,6.940000,5.423228,5.841887,2.404000,South Asia,3.7940,0.401477,0.581543,0.180747,0.106180,0.311871,0.061158
3,Afghanistan,2018,0.630000,1.474185,-1.045952,-0.126033,5.841887,2.071208,South Asia,3.6320,0.332000,0.537000,0.255000,0.085000,0.191000,0.036000
4,Afghanistan,2019,2.302000,-2.494359,3.794770,3.512612,5.841887,6.520928,South Asia,3.2030,0.350000,0.517000,0.361000,0.000000,0.158000,0.025000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227,Zimbabwe,2019,255.292007,90.808071,86.124371,3.512612,5.841887,-4.035235,Sub-Saharan Africa,3.6630,0.366000,1.114000,0.433000,0.361000,0.151000,0.089000
1228,Zimbabwe,2020,557.210000,306.431673,601.020236,3.512612,5.841887,568.971862,Sub-Saharan Africa,3.2992,0.425564,1.047835,0.375038,0.377405,0.151349,0.080929
1229,Zimbabwe,2021,98.546000,69.820000,105.800000,3.512612,5.841887,113.294981,Sub-Saharan Africa,3.1450,0.457000,0.649000,0.243000,0.359000,0.157000,0.075000
1230,Zimbabwe,2022,104.705171,97.246550,149.967034,3.512612,5.841887,113.018434,Sub-Saharan Africa,2.9950,0.947000,0.690000,0.270000,0.329000,0.106000,0.105000


In [9]:
original_columns_to_keep = ['Country', 'Year', 'Score']
original_data = df[original_columns_to_keep].copy()

In [10]:
# 3. Encoding categorical data using OneHotEncoder
categorical_columns = ['Country', 'Continent/Region']
print("\n🔠 Encoding categorical columns using OneHotEncoder:")

# Saving original values for preview
original_country = df['Country'].copy()
original_region = df['Continent/Region'].copy()

# Apply OneHotEncoder (compatible with older scikit-learn versions)
ohe = OneHotEncoder(sparse_output=False, dtype=int)  # Use sparse=False for dense output
encoded_array = ohe.fit_transform(df[categorical_columns])

# Create column names for the new one-hot columns
encoded_columns = ohe.get_feature_names_out(categorical_columns)

# Create a DataFrame from the encoded array
encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns)

# Drop original categorical columns and concatenate the encoded columns
df = df.drop(columns=categorical_columns)
df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)

print("✅ One-hot encoding complete.")
print(f"🔍 New columns added: {list(encoded_columns)}")

# === Preview One-Hot Encoding transformation ===
transformation_preview = pd.concat(
    [original_country, original_region, encoded_df],  # Using the original variables for preview
    axis=1
).drop_duplicates().head(10)

print("\n🔄 One-hot encoding transformation preview:")
print(transformation_preview)


🔠 Encoding categorical columns using OneHotEncoder:
✅ One-hot encoding complete.
🔍 New columns added: ['Country_Afghanistan', 'Country_Albania', 'Country_Algeria', 'Country_Angola', 'Country_Argentina', 'Country_Armenia', 'Country_Australia', 'Country_Austria', 'Country_Azerbaijan', 'Country_Bahrain', 'Country_Bangladesh', 'Country_Belarus', 'Country_Belgium', 'Country_Belize', 'Country_Benin', 'Country_Bhutan', 'Country_Bolivia', 'Country_Bosnia and Herzegovina', 'Country_Botswana', 'Country_Brazil', 'Country_Bulgaria', 'Country_Burkina Faso', 'Country_Burundi', 'Country_Cambodia', 'Country_Cameroon', 'Country_Canada', 'Country_Central African Republic', 'Country_Chad', 'Country_Chile', 'Country_China', 'Country_Colombia', 'Country_Comoros', 'Country_Costa Rica', 'Country_Croatia', 'Country_Cyprus', 'Country_Czech Republic', 'Country_Denmark', 'Country_Djibouti', 'Country_Dominican Republic', 'Country_Ecuador', 'Country_El Salvador', 'Country_Estonia', 'Country_Ethiopia', 'Country_Fi

In [11]:
# 4. Scaling (StandardScaler)
print("\n📏 Applying Standard Scaling to numerical data...")
df_encoded = df.copy()

# Encode non-numeric columns
for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object':
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

# Save original data for comparison
original_values = df_encoded.copy()

# Apply StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_encoded)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)

print("✅ Scaling complete. Example row:\n", scaled_df.iloc[0])

# === Preview Scaling transformation ===
scaling_preview = pd.DataFrame({
    'Original': original_values.iloc[0],
    'Scaled': scaled_df.iloc[0]
})
print("\n🔄 Scaling transformation preview:")
print(scaling_preview)


📏 Applying Standard Scaling to numerical data...
✅ Scaling complete. Example row:
 Year                                     -1.525433
Headline Consumer Price Inflation        -0.324469
Energy Consumer Price Inflation          -0.684293
Food Consumer Price Inflation            -0.354062
Official Core Consumer Price Inflation   -0.774201
                                            ...   
Continent/Region_North America and ANZ   -0.173494
Continent/Region_South Asia               4.381406
Continent/Region_Southeast Asia          -0.249136
Continent/Region_Sub-Saharan Africa      -0.574851
Continent/Region_Western Europe          -0.416333
Name: 0, Length: 172, dtype: float64

🔄 Scaling transformation preview:
                                           Original    Scaled
Year                                    2015.000000 -1.525433
Headline Consumer Price Inflation         -0.660000 -0.324469
Energy Consumer Price Inflation           -4.250000 -0.684293
Food Consumer Price Inflation      

In [12]:
# 5. Normalization
print("\n🧮 Applying Normalization...")

# Applying Normalizer to all data (not just scaled data)
normalizer = Normalizer()
normalized_data = normalizer.fit_transform(df)  # Apply to entire dataframe
normalized_df = pd.DataFrame(normalized_data, columns=df.columns)  # Ensure same column names

print("✅ Normalization complete. Example row:\n", normalized_df.iloc[0])

# === Preview Normalization transformation ===
normalization_preview = pd.DataFrame({
    'Original': df.iloc[0],  # Original data (first row)
    'Normalized': normalized_df.iloc[0]  # Normalized data (first row)
})
print("\n🔄 Normalization transformation preview:")
print(normalization_preview)



🧮 Applying Normalization...
✅ Normalization complete. Example row:
 Year                                      0.999991
Headline Consumer Price Inflation        -0.000328
Energy Consumer Price Inflation          -0.002109
Food Consumer Price Inflation            -0.000417
Official Core Consumer Price Inflation    0.000109
                                            ...   
Continent/Region_North America and ANZ    0.000000
Continent/Region_South Asia               0.000496
Continent/Region_Southeast Asia           0.000000
Continent/Region_Sub-Saharan Africa       0.000000
Continent/Region_Western Europe           0.000000
Name: 0, Length: 172, dtype: float64

🔄 Normalization transformation preview:
                                           Original  Normalized
Year                                    2015.000000    0.999991
Headline Consumer Price Inflation         -0.660000   -0.000328
Energy Consumer Price Inflation           -4.250000   -0.002109
Food Consumer Price Inflation       

In [13]:
# 6. Principal Component Analysis (Dimensionality Reduction)
print("\n🔻 Applying PCA (2 components)...")

# Apply PCA to the normalized data
pca = PCA(n_components=2)
pca_result = pca.fit_transform(normalized_df)
pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])

# Preview PCA transformation
pca_preview = pd.DataFrame({
    'Normalized': normalized_df.iloc[0],
    'PCA PC1': pca_df.iloc[0, 0],
    'PCA PC2': pca_df.iloc[0, 1]
})
print("\n🔄 PCA transformation preview:")
print(pca_preview)

# Concatenate with saved original columns
final_df = pd.concat([pca_df, original_data.reset_index(drop=True)], axis=1)

print("\n📌 Final dataset with PCA and selected labels:")
print(final_df.head())


🔻 Applying PCA (2 components)...

🔄 PCA transformation preview:
                                        Normalized   PCA PC1  PCA PC2
Year                                      0.999991 -0.007046  0.00294
Headline Consumer Price Inflation        -0.000328 -0.007046  0.00294
Energy Consumer Price Inflation          -0.002109 -0.007046  0.00294
Food Consumer Price Inflation            -0.000417 -0.007046  0.00294
Official Core Consumer Price Inflation    0.000109 -0.007046  0.00294
...                                            ...       ...      ...
Continent/Region_North America and ANZ    0.000000 -0.007046  0.00294
Continent/Region_South Asia               0.000496 -0.007046  0.00294
Continent/Region_Southeast Asia           0.000000 -0.007046  0.00294
Continent/Region_Sub-Saharan Africa       0.000000 -0.007046  0.00294
Continent/Region_Western Europe           0.000000 -0.007046  0.00294

[172 rows x 3 columns]

📌 Final dataset with PCA and selected labels:
        PC1       PC2   

In [24]:
print(final_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232 entries, 0 to 1231
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PC1      1232 non-null   float64
 1   PC2      1232 non-null   float64
 2   Country  1232 non-null   object 
 3   Year     1232 non-null   int64  
 4   Score    1232 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 48.3+ KB
None


In [26]:
# === Step 3: Define Features & Target ===
target_column = 'Score'

if target_column not in final_df.columns:
    raise ValueError(f"Target column '{target_column}' not found in final_df. Available columns: {final_df.columns}")

# Keep only numerical columns for X (excluding the target and any non-numeric columns)
X = df.select_dtypes(include=['number']).drop(columns=[target_column], errors='ignore')
y = df[target_column]

# === Step 4: Train-Test Split ===
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n✅ Data successfully split!")
print(f"Training shape: {X_train.shape}, Testing shape: {X_test.shape}")

# === Step 5: Train & Evaluate Multiple Regression Models ===
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0)
}

print("\n📊 Model Performance Evaluation:")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{name}:")
    print(f"- MSE: {mse:.4f}")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- R²: {r2:.4f}")



✅ Data successfully split!
Training shape: (985, 171), Testing shape: (247, 171)

📊 Model Performance Evaluation:

Random Forest Regressor:
- MSE: 0.1421
- RMSE: 0.3769
- R²: 0.8766

Decision Tree Regressor:
- MSE: 0.2703
- RMSE: 0.5199
- R²: 0.7652

Linear Regression:
- MSE: 0.0983
- RMSE: 0.3136
- R²: 0.9146

Ridge Regression:
- MSE: 0.0986
- RMSE: 0.3140
- R²: 0.9143




In [27]:
# === Step 6: Hyperparameter Optimization for Linear Regression ===
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False]  # Ensures coefficients stay positive (can help in specific cases)
}

print("\n🔧 Performing hyperparameter tuning for Linear Regression...")
grid_search = GridSearchCV(
    LinearRegression(),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"\n🏆 Best Hyperparameters for Linear Regression: {best_params}")

# === Step 7: Train Optimized Linear Regression Model ===
optimized_model = LinearRegression(**best_params)
optimized_model.fit(X_train, y_train)
y_pred_optimized = optimized_model.predict(X_test)

mse_opt = mean_squared_error(y_test, y_pred_optimized)
rmse_opt = mean_squared_error(y_test, y_pred_optimized, squared=False)
r2_opt = r2_score(y_test, y_pred_optimized)

print("\n🚀 Optimized Linear Regression Performance:")
print(f"- MSE: {mse_opt:.4f}")
print(f"- RMSE: {rmse_opt:.4f}")
print(f"- R²: {r2_opt:.4f}")



🔧 Performing hyperparameter tuning for Linear Regression...

🏆 Best Hyperparameters for Linear Regression: {'fit_intercept': False, 'positive': True}

🚀 Optimized Linear Regression Performance:
- MSE: 0.1065
- RMSE: 0.3264
- R²: 0.9075




In [28]:
print("""
Interpretation:
Optimized Linear Regression performed slightly worse after hyperparameter tuning.
Best parameters found: {'fit_intercept': False, 'positive': True}.

- fit_intercept=False: Forces the model to go through the origin. This can negatively impact performance if it's not appropriate for the data.
- positive=True: Forces all coefficients to be positive, limiting the model's flexibility.

The default Linear Regression (with intercept & both positive/negative coefficients) gave better results.
This suggests that in this case, removing the intercept and restricting the coefficients to be positive led to a slight reduction in accuracy.
""")



Interpretation:
Optimized Linear Regression performed slightly worse after hyperparameter tuning.
Best parameters found: {'fit_intercept': False, 'positive': True}.

- fit_intercept=False: Forces the model to go through the origin. This can negatively impact performance if it's not appropriate for the data.
- positive=True: Forces all coefficients to be positive, limiting the model's flexibility.

The default Linear Regression (with intercept & both positive/negative coefficients) gave better results.
This suggests that in this case, removing the intercept and restricting the coefficients to be positive led to a slight reduction in accuracy.

