In [None]:
import pandas as pd
import numpy as np

# Read the World Happiness Report 2024 table from its CSV file.
data_file_path = '/content/World-happiness-report-2024.csv'
df = pd.read_csv(data_file_path)

# shape[1] is the column count of the loaded frame.
column_count = df.shape[1]
print("Размер: ", column_count, "\n")

# Text preview of the first five rows.
preview = df.head()
print(preview)

Размер:  12 

  Country name            Regional indicator  Ladder score  upperwhisker  \
0      Finland                Western Europe         7.741         7.815   
1      Denmark                Western Europe         7.583         7.665   
2      Iceland                Western Europe         7.525         7.618   
3       Sweden                Western Europe         7.344         7.422   
4       Israel  Middle East and North Africa         7.341         7.405   

   lowerwhisker  Log GDP per capita  Social support  Healthy life expectancy  \
0         7.667               1.844           1.572                    0.695   
1         7.500               1.908           1.520                    0.699   
2         7.433               1.881           1.617                    0.718   
3         7.267               1.878           1.501                    0.724   
4         7.277               1.803           1.513                    0.740   

   Freedom to make life choices  Generosity  Per

In [None]:
# Per-column summary statistics (count, mean, std, quartiles) for the
# numeric columns; a lower `count` in a column signals missing values.
summary_stats = df.describe()
print(summary_stats)

       Ladder score  upperwhisker  lowerwhisker  Log GDP per capita  \
count    143.000000    143.000000    143.000000          140.000000   
mean       5.527580      5.641175      5.413972            1.378807   
std        1.170717      1.155008      1.187133            0.425098   
min        1.721000      1.775000      1.667000            0.000000   
25%        4.726000      4.845500      4.606000            1.077750   
50%        5.785000      5.895000      5.674000            1.431500   
75%        6.416000      6.507500      6.319000            1.741500   
max        7.741000      7.815000      7.667000            2.141000   

       Social support  Healthy life expectancy  Freedom to make life choices  \
count      140.000000               140.000000                    140.000000   
mean         1.134329                 0.520886                      0.620621   
std          0.333317                 0.164923                      0.162492   
min          0.000000                 0.

In [None]:
# List the exact column labels so they can be referenced by name later.
column_index = df.columns
print(column_index)

Index(['Country name', 'Regional indicator', 'Ladder score', 'upperwhisker',
       'lowerwhisker', 'Log GDP per capita', 'Social support',
       'Healthy life expectancy', 'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Dystopia + residual'],
      dtype='object')


In [None]:
# Predictor columns used by every model in this notebook.
features = [
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
]

# Report the row count before and after discarding rows that lack a value
# in any of the predictor columns.
print(df.shape[0])
df = df.dropna(subset=features)
print(df.shape[0])

# Design matrix and regression target.
X = df[features]
y = df['Ladder score']

143
140


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Keep 67% of the rows for training and hold the rest out for validation.
# random_state pins the shuffle: without it every run draws a different
# split, so the reported MAE values cannot be reproduced or compared
# between runs.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, train_size=0.67, random_state=1
)

# Baseline model: a decision tree with no limit on depth or leaf count.
tree_split = DecisionTreeRegressor(random_state=1)
tree_split.fit(train_X, train_y)

predicted_happy = tree_split.predict(val_X)

# Mean absolute error on the held-out rows.
MAE_split = mean_absolute_error(val_y, predicted_happy)
print("MAE split:", f'{MAE_split:.4f}')

# `results` collects one validation MAE per experiment, keyed by a label.
results = {"tr-test split" : MAE_split}

MAE split: 0.5705


In [None]:
# Sweep max_depth over 2..9 and record the validation MAE for each value.
# NOTE(review): the best value is chosen on the same validation set the MAE
# is reported on, so the reported figure is optimistic.
mae_results = {}

for depth in range(2, 10):
    # fit() returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeRegressor(random_state=1, max_depth=depth).fit(
        train_X, train_y
    )
    y_pred = model.predict(val_X)

    mae = mean_absolute_error(val_y, y_pred)
    mae_results[depth] = mae
    print(depth, mae)

# Depth with the smallest error (the first one wins on ties) and its MAE.
best_depth, MAE_depth = min(mae_results.items(), key=lambda item: item[1])

print("MAE depth:", f' {MAE_depth:.4f}',
      "\nГлубина дерева:", best_depth)

results["best_depth"] = MAE_depth

2 0.511816552395276
3 0.5425333044752054
4 0.5084087851041873
5 0.5494074468085104
6 0.5296055386693684
7 0.5602258865248227
8 0.5649539007092198
9 0.546156914893617
MAE depth:  0.5084 
Глубина дерева: 4


In [None]:
# Sweep max_leaf_nodes over 2..14 and record the validation MAE for each.
# NOTE(review): the best value is chosen on the same validation set the MAE
# is reported on, so the reported figure is optimistic.
mae_results = {}

for leaf_nodes in range(2, 15):
    # fit() returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=leaf_nodes).fit(
        train_X, train_y
    )
    y_pred = model.predict(val_X)

    mae = mean_absolute_error(val_y, y_pred)
    mae_results[leaf_nodes] = mae

# Leaf cap with the smallest error (the first one wins on ties) and its MAE.
best_leaf, MAE_leaf = min(mae_results.items(), key=lambda item: item[1])

print("MAE leaf: ", f'{MAE_leaf:.4f}',
      "\nОптимальное max_leaf_nodes: ", best_leaf)

results["best_leaf"] = MAE_leaf

MAE leaf:  0.4983 
Оптимальное max_leaf_nodes:  3


In [None]:
# Decision tree constrained by both limits found in the separate sweeps.
model = DecisionTreeRegressor(
    random_state=1,
    max_leaf_nodes=best_leaf,
    max_depth=best_depth,
).fit(train_X, train_y)

# Validation error of the doubly-constrained tree.
y_pred = model.predict(val_X)
MAE_depth_leaf = mean_absolute_error(val_y, y_pred)

print("MAE при оптим парам:", f'{MAE_depth_leaf:.4f}')
results["depth leaf"] = MAE_depth_leaf

MAE при оптим парам: 0.4983


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters; only the seed is fixed.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Score the forest on the held-out rows.
forest_preds = forest_model.predict(val_X)
MAE_RandomForest = mean_absolute_error(val_y, forest_preds)

print("MAE RandomForest:", f'{MAE_RandomForest:.4f}')
results["RandomForest"] = MAE_RandomForest

MAE RandomForest: 0.4644


In [None]:
# Random forest whose trees reuse the depth and leaf limits that were
# selected for the single decision tree.
forest_params = RandomForestRegressor(
    random_state=1,
    max_depth=best_depth,
    max_leaf_nodes=best_leaf,
).fit(train_X, train_y)

# Score the constrained forest on the held-out rows.
y_pred = forest_params.predict(val_X)
MAE_RandomForest_params = mean_absolute_error(val_y, y_pred)

print("MAE RandomForest с параметрами:", f'{MAE_RandomForest_params:.4f}')
results["RandomForest2"] = MAE_RandomForest_params

MAE RandomForest с параметрами: 0.4519


In [None]:
# Print one line per experiment: its label and validation MAE (4 decimals),
# in the order the experiments were added to `results`.
for key in results:
    value = results[key]
    print(key, f'\t{value:.4f}')

tr-test split 	0.5705
best_depth 	0.5084
best_leaf 	0.4983
depth leaf 	0.4983
RandomForest 	0.4644
RandomForest2 	0.4519
