In [None]:
#importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold, LeaveOneOut, cross_val_predict

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.metrics import mean_squared_error

# Settings
# Apply seaborn's "whitegrid" style to the plots created in this notebook.
sns.set(style="whitegrid")


In [None]:
# ---------------------------------------------------------------------------
# Data preparation: load the CSV, one-hot encode the categorical columns,
# standardize the numerical columns and split into features X / target y.
# ---------------------------------------------------------------------------
df = pd.read_csv('complete_data.csv')

# Keep the target on its original scale; PISA_Total_Average is standardized
# together with the other numerical columns further down.
df_orig_pisa = df['PISA_Total_Average']

# One-hot encoding (first level dropped as the reference category) for
# income-level classification (updated from recit 10), world region and entity.
df_income = pd.get_dummies(df['classification'], prefix='classification', drop_first=True)
df_region = pd.get_dummies(df['World_Region'], prefix='World_Region', drop_first=True)
df_entity = pd.get_dummies(df['Entity'], prefix='Entity', drop_first=True)

# Add the encoded variables to the original data frame.
df = pd.concat([df, df_income, df_region, df_entity], axis=1)

# Encoded categorical variables to exclude from standardization.
exclude = list(df_income) + list(df_region) + list(df_entity)
print(exclude)

# Columns not used for ML are dropped for clarity. 'Year' is not in this list,
# so it remains in the frame and is standardized like any other numerical column.
unused_columns = [
    "Entity",
    "classification",
    "World_Region",
    "Population",

    "Math_PISA_Female",
    "Math_PISA_Male",
    "Math_PISA_Total",

    "Read_PISA_Female",
    "Read_PISA_Male",
    "Read_PISA_Total",

    "Sci_PISA_Female",
    "Sci_PISA_Male",
    "Sci_PISA_Total",

    'PISA_Girls_Average',
    'PISA_Boys_Average',

    'Mean_School_Years_M',
    'Mean_School_Years_F',
]
# df now has all features and the target column, including the encoded
# categorical variables. A missing name in `unused_columns` still raises.
df = df.drop(columns=unused_columns)
# 'Unnamed: 0' is only present when the CSV was written with its index;
# tolerate its absence instead of failing with a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Numerical columns to standardize: everything that is not an encoded dummy.
# This includes the target PISA_Total_Average.
encoded_names = set(exclude)
numerical_columns = [column for column in df.columns if column not in encoded_names]

print(numerical_columns)  # to check the column names

# NOTE(review): the scaler is fitted on all rows before any cross-validation
# split is made, so fold statistics are shared between train and validation
# rows - confirm this is acceptable for the intended evaluation.
scaler = StandardScaler()

# Only numerical columns are scaled.
scaled_values = scaler.fit_transform(df[numerical_columns])

# Partial data frame of scaled values with the same column names and index.
df_scaled = pd.DataFrame(scaled_values, columns=numerical_columns, index=df.index)

# Cast only the dummy columns to integers (instead of multiplying the whole
# frame by 1, which also touched the float columns).
df_dummies = pd.concat([df_income, df_region, df_entity], axis=1).astype('int64')

# Scaled and encoded values are combined back; df is now rewritten with
# standardized numerical values followed by the 0/1 dummy columns.
df = pd.concat([df_scaled, df_dummies], axis=1)

# Separate data frames to be used in the ML models.
X = df.drop(columns='PISA_Total_Average')  # all features
y = df['PISA_Total_Average']  # standardized target: errors are in std-dev units

print(y.head())

X.to_csv("complete_data_encodedxx.csv")

X.head()

['classification_Low-income countries', 'classification_Lower-middle-income countries', 'classification_Upper-middle-income countries', 'World_Region_Asia', 'World_Region_Europe', 'World_Region_North America', 'World_Region_Oceania', 'World_Region_South America', 'Entity_Algeria', 'Entity_Argentina', 'Entity_Australia', 'Entity_Austria', 'Entity_Belarus', 'Entity_Belgium', 'Entity_Bosnia and Herzegovina', 'Entity_Brazil', 'Entity_Brunei', 'Entity_Bulgaria', 'Entity_Cambodia', 'Entity_Canada', 'Entity_Chile', 'Entity_Colombia', 'Entity_Costa Rica', 'Entity_Croatia', 'Entity_Cyprus', 'Entity_Czechia', 'Entity_Denmark', 'Entity_Dominican Republic', 'Entity_El Salvador', 'Entity_Estonia', 'Entity_Finland', 'Entity_France', 'Entity_Georgia', 'Entity_Germany', 'Entity_Greece', 'Entity_Guatemala', 'Entity_Hong Kong', 'Entity_Hungary', 'Entity_Iceland', 'Entity_Indonesia', 'Entity_Ireland', 'Entity_Israel', 'Entity_Italy', 'Entity_Jamaica', 'Entity_Japan', 'Entity_Jordan', 'Entity_Kazakhstan',

Unnamed: 0,Year,EduSpend_GDP,GDP_PPP_perCapita,Gini_Coefficient,Gender_Inequality,GovEduSpend_Pct,Avg_School_Years,Expected_School_Years,HDI,Corruption_Index,...,Entity_Trinidad and Tobago,Entity_Tunisia,Entity_Turkey,Entity_Ukraine,Entity_United Arab Emirates,Entity_United Kingdom,Entity_United States,Entity_Uruguay,Entity_Uzbekistan,Entity_Vietnam
0,-1.502837,-0.856356,-1.068267,-0.763326,0.184493,-0.607354,-0.611713,-1.73498,-1.292986,1.340299,...,0,0,0,0,0,0,0,0,0,0
1,-1.078506,-1.078962,-1.016115,-0.595039,0.078031,-0.607354,-0.526086,-1.441734,-1.056468,1.357391,...,0,0,0,0,0,0,0,0,0,0
2,-0.654176,-1.038373,-0.957632,-0.731048,-0.040955,-0.757809,-0.442518,-0.929942,-0.778817,1.278768,...,0,0,0,0,0,0,0,0,0,0
3,-0.229846,-0.832733,-0.885472,-0.824459,-0.448015,-0.665533,-0.311499,-0.579946,-0.532016,1.394993,...,0,0,0,0,0,0,0,0,0,0
4,0.194485,-0.948702,-0.847682,-0.932624,-0.416702,-0.543714,-0.064737,0.464697,-0.151531,1.425759,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score



# Baseline: XGBoost regressor with default hyperparameters, evaluated with
# 5-fold cross-validation on the prepared features X and target y.
# NOTE(review): the name `xgb` rebinds the `import xgboost as xgb` module alias
# from the imports cell; it is kept here because later cells may rely on it,
# but consider renaming it (e.g. `xgb_model`) across the notebook.
xgb = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42
)

# 5-fold CV, shuffled with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One CV run is enough: the RMSE of a fold is the square root of its MSE, so a
# second cross_val_score pass over the same folds would only repeat the fits.
neg_mse_scores = cross_val_score(xgb, X, y,
                                 scoring='neg_mean_squared_error',
                                 cv=kf,
                                 n_jobs=-1)

# scikit-learn reports errors negated (higher is better); flip the sign back.
mse_scores = -neg_mse_scores
rmse_scores = np.sqrt(mse_scores)
neg_rmse_scores = -rmse_scores  # kept for code that expects the negated form

print("MSE per fold: ", np.round(mse_scores, 4))
print("Mean MSE:     ", np.round(mse_scores.mean(), 4))
print()
print("RMSE per fold:", np.round(rmse_scores, 4))
print("Mean RMSE:    ", np.round(rmse_scores.mean(), 4))


MSE per fold:  [0.0613 0.0622 0.0846 0.0875 0.1277]
Mean MSE:      0.0847

RMSE per fold: [0.2476 0.2494 0.2908 0.2958 0.3573]
Mean RMSE:     0.2882


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the XGBoost regressor
# (5 x 5 x 4 = 100 candidate combinations).
param_grid = {
    'n_estimators': [50, 100, 300, 400, 500],
    'max_depth':    [2, 3, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.3, 0.35],
}

# Same fold definition as the baseline evaluation (5 folds, shuffled, seed 42).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=XGBRegressor(objective='reg:squarederror', random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1  # to print out how many candidates and fits
)

grid.fit(X, y)

print("Best params:", grid.best_params_)
mse = -grid.best_score_  # mean MSE over the folds for the best candidate
rmse = mse**0.5          # square root of the mean MSE
print(f"CV MSE (best): {mse:.4f}")
print(f"CV RMSE (best): {rmse:.4f}")

# The square root of the mean MSE is not the same quantity as the mean of the
# per-fold RMSEs (the "Mean RMSE" printed for the default model). Compute the
# latter from the per-fold scores so both models are compared like for like.
best_fold_mse = -np.array([
    grid.cv_results_[f"split{fold}_test_score"][grid.best_index_]
    for fold in range(grid.n_splits_)
])
fold_rmse = np.sqrt(best_fold_mse)
print(f"Mean fold RMSE (best): {fold_rmse.mean():.4f}")

# NOTE(review): the best score is selected and reported on the same folds, so
# it is an optimistic estimate; a held-out set or nested CV would be needed
# for an unbiased figure.



Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params: {'learning_rate': 0.3, 'max_depth': 2, 'n_estimators': 500}
CV MSE (best): 0.0578
CV RMSE (best): 0.2405


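* After hyperparameter tuning, the cross-validated error decreased (mean CV MSE: 0.0847 → 0.0578), i.e. the tuned model performs better than the one with default hyperparameters.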