In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from itertools import combinations
from tqdm import tqdm
from time import time

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


In [2]:
X_train = pd.read_csv('X_train.csv', index_col=0).sort_index()
y_train = pd.read_csv('y_train.csv', index_col=0).sort_index()


In [3]:
print(f"Total number of individuals : {len(X_train)}.")
years = X_train['Year'].unique()
for year in years:
  count_year = len(X_train[X_train['Year']==year])
  print(f"Pour l'année {year}, il y a {count_year} individus.")

Total number of individuals : 1172086.
Pour l'année 2022, il y a 412647 individus.
Pour l'année 2018, il y a 411320 individus.
Pour l'année 2015, il y a 348119 individus.


# Choosing which columns to keep

In [5]:
X_train.shape

(1172086, 306)

In [6]:
count_null = X_train.isna().sum()
kept_cols = list(count_null[count_null < 0.65 * len(X_train)].index)

exclude_substrings = ['_total_timing', '_average_score', 'CNTRYID', 'CNTSTUID', 'BOOKID']

final_cols = [col for col in kept_cols if not any(substr in col for substr in exclude_substrings)]

X_train = X_train[final_cols]


In [7]:
#The different types of columns 
categorical1 = ['Year', 'CNT', 'CYC', 'STRATUM', 'OECD', 'ADMINMODE', 'ST004D01T', 'IMMIG', 'CNTSCHID']
categorical_cols = []
binary_cols = []
numeric_cols = []
categorical_threshold = 5

for col in X_train.columns:
    if len(set(X_train[col].dropna().unique()))==2:
        binary_cols.append(col)
    elif col in categorical1:
        categorical_cols.append(col)
    elif X_train[col].dtype == 'object':
        categorical_cols.append(col)
    elif pd.api.types.is_numeric_dtype(X_train[col]):
        numeric_cols.append(col)

print(f"Categorical columns: {len(categorical_cols)}")
print(f"Binary columns (0/1): {len(binary_cols)}")
print(f"Real/numeric columns: {len(numeric_cols)}")


Categorical columns: 7
Binary columns (0/1): 9
Real/numeric columns: 161


In [8]:
for col in categorical_cols:
    print(f"For {col}: {len(X_train[col].unique())} unique values and {len(X_train[X_train[col].isna()])} missing values.")

For Year: 3 unique values and 0 missing values.
For CNT: 98 unique values and 0 missing values.
For CNTSCHID: 28078 unique values and 0 missing values.
For CYC: 3 unique values and 0 missing values.
For STRATUM: 2731 unique values and 0 missing values.
For OECD: 3 unique values and 0 missing values.
For IMMIG: 4 unique values and 461218 missing values.


# Imputing missing data

In [10]:
X_train.isna().sum()

Year             0
CNT              0
CNTSCHID         0
CYC              0
NatCen           0
             ...  
ST213       760766
ST327       760766
ST251       760766
ST260       760766
PA160       760766
Length: 177, dtype: int64

In [11]:
# Numerical columns
imputer = SimpleImputer(strategy='median')  # or 'mean'
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])

# Categorical or binary columns
imputer_cat = SimpleImputer(strategy='most_frequent')  
X_train[binary_cols] = imputer_cat.fit_transform(X_train[binary_cols])
X_train[categorical_cols] = imputer_cat.fit_transform(X_train[categorical_cols])


In [12]:
X_train.isna().sum()

Year        0
CNT         0
CNTSCHID    0
CYC         0
NatCen      0
           ..
ST213       0
ST327       0
ST251       0
ST260       0
PA160       0
Length: 177, dtype: int64

# Scaling and One hot encoding

In [14]:
# Standard Scaling of the numerical variable 
do_standard_scaler = False 

if do_standard_scaler:
    scaler = StandardScaler()
    scaler.fit(X_train[numeric_cols])
    X_train_final = scaler.transform(X_train[numeric_cols])
    X_train_final = pd.DataFrame(X_train_final, columns=numeric_cols, index=X_train.index)
else:
    X_train_final = X_train[numeric_cols].copy()

# One hot encoding of the binary
for cat_variable in binary_cols:
    dummies = pd.get_dummies(X_train[cat_variable], prefix=cat_variable)
    X_train_final = pd.concat([X_train_final, dummies], axis=1)

#One hot encoding of some of the categorical
for cat_variable in ['Year', 'CNT', 'CYC', 'OECD', 'IMMIG']:
    dummies = pd.get_dummies(X_train[cat_variable], prefix=cat_variable)
    X_train_final = pd.concat([X_train_final, dummies], axis=1)
    if cat_variable in X_train_final.columns:
        X_train_final = X_train_final.drop(columns=[cat_variable])


In [22]:
X_train_final, X_test_final, y_train, y_test = train_test_split(X_train_final, y_train, test_size=0.5, random_state=42)


# Remove the highly correlated features

In [25]:
corr_matrix = X_train_final.corr()

mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

high_corr = corr_matrix.where(mask).stack().sort_values(ascending=False)
high_corr = high_corr[high_corr > 0.9]

to_drop = set()
for f1, f2 in high_corr.index:
    if f2 not in to_drop:
        to_drop.add(f2)

print(f"Dropping {len(to_drop)} variables.")

X_train_final = X_train_final.drop(columns=list(to_drop))
X_test_final = X_test_final.drop(columns=list(to_drop))

Dropping 0 variables.


## Regressions 

In [None]:
models = {
    #"Linear Regression": LinearRegression(),
    #"Ridge Regression": Ridge(alpha=1.0),
    #"Lasso Regression": Lasso(alpha=0.01),
    "Gradient Boosting 1": XGBRegressor(tree_method="hist", max_depth=6, n_estimators=200),
    "Gradient Boosting 2": GradientBoostingRegressor(n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.7,
    min_samples_leaf=5),
    "Random Forest": RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=5,
    max_features='sqrt'
),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    "Elastic Net": ElasticNet(alpha=0.01, l1_ratio=0.5),
    "Decision Tree": DecisionTreeRegressor(max_depth=None),
    "Support Vector Regressor": SVR(kernel="rbf", C=1.0, epsilon=0.1)
}


In [None]:
results = {}
non_na_cols = X_train_final.columns[X_train_final.notna().all()].tolist()

for name, model in models.items():
    beginning = time()
    print(name)
    X_fit = X_train_final[non_na_cols]
    print("X shape:", X_fit.shape, "y shape:", y_train.shape)
    if name in ["Linear Regression", "Ridge Regression", "Lasso Regression"]:
        model.fit(X_fit, y_train)
    else: 
        model.fit(X_fit, y_train.values.ravel())
        #print("X shape:", X_train.shape, "y shape:", y_train.shape)
        #model.fit(X_train_final, y_train)
        #y_pred = model.predict(X_train)
    X_train_fit = X_train_final[non_na_cols]
    X_test_fit  = X_test_final[non_na_cols]

    y_pred_train = model.predict(X_train_fit)
    y_pred_test = model.predict(X_test_fit)

    # Train metrics
    mse_train = mean_squared_error(y_train, y_pred_train)
    r2_train = r2_score(y_train, y_pred_train)

    # Test metrics
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)

    print("Train R2:", r2_train, "Train MSE:", mse_train)
    print("Test  R2:", r2_test, "Test  MSE:", mse_test)

    results[name] = {"Train MSE": mse_train,
                    "Train R2": r2_train,
                    "Test MSE": mse_test,
                    "Test R2": r2_test}
    print(time() - beginning)


In [None]:
y_train.describe()