In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_log_error
from copy import deepcopy
 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy import stats
from copy import deepcopy

from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split

PATH_DATA = '/kaggle/input/ef-msu-2024-comp-1/'

In [2]:
def RMSLE(y_true, y_pred):
    """
    The Root Mean Squared Log Error (RMSLE) metric

    :param y_true: The ground truth labels given in the dataset
    :param y_pred: Our predictions

    :return: The RMSLE score
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [3]:
train = pd.read_csv(PATH_DATA + "train.csv")
test = pd.read_csv(PATH_DATA + "test.csv")
test_ids = test["id"]

In [4]:
train_y = np.log1p(train["charges"])
train.drop(["charges"], axis=1, inplace=True)

In [5]:
train = pd.concat([train, pd.get_dummies(train[["sex" , "smoker", "region"]])], axis=1).drop(["sex", "smoker" , "id","region"], axis=1 )

test = pd.concat(
    [test, pd.get_dummies(test[["sex", "smoker", "region"]])], axis=1).drop(["sex", "smoker", "id","region"], axis=1)

In [6]:
train.head(10)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,43,26.03,0,False,True,True,False,True,False,False,False
1,58,28.215,0,True,False,True,False,False,True,False,False
2,53,31.35,0,False,True,True,False,False,False,True,False
3,54,29.2,1,False,True,True,False,False,False,False,True
4,19,34.9,0,False,True,False,True,False,False,False,True
5,39,22.8,3,True,False,True,False,True,False,False,False
6,51,25.4,0,False,True,True,False,False,False,False,True
7,25,29.7,3,False,True,False,True,False,False,False,True
8,62,37.4,0,False,True,True,False,False,False,False,True
9,23,33.4,0,True,False,True,False,False,False,False,True


In [7]:
numerical_columns = ['age', 'bmi' ] #identifying the numerical columns to apply z-score # Applying z-score transformation to numerical columns
 
train[numerical_columns] = train[numerical_columns].apply(sp.stats.zscore)
test[numerical_columns] = test[numerical_columns].apply(sp.stats.zscore)

In [8]:
test.head(10)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.410356,-0.937179,2,True,False,True,False,True,False,False,False
1,-0.243282,-0.151337,0,True,False,True,False,False,True,False,False
2,1.790259,-0.659823,0,True,False,False,True,False,True,False,False
3,0.482982,-0.844727,3,False,True,True,False,False,True,False,False
4,-1.477932,0.156836,0,False,True,False,True,False,True,False,False
5,-0.388535,1.937753,1,False,True,True,False,False,False,False,True
6,-1.477932,-1.368622,0,True,False,True,False,False,True,False,False
7,1.790259,1.127582,0,False,True,True,False,False,True,False,False
8,-0.824294,-2.216098,0,True,False,True,False,True,False,False,False
9,0.700862,-0.367059,3,False,True,True,False,False,True,False,False


In [9]:
def cv_and_predict(
    df_train,
    df_test,
    train_y,
    model,
    n_splits=5,
    random_state=422,
    metric=RMSLE
):
    """
    Функция для кросс-валидации и предикта на тест

    :param df_train: Трейн-датафрейм
    :param df_test: Тест-датафрейм
    :param train_y: Ответы на трейн
    :param model: Модель, которую мы хотим учить
    :param n_splits: Количество сплитов для KFold
    :param random_state: random_state для KFold

    :return: pred_test: Предсказания на тест; oof_df: OOF предсказания
    """

  
    kf =  KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
 
 

    # В датафрейме oof_df будут храниться настоящий таргет трейна и OOF предсказания на трейн.
    # Инициализируем prediction_oof нулями и будем заполнять предсказаниями в процессе валидации
    oof_df = pd.DataFrame()
    oof_df["target"] = train_y
    oof_df["prediction_oof"] = np.zeros(oof_df.shape[0])

    # Список с метриками по фолдам
    metrics = [] 

    # Предсказания на тест. Инициализируем нулями и будем заполнять предсказаниями в процессе валидации.
    # Наши предсказания будут усреднением n_splits моделей
    pred_test = np.zeros(df_test.shape[0])

    
    
    # Кросс-валидация
    for i, (train_index, valid_index) in enumerate( kf.split(df_train, train_y)):
        #print(f"fold_{i} started")

        X_train = df_train.loc[train_index]
        y_train = train_y.loc[train_index].values

        X_valid = df_train.loc[valid_index]
        y_valid = train_y.loc[valid_index].values

        model_kf = deepcopy(model)

        model_kf.fit(X_train, y_train)

        prediction_kf = model_kf.predict(df_test)
        pred_test += prediction_kf / n_splits

        prediction = model_kf.predict(X_valid)
        oof_df.loc[valid_index, "prediction_oof"] = prediction

        cur_metric = metric(y_valid, prediction)
        metrics.append(cur_metric)
        #print(f"metric_{i}: {cur_metric}")

        #print()
        #print("_" * 100)
        #print()

    print(f"metric_OOF: {metric(train_y, oof_df['prediction_oof'])}")
    print(f"metric_AVG: {np.mean(metrics)}")
    print(f"metric_std: {np.std(metrics)}")
    print()
    print("*" * 100)
    print()

    return pred_test, oof_df

In [10]:

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import mean_squared_error,r2_score 



In [11]:
params = {'n_estimators':120, 'learning_rate':0.04}

### Тренируем
 
model_gbr = GradientBoostingRegressor(**params)
#for i in range (7,15):
#    print(i)
pred_test, oof_df = cv_and_predict(train, test, train_y, model_gbr, n_splits=11)



metric_OOF: 0.04040326167535003
metric_AVG: 0.03930257146408231
metric_std: 0.009286262405956097

****************************************************************************************************



In [12]:
submission = pd.DataFrame()
submission["id"] = test_ids
submission["charges"] = np.expm1(pred_test)  

In [13]:
submission.head()


Unnamed: 0,id,charges
0,770,9194.208538
1,771,5723.4903
2,772,25350.984022
3,773,8955.802227
4,774,30511.231024


In [14]:
submission.to_csv("submission.csv", index=False)