In [1]:
## Importing libraries
import time
import warnings

import eli5

warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use("ggplot")
%matplotlib inline


import optuna
from optuna.integration import LightGBMPruningCallback

optuna.logging.set_verbosity(optuna.logging.WARNING)

import plotly.express as px
from catboost import CatBoostRegressor, Pool
from lightgbm import DaskLGBMRegressor, LGBMRegressor
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RepeatedKFold, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, RobustScaler
from xgboost import DMatrix, XGBRegressor

  from pandas import MultiIndex, Int64Index


In [4]:
!pwd

/home/balde/Desktop/Python/ELP/Model


## Reading the data

In [3]:
train = pd.read_csv("Model/Train.csv")

test = pd.read_csv("Model/est.csv")
sub = pd.read_csv("Model/SampleSubmission (1).csv")
vard = pd.read_csv("Model/VariableDescription.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Model/Train.csv'

In [None]:
train.head()

In [None]:
train.info()

In [None]:
df_train = train.copy()
df_test = test.copy()

In [None]:
# train.columns[train.columns.str.contains("observe")].to_list()
observe_columns = [
    "child_observe_attentive",
    "child_observe_concentrated",
    "child_observe_diligent",
    "child_observe_interested",
]

mapper_observe = {"Sometimes": 1, "Almost never": 0, "Often": 2, "Almost always": 3}

In [None]:
social_columns = [
    "teacher_social_initiative",
    "teacher_social_peers",
    "teacher_social_nonaggressive",
    "teacher_social_cooperate",
    "teacher_social_assistance",
    "teacher_social_ideas",
    "teacher_social_initiative",
]

mapper_social = {
    "None of the time": 0,
    "A little of the time": 1,
    "Most of the time": 2,
    "All of the time": 3,
}

In [None]:
# Function to map values to the columns
def mapper(df, map_columns, map_values):
    for col in map_columns:
        df[col] = df[col].map(map_values)
    return df

In [None]:
# Mapping values
df_train = mapper(df_train, observe_columns, mapper_observe)
df_test = mapper(df_test, observe_columns, mapper_observe)

df_train = mapper(df_train, social_columns, mapper_social)
df_test = mapper(df_test, social_columns, mapper_social)

In [None]:
df_train.columns[df_train.columns.str.contains("child")]

In [None]:
# map_years = {'1st year in the programme': 1,
#              '2nd year in programme': 2,
#              '3rd year in programme': 3,
#              'Do Not Know':0}

# df_train["child_years_in_programme"] = df_train["child_years_in_programme"].map(map_years)
# df_test["child_years_in_programme"] = df_test["child_years_in_programme"].map(map_years)

In [None]:
drop_cols = [
    "count_staff_gender_other",
    "pri_days",
    "obs_lighting_8",
    "count_practitioners_all",
    "count_register_gender_other",
    "obs_heating_3",
    "obs_lighting_5",
    "pri_difficult_see",
    "pri_difficult_walk",
    "obs_lighting_6",
    "teacher_social_initiative",
]
df_train[drop_cols].isnull().sum()

In [None]:
len(df_train.query("child_gender == 'Female'"))

In [None]:
# df_train["child_gender"] = df_child["child_gender"].map({"Male": })

In [None]:
# pd.set_option("display.max_rows", 50)
# df_train["child_dob"] =
# df_train["child_dob"] = pd.to_datetime(df_train["child_dob"],yearfirst=True)
# df_test["child_dob"] = pd.to_datetime(df_test["child_dob"],yearfirst=True)

# df_train["child_dob_month"] = df_train["child_dob"].dt.month
# df_test["child_dob_month"] = df_test["child_dob"].dt.month

# Machine Learning Algorithms

## Training with numerical columns only

In [None]:
n_splits = 10

kf = KFold(n_splits, shuffle=True, random_state=42)
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=2, random_state=42)

In [None]:
# # Cross validation
def cross_validation(X, y, model, cv_method):
    rmse_scores = list()

    # cv = cv_method(n_splits = N_SPLITS, n_repeats=5, random_state=1121218)

    for idx, (train_idx, test_idx) in enumerate(cv_method.split(X, y)):

        print("=" * 12 + f"Training fold {idx}" + 12 * "=")
        start = time.time()

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Making eval_set
        eval_set = [(X_test.values, y_test)]

        # Fitting the model to the data
        model.fit(X_train.values, y_train)

        # Predictions
        preds = model.predict(X_test)

        # Loss
        rmse = mean_squared_error(y_test, preds, squared=False)
        rmse_scores.append(rmse)

        runtime = time.time() - start

        print(f"Fold {idx} finished with score: {rmse:.5f} in {runtime:.2f} seconds.\n")

    print(f"Average score {np.mean(rmse_scores)}")

    return np.mean(rmse_scores), rmse_scores


# rmse, cv_scores = cross_validation(X, y, hgbr, k_fold)

In [None]:
# Getting feature importances
def get_feature_imp(model, imp_factor):
    cols_imp = [
        (col, imp)
        for col, imp in zip(model.feature_name_, model.feature_importances_)
        if imp < imp_factor
    ]
    cols = [
        col
        for col, imp in zip(model.feature_name_, model.feature_importances_)
        if imp < imp_factor
    ]

    return cols, cols_imp

In [None]:
def feature_imp_dataframe(model):
    feature_imp = model.feature_importances_
    columns = model.feature_name_

    df_imp = pd.DataFrame({"features": columns, "feature_importance": feature_imp})

    df_imp.sort_values(by=["feature_importance"], ascending=False, inplace=True)
    df_imp.reset_index(drop=True, inplace=True)

    return df_imp

In [None]:
def plot_feature_importance(df_feature_imp, n_imp_cols):
    labels = df_feature_imp.iloc[:30, 0].values

    plt.figure(figsize=(20, 6))
    ax = df_feature_imp.iloc[:30, 1].plot(kind="bar", title="Feature Importance")
    ax.set_xticklabels(labels)
    plt.show()

### Selecting numerical columns from df_train and df_test

### Lightgbm

In [None]:
cols_list_2 = [
    "pri_difficult_see",
    "count_staff_contract_substitute",
    "pri_difficult_walk",
    "obs_lighting_4",
    "obs_heating_3",
    "count_register_gender_other",
    "count_staff_paid_managers",
    "obs_lighting_5",
    "count_practitioners_all",
    "obs_cooking_4",
    "pri_days",
    "count_staff_gender_other",
    "obs_cooking_5",
    "obs_lighting_6",
    "obs_lighting_3",
]

cols_list_3 = [
    "pri_difficult_see",
    "count_staff_contract_substitute",
    "pri_difficult_walk",
    "obs_lighting_4",
    "obs_heating_3",
    "count_register_gender_other",
    "count_staff_paid_managers",
    "obs_lighting_5",
    "count_practitioners_all",
    "obs_cooking_4",
    "pri_days",
    "count_staff_gender_other",
    "obs_cooking_5",
    "obs_lighting_6",
    "obs_lighting_3",
    "language_match",
    "teacher_social_initiative",
    "obs_lighting_8",
    "obs_heating_7",
]

In [None]:
train_int_cols = df_train.select_dtypes(
    include=["float64", "number", "Int64", "int64"]
).columns
test_int_cols = df_test.select_dtypes(
    include=["float64", "number", "Int64", "int64"]
).columns

# Numerical columns
num_cols = list(set(test_int_cols).intersection(set(train_int_cols)))


X, y = df_train[num_cols].drop(columns=drop_cols), df_train[["target"]]
# X, y = df_train[num_cols].drop(columns=cols_list_2), df_train[['target']]
# X, y = df_train[num_cols].drop(columns=cols_list_3), df_train[['target']]

# Selecting the features for test-data
# Prediction on test set
test_data = df_test[X.columns.to_list()]

In [None]:
X.shape, y.shape

In [None]:
X.head()

# Capturing NaN per row and making a new feature

In [None]:
def feature_engineering(df):
    df["NaN_row"] = df.isna().sum(axis=1)
    df["std"] = df.std(axis=1)
    return df


X = feature_engineering(X)
test_data = feature_engineering(test_data)

# Filling missing values

In [None]:
pipeline = Pipeline(
    [
        ("impute", SimpleImputer()),
        ("scale", MinMaxScaler())
        # ('quantiletransform', QuantileTransformer(random_state=seed))
    ]
)

X = pd.DataFrame(columns=X.columns, data=pipeline.fit_transform(X))
test_data = pd.DataFrame(columns=test_data.columns, data=pipeline.transform(test_data))

In [None]:
X.head()

In [None]:
(X.var()[:-50])

In [None]:
params = {
    "learning_rate": 0.01,
    # "objective": "regression",
    "objective": "tweedie",
    "n_estimators": 4000,
    "num_leaves": 35,
    "max_bin": 240,
    "colsample_bytree": 0.3,
    "max_depth": 9,
    "min_child_samples": 20,
    "random_state": 42,
}

In [None]:
best_trial = {
    "n_estimators": 54884,
    "reg_alpha": 1.511619688143909,
    "reg_lambda": 0.46134859372486536,
    "colsample_bytree": 0.3,
    "subsample": 1.0,
    "learning_rate": 0.02,
    "max_depth": 20,
    "num_leaves": 16,
    "min_child_samples": 21,
    "min_data_per_groups": 48,
}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=55
)

# Training the model with reduced columns
lgbm = LGBMRegressor(**params)

lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="rmse",
    early_stopping_rounds=100,
    verbose=False,
)

# Get score
lgbm.best_score_

In [None]:
get_feature_imp(lgbm, 5)

In [None]:
# Training the model on whole dataset
lgbm = LGBMRegressor(**params)
lgbm.fit(X, y)

In [None]:
fea_imp = pd.DataFrame({"imp": lgbm.feature_importances_, "col": X.columns})
fea_imp = fea_imp.sort_values(["imp", "col"], ascending=True).iloc[-30:]
_ = fea_imp.plot(kind="barh", x="col", y="imp", figsize=(20, 10))
plt.savefig("catboost_feature_importance.png")

In [None]:
# Prediction on test set
# test_data = df_test[X.columns.to_list()]

preds = lgbm.predict(test_data)
preds

In [None]:
# Creating a submissionfile
sub_file = pd.read_csv("GBRSubmission1_score_10.63.csv")
sub_file.target = preds
sub_file.to_csv("LGBMSubmission.csv", index=False)

In [None]:
kjkjkd

In [None]:
import optuna
from sklearn.model_selection import train_test_split


def objective(trial, data=X, target=y):

    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    param = {
        "metric": "rmse",
        "random_state": 42,
        # "n_estimators": 20000,
        "n_estimators": trial.suggest_int("n_estimators", 100, 100_000),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
        "colsample_bytree": trial.suggest_categorical(
            "colsample_bytree", [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        ),
        "subsample": trial.suggest_categorical(
            "subsample", [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
        ),
        "learning_rate": trial.suggest_categorical(
            "learning_rate", [0.001, 0.008, 0.01, 0.014, 0.017, 0.02, 0.03, 0.04, 0.05]
        ),
        "max_depth": trial.suggest_categorical(
            "max_depth",
            [
                6,
                7,
                8,
                8,
                10,
                11,
                13,
                14,
                15,
                20,
            ],
        ),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "cat_smooth": trial.suggest_int("min_data_per_groups", 1, 100),
    }
    model = LGBMRegressor(**param)

    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=100,
        verbose=False,
    )

    preds = model.predict(test_x)

    rmse = mean_squared_error(test_y, preds, squared=False)

    return rmse


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)