In [1]:
import os
# Switch to the folder that the relative paths used by later cells resolve against.
# Written as a raw string: the previous literal only worked because "\c", "\O" and
# "\D" are unrecognised escapes that Python currently leaves untouched (with a
# warning); a raw string yields the identical path without relying on that.
# NOTE(review): this is a machine-specific absolute path - adjust it (or make it
# configurable) before running on another machine.
os.chdir(r"C:\Users\cathy\OneDrive\Documents")

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, VotingRegressor
from sklearn.feature_selection import RFE
from sklearn import impute
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [3]:
# One shared seed for both NumPy's global RNG and the stdlib `random` module,
# so a fresh top-to-bottom run is repeatable
seed_value = 42
for seed_fn in (np.random.seed, random.seed):
    seed_fn(seed_value)

In [4]:
# Read the training and test tables from the current working directory
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## Data Cleaning

In [5]:
# Separate the response 'y' from the predictors; the 'id' column is not a
# feature and is removed from both feature frames
X_train = train.drop(columns=['id', 'y'])
y_train = train['y']
X_test = test.drop(columns='id')

In [6]:
# Fill missing values with an 8-nearest-neighbour imputer.
# The imputer is fitted on the training features only and then applied to
# both frames; the results are wrapped back into (unlabelled) DataFrames.
KNN = KNNImputer(n_neighbors=8)
KNN.fit(X_train)
X_train_i, X_test_i = (
    pd.DataFrame(KNN.transform(features)) for features in (X_train, X_test)
)

In [7]:
# The imputed frames were rebuilt from bare arrays, so copy the original
# column labels back onto them
for imputed, original in ((X_train_i, X_train), (X_test_i, X_test)):
    imputed.columns = original.columns

In [8]:
# Standardise the features: the scaler is fitted on the imputed training
# frame only, then the same transform is applied to train and test
scaler = StandardScaler()
scaler.fit(X_train_i)
X_train_cleaned, X_test_cleaned = (
    pd.DataFrame(scaler.transform(imputed)) for imputed in (X_train_i, X_test_i)
)

In [9]:
# Scaling returned bare arrays as well; put the original feature names back
for scaled, original in ((X_train_cleaned, X_train), (X_test_cleaned, X_test)):
    scaled.columns = original.columns

In [10]:
# Columns that hold a single distinct value in the training data.
# (Despite its name, `drop_rows` collects column labels, not row labels.)
drop_rows = [
    col for col in X_train_cleaned.columns
    if X_train_cleaned[col].nunique() == 1
]

# Remove those columns from both the training and the test features
X_train_cleaned = X_train_cleaned.drop(columns=drop_rows)
X_test_cleaned = X_test_cleaned.drop(columns=drop_rows)

In [11]:
# Drop duplicated feature columns (the first occurrence of each is kept).
# Duplicates are detected on the training frame only and the very same column
# mask is then applied to the test frame. De-duplicating each frame on its own
# can remove different columns from each, which would leave train and test
# with mismatched features for the positional feature selection that follows.
# Both frames are expected to have the same columns in the same order here;
# a boolean mask of the wrong length raises instead of silently misaligning.
duplicate_mask = X_train_cleaned.T.duplicated().to_numpy()
X_train_cleaned = X_train_cleaned.loc[:, ~duplicate_mask]
X_test_cleaned = X_test_cleaned.loc[:, ~duplicate_mask]

## Feature Selection

In [12]:
# Feature selection: fit a random forest on all cleaned features and keep
# only those to which it assigns a non-zero importance.
# random_state is pinned to the notebook-wide seed so that re-running this
# cell alone reproduces the same selection instead of depending on how much
# of the global NumPy random stream has already been consumed.
rf_model = RandomForestRegressor(n_jobs=-1, random_state=seed_value)
rf_model.fit(X_train_cleaned, y_train)

# One importance per column, in the column order of X_train_cleaned
importances = rf_model.feature_importances_

# Positions of the features with non-zero importance
selected_indices = np.where(importances != 0)[0]

# Filter both frames by column *label* rather than by position: the test
# frame then receives exactly the features chosen on the training frame, and
# a feature missing from the test frame raises a KeyError instead of being
# silently replaced by whatever column happens to sit at that position.
selected_columns = X_train_cleaned.columns[selected_indices]
X_train_feat = X_train_cleaned.loc[:, selected_columns]
X_test_feat = X_test_cleaned.loc[:, selected_columns]

## CatBoost model (BaggingRegressor)

In [13]:
# Base learner: CatBoost with its training log silenced
base_model = CatBoostRegressor(verbose=False)

# Bagging ensemble of 20 CatBoost fits.
# The base learner is passed positionally on purpose: scikit-learn renamed the
# keyword from `base_estimator` to `estimator` in 1.2 and removed the old
# spelling in 1.4, but the base learner is the first positional parameter
# under both names, so this call works across those versions.
bagged_model = BaggingRegressor(base_model, n_estimators=20, random_state=1)

# Train on the log of the target; predictions therefore come back on the log
# scale and have to be mapped back with np.exp
bagged_model.fit(X_train_feat, np.log(y_train))




In [14]:
# In-sample predictions, mapped back from the log scale the model was fit on
train_log_pred = bagged_model.predict(X_train_feat)
y_pred = np.exp(train_log_pred)

# Test predictions, still on the log scale
test_predict = bagged_model.predict(X_test_feat)

# Shift the back-transformed test predictions by the mean training residual
mean_train_residual = np.mean(y_train - y_pred)
catboost_pred = np.exp(test_predict) + mean_train_residual

In [15]:
# Display the residual-shifted test predictions as the cell output
catboost_pred

array([5.51075704, 7.45518683, 4.4238378 , ..., 8.71784289, 9.22645249,
       7.61973604])

In [16]:
# Allowed range for the submitted target.
# NOTE(review): bounds kept from the original clip call; confirm they match
# the competition's valid range for y.
Y_MIN, Y_MAX = 1, 100

# Pair every test id with its prediction. The id column is looked up by its
# label (the same 'id' column that is dropped when the features are built)
# instead of by position, so the export does not depend on 'id' being the
# first column of test.csv.
predictions = pd.DataFrame({"id": test["id"], "y": catboost_pred})

# Clip the predicted y-values in case they fall outside the allowed range
predictions["y"] = predictions["y"].clip(lower=Y_MIN, upper=Y_MAX)

# Export the predictions as a csv file for Kaggle submission
predictions.to_csv("6_2_2.csv", index=False)