In [1]:
# Import libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrix
from pyearth import Earth
import random
import itertools

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, \
roc_curve, auc, precision_score, recall_score, confusion_matrix

from sklearn import impute
from sklearn import metrics
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA


from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor

import warnings
# Silence every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries imported above, so API changes will not be visible when cells run.
warnings.filterwarnings('ignore')

## Data Cleaning and Preprocessing

In [2]:
# Read the train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Split data into X and y ('id' is an identifier, not a predictor)
y_train = train.y
X_train = train.drop(['id', 'y'], axis=1)
X_test = test.drop('id', axis=1)

# Take log of y due to skew for later prediction
y_train_log = np.log(y_train)

# Impute with KNNImputer using k=8.
# The imputer is fitted on train only and then applied to test, so no test
# information leaks into the imputation.
imputer = impute.KNNImputer(n_neighbors=8, weights="uniform")
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Scale with StandardScaler (again fitted on train only)
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = pd.DataFrame(scaler.transform(X_train_imputed), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_imputed), columns=X_test.columns)

# Find meaningless cols (a single distinct value in train) and drop them from both sets
same_val_cols = [col for col in X_train_scaled.columns if X_train_scaled[col].nunique() == 1]
X_train_scaled = X_train_scaled.drop(columns=same_val_cols)
X_test_scaled = X_test_scaled.drop(columns=same_val_cols)

# Drop duplicate cols. Duplicates are identified on TRAIN only (keeping the first
# occurrence) and the surviving column set is then applied to test by label.
# De-duplicating each frame independently can leave train and test with different
# columns whenever two columns coincide in one set but not in the other, which
# would silently misalign features in any later positional selection.
is_duplicate_col = X_train_scaled.T.duplicated().to_numpy()
kept_cols = X_train_scaled.columns[~is_duplicate_col]
X_train_scaled = X_train_scaled.loc[:, kept_cols]
X_test_scaled = X_test_scaled.loc[:, kept_cols]

## Feature selection/reduction

In [3]:
# Develop a MARS model for feature selection (additive terms only: max_degree=1)
mars_model = Earth(max_terms=1000, feature_importance_type='rss', max_degree=1)
mars_model.fit(X_train_scaled, y_train)

# Get the feature importances from the MARS model
importances = mars_model.feature_importances_

# Get the positional indices of features with non-zero importance
idx = list(np.where(importances != 0)[0])

# Resolve the positions to column labels on the frame the model was fitted on
selected_features = X_train_scaled.columns[idx]

# Filter the datasets for the important features.
# Test is filtered by LABEL rather than by position: if its columns ever differ
# from train in order or content, this raises a KeyError instead of silently
# feeding the wrong features to the downstream model.
X_train_mars = X_train_scaled.iloc[:, idx]
X_test_mars = X_test_scaled.loc[:, selected_features]

## Final model and predictions

In [4]:
# CatBoostRegressor trained on the log-transformed target
# NOTE: BaggingRegressor clones its estimator and refits every clone on a bootstrap
# sample, so this standalone fit does not influence the bagged predictions; it is
# kept so that `model_cat` remains available as a fitted single model.
model_cat = CatBoostRegressor(verbose = False).fit(X_train_mars, y_train_log)

# Use a bagged CatBoostRegressor model.
# The estimator is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot works with both old and new releases.
bagged_model = BaggingRegressor(model_cat,
                                n_estimators=20,
                                random_state=1,
                                n_jobs=-1).fit(X_train_mars, y_train_log)

# Make predictions using bagged model.
# Predictions are made on the log scale and exponentiated back; the mean training
# residual on the original scale is then added as an additive offset.
y_pred = np.exp(bagged_model.predict(X_train_mars))
intercept = np.mean(y_train-y_pred)
final_pred = np.exp(bagged_model.predict(X_test_mars))+intercept

# Create predictions df (ids are taken by column name, matching how 'id' is
# dropped from the features during preprocessing)
predictions = pd.DataFrame({"id":test["id"], "y":final_pred})

# Clip the predicted y-values in case they are out of range
# NOTE(review): the [1, 100] bounds are presumably the valid target range — confirm.
predictions['y'] = predictions['y'].clip(lower=1, upper=100)

# Export the predictions as a csv file for Kaggle submission
predictions.to_csv("regression_preds_final.csv", index=False)