In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import IsolationForest


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



In [4]:
# Toggle between the Kaggle runtime and a local checkout of the competition data.
server = False

# Only the directory differs between the two environments; the file names are the same.
DATA_DIR = '/kaggle/input/playground-series-s5e2' if server else 'data'

X_small = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col='id')
X_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), index_col='id')
X_extra = pd.read_csv(os.path.join(DATA_DIR, 'training_extra.csv'), index_col='id')

# Stack both training files; the original ids are discarded in favour of a fresh 0..n-1 index.
X_full = pd.concat([X_small, X_extra], ignore_index=True)

# Remove rows with missing values for price (a row without a target cannot be trained on)
X_full = X_full.dropna(axis=0, subset=['Price'])

# TODO: outlier removal (e.g. IsolationForest) was drafted here but disabled; the draft
# referenced `y` before it was assigned, so it has to run after the target split below.

# Separate the target from the features
y = X_full['Price']
X_full = X_full.drop(columns=['Price'])

# Missing-value count per remaining feature column
X_full.isna().sum()




Brand                   126758
Material                110962
Size                     87785
Compartments                 0
Laptop Compartment       98533
Waterproof               94324
Style                   104180
Color                   133617
Weight Capacity (kg)      1808
dtype: int64

In [5]:
# Print the column dtypes and memory usage of the feature frame
X_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 9 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Brand                 object 
 1   Material              object 
 2   Size                  object 
 3   Compartments          float64
 4   Laptop Compartment    object 
 5   Waterproof            object 
 6   Style                 object 
 7   Color                 object 
 8   Weight Capacity (kg)  float64
dtypes: float64(2), object(7)
memory usage: 274.3+ MB


In [6]:
# Break off a validation set (10%) from the training data.
# The seed is fixed so the split, and every score computed on it, is reproducible
# across kernel restarts (same seed as the KFold used for the grid search).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.9, test_size=0.1, random_state=42
)


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# Read every column's dtype once instead of indexing the frame per column
column_dtypes = X_train.dtypes

# Low-cardinality text columns (fewer than 10 distinct values) get one-hot encoded.
# The dtype is tested first so nunique() is only computed for object columns.
categorical_cols = [
    name for name, kind in column_dtypes.items()
    if kind == "object" and X_train[name].nunique() < 10
]

# Numeric columns are passed through after imputation
numerical_cols = [
    name for name, kind in column_dtypes.items()
    if kind in ['int64', 'float64']
]

# Numeric gaps are filled with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Categorical gaps are filled with the mode, then one-hot encoded;
# categories unseen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer (numeric first, then categorical)
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)
preprocessor


In [8]:
from xgboost import XGBRegressor

# Base regressor: up to 1500 boosting rounds at a learning rate of 0.05
xgb_model = XGBRegressor(
    learning_rate=0.05,
    n_estimators=1500,
)

In [9]:
import sklearn
from sklearn.base import clone
from sklearn.metrics import mean_squared_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_model)
                             ])

# Early stopping scores the model on `eval_set`, which bypasses the pipeline's
# preprocessing step, so the validation features are encoded up front.
# (my_pipeline.fit below refits the same preprocessor on the same X_train, so the
# encoding used here and the one the model is trained on agree.)
X_valid_transformed = preprocessor.fit(X_train).transform(X_valid)

# Early stopping is configured on the estimator rather than passed to fit():
# the fit() keyword is deprecated since XGBoost 1.6 and newer releases reject it
# with a TypeError.
my_pipeline.set_params(model__early_stopping_rounds=20)

# Preprocessing of training data, fit model
my_pipeline.fit(
    X_train, y_train,
    model__eval_set=[(X_valid_transformed, y_valid)],
    model__verbose=False
)

# Define the parameter grid
param_grid = {
    'model__n_estimators': [100, 500, 1000],
    'model__learning_rate': [0.01, 0.05, 0.1]
}

# Set up the k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The CV fits receive no eval_set, so the search runs on an unfitted copy of the
# pipeline with early stopping switched off (otherwise XGBoost would demand a
# validation set for every fold). `my_pipeline` itself stays fitted and untouched.
search_pipeline = clone(my_pipeline).set_params(model__early_stopping_rounds=None)

# Set up the grid search
grid_search = GridSearchCV(search_pipeline, param_grid, cv=kf, scoring='neg_mean_squared_error', verbose=3)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print('Best parameters found: ', best_params)

# Evaluate the refitted best model on the held-out validation set
# (mean_squared_error takes y_true first, then the predictions)
preds = grid_search.predict(X_valid)
score = np.sqrt(mean_squared_error(y_valid, preds))
print('RMSE:', score)




Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END model__learning_rate=0.01, model__n_estimators=100;, score=-1517.242 total time=   8.9s
[CV 2/5] END model__learning_rate=0.01, model__n_estimators=100;, score=-1513.539 total time=   8.4s
[CV 3/5] END model__learning_rate=0.01, model__n_estimators=100;, score=-1516.189 total time=   8.5s
[CV 4/5] END model__learning_rate=0.01, model__n_estimators=100;, score=-1512.084 total time=   8.6s
[CV 5/5] END model__learning_rate=0.01, model__n_estimators=100;, score=-1514.662 total time=   8.4s
[CV 1/5] END model__learning_rate=0.01, model__n_estimators=500;, score=-1516.396 total time=  23.4s
[CV 2/5] END model__learning_rate=0.01, model__n_estimators=500;, score=-1512.531 total time=  23.2s
[CV 3/5] END model__learning_rate=0.01, model__n_estimators=500;, score=-1515.204 total time=  23.3s
[CV 4/5] END model__learning_rate=0.01, model__n_estimators=500;, score=-1511.145 total time=  23.3s
[CV 5/5] END model__learning_ra

In [10]:
# The submission is only produced on the Kaggle runtime.
if server:
    # Predict on the test set with the tuned model: grid_search delegates to the best
    # estimator refitted on X_train, i.e. the model whose validation RMSE was reported.
    # (Previously this used `my_pipeline`, the un-tuned model that was never scored.)
    test_pred = grid_search.predict(X_test)

    output = pd.DataFrame({'id': X_test.index,
                           'Price': test_pred})

    output.to_csv('submission.csv', index=False)