In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn import preprocessing

# Step 2: Load the data

Next, we'll load the training and test data.  

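The code cell below calls `pd.read_csv` without `index_col`, so each DataFrame gets pandas' default integer index and `id` stays an ordinary column (it is left out of the feature list later on).  (*If you're not sure how this works, try temporarily adding `index_col=0` to a `pd.read_csv` call and see how it changes the result.*)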

In [53]:
# Read the three competition files in one pass: the fold-annotated training
# set, the test set, and the sample submission used as the output template.
train, test, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train-kfolds/train-folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)
# Last expression of the cell: show the first training rows as the cell output.
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,1
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,3
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,1
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,4


In [55]:
# Separate target from features: every column except the row id, the target
# and the fold assignment is a model input.
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Categorical columns are the feature columns whose name contains 'cat'.
object_cols = [col for col in features if 'cat' in col]

# Learn the category -> integer mapping from the training data so the test set
# is encoded with the same codes the model is trained on. Fitting the encoder
# on the test set itself can assign a different integer to the same category.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(train[object_cols])

# Explicit copy so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
xtest = test[features].copy()
# With the encoder's default settings, transform() raises on a category that
# never occurs in train, surfacing the mismatch instead of mis-encoding it.
xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

In [22]:
# Baseline: train one XGBoost model per fold and keep each model's test-set
# predictions so they can be averaged afterwards.
final_preds = []
for fold in range(5):
    # Rows whose `kfold` differs from `fold` train the model; the rest validate it.
    train_fold = train[train.kfold != fold].reset_index(drop=True)
    valid_fold = train[train.kfold == fold].reset_index(drop=True)

    ytrain = train_fold.target
    yvalid = valid_fold.target

    # Explicit copies so the column assignments below do not trigger pandas'
    # SettingWithCopyWarning. The test features are re-encoded from the raw
    # `test` frame for every fold so they always use this fold's mapping.
    xtrain = train_fold[features].copy()
    xvalid = valid_fold[features].copy()
    xtest_fold = test[features].copy()

    # Fit the encoder on the training fold only and reuse that single mapping
    # for validation and test. Re-fitting on each set could give the same
    # category different integer codes in different sets.
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest_fold[object_cols] = ordinal_encoder.transform(xtest_fold[object_cols])

    # 'reg:squarederror' is the current name of the deprecated 'reg:linear'
    # objective.
    model = XGBRegressor(objective='reg:squarederror', colsample_bytree=0.2,
                         learning_rate=0.6, max_depth=4, alpha=30,
                         n_estimators=200, tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor')

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest_fold)
    final_preds.append(test_preds)

    # RMSE computed as sqrt(MSE): the `squared=False` keyword is deprecated
    # and removed in newer scikit-learn releases.
    print(fold, np.sqrt(mean_squared_error(yvalid, preds_valid)))

    

0 0.7217766906242847
1 0.7189531278553424
2 0.7233341036720647
3 0.7173989059752421
4 0.7182726515933786


In [24]:
# Put each fold's test predictions in its own column, then average across the
# columns to get one prediction per test row.
stacked_fold_preds = np.column_stack(final_preds)
preds = stacked_fold_preds.mean(axis=1)

# Standardization

In [56]:
# Experiment: ordinal-encode the categorical columns and standardize the
# continuous ones, then run the same 5-fold XGBoost training as the baseline.

# Reload the raw data so this cell does not depend on earlier transformations.
train = pd.read_csv("../input/train-kfolds/train-folds.csv")
test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Categorical columns are the feature columns whose name contains 'cat'.
object_cols = [col for col in features if 'cat' in col]
# Continuous columns are the ones whose name starts with 'cont'.
numerical_cols = [col for col in train.columns if col.startswith('cont')]

ordinal_encoder = OrdinalEncoder()
scaler = preprocessing.StandardScaler()

final_preds = []
rmse_valid = []
for fold in range(5):
    # Rows whose `kfold` differs from `fold` train the model; the rest validate it.
    train_fold = train[train.kfold != fold].reset_index(drop=True)
    valid_fold = train[train.kfold == fold].reset_index(drop=True)

    ytrain = train_fold.target
    yvalid = valid_fold.target

    # Explicit copies so the column assignments below do not trigger pandas'
    # SettingWithCopyWarning; the test features start from the raw frame on
    # every fold so they receive exactly this fold's transformations.
    xtrain = train_fold[features].copy()
    xvalid = valid_fold[features].copy()
    xtest = test[features].copy()

    # Fit the encoder on the training fold only and reuse that mapping for
    # validation and test, so a category has the same code everywhere.
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # standardization: mean/std come from the training fold only, so the
    # validation and test features are scaled exactly like the data the model
    # was fitted on.
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    model = XGBRegressor(colsample_bytree=0.2,
                         learning_rate=0.6, max_depth=4, alpha=30,
                         n_estimators=200, tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor')

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)

    # RMSE computed as sqrt(MSE): the `squared=False` keyword is deprecated
    # and removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    rmse_valid.append(rmse)
    print(fold, rmse)

# Mean and spread of the validation RMSE across the folds.
print(np.mean(rmse_valid), np.std(rmse_valid))

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,4.0,4.0,8.0,...,-0.064354,-0.585684,-0.619348,-0.988946,-0.218833,-0.847465,-0.897546,-1.738502,-0.772649,1.513157
1,0.0,1.0,0.0,2.0,1.0,2.0,0.0,4.0,2.0,7.0,...,-0.860319,1.534861,1.573331,1.367321,-0.442598,-0.856575,0.297456,1.945101,1.448489,0.832538
2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,4.0,3.0,10.0,...,0.853577,0.761317,-0.304334,1.566304,-0.927725,-0.214929,0.070668,-0.40242,-0.469709,-0.000919
3,1.0,1.0,0.0,2.0,1.0,3.0,0.0,4.0,0.0,13.0,...,0.94529,1.174686,1.250933,0.489002,2.199541,-0.38399,2.016067,-0.278399,-0.401333,-0.210475
4,1.0,1.0,0.0,2.0,1.0,2.0,0.0,4.0,2.0,5.0,...,-0.745758,-0.346062,-0.366653,-0.59003,-0.207659,-0.387838,0.686655,-0.395117,-0.495155,1.758523


# Log transformation

In [62]:
# Experiment: apply log1p to the continuous columns, then ordinal-encode,
# standardize and run the same 5-fold XGBoost training as before.

# Reload the raw data so this cell does not depend on earlier transformations.
train = pd.read_csv("../input/train-kfolds/train-folds.csv")
test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]

ordinal_encoder = OrdinalEncoder()
# Created here so the cell runs on a fresh kernel instead of relying on a
# `scaler` left behind by another cell.
scaler = preprocessing.StandardScaler()

# log transform: log1p is stateless, so it is applied once up front to the
# training frame and to a copy of the test features.
train[numerical_cols] = np.log1p(train[numerical_cols])
test_features = test[features].copy()
test_features[numerical_cols] = np.log1p(test_features[numerical_cols])

final_preds = []
rmse_valid = []
for fold in range(5):
    # Rows whose `kfold` differs from `fold` train the model; the rest validate it.
    train_fold = train[train.kfold != fold].reset_index(drop=True)
    valid_fold = train[train.kfold == fold].reset_index(drop=True)

    ytrain = train_fold.target
    yvalid = valid_fold.target

    # Explicit copies so the column assignments below do not trigger pandas'
    # SettingWithCopyWarning; the test features restart from the log-transformed
    # frame on every fold so they receive exactly this fold's transformations.
    xtrain = train_fold[features].copy()
    xvalid = valid_fold[features].copy()
    xtest = test_features.copy()

    # Fit the encoder on the training fold only and reuse that mapping for
    # validation and test, so a category has the same code everywhere.
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Scale with the training fold's statistics. The test features must be
    # scaled too, otherwise the model predicts on inputs that are on a
    # different scale from the ones it was trained on.
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    model = XGBRegressor(colsample_bytree=0.2,
                         learning_rate=0.6, max_depth=4, alpha=30,
                         n_estimators=200, tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor')

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)

    # RMSE computed as sqrt(MSE): the `squared=False` keyword is deprecated
    # and removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    rmse_valid.append(rmse)
    print(fold, rmse)

# Mean and spread of the validation RMSE across the folds.
print(np.mean(rmse_valid), np.std(rmse_valid))

0 0.7228296533219171
1 0.7199266393907172
2 0.7246273177950177
3 0.7182436439446709
4 0.719070261866637
0.720939503263792 0.0024066681264212305


# Polynomial features

In [None]:
# NOTE(review): this cell sits under the "Polynomial features" heading but does
# not generate any polynomial terms; it applies the log1p + standardization
# pipeline and 5-fold XGBoost training only. TODO: add the polynomial step.

# Reload the raw data so this cell does not depend on earlier transformations.
train = pd.read_csv("../input/train-kfolds/train-folds.csv")
test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]

ordinal_encoder = OrdinalEncoder()
# Created here so the cell runs on a fresh kernel instead of relying on a
# `scaler` left behind by another cell.
scaler = preprocessing.StandardScaler()

# log transform: log1p is stateless, so it is applied once up front to the
# training frame and to a copy of the test features.
train[numerical_cols] = np.log1p(train[numerical_cols])
test_features = test[features].copy()
test_features[numerical_cols] = np.log1p(test_features[numerical_cols])

final_preds = []
rmse_valid = []
for fold in range(5):
    # Rows whose `kfold` differs from `fold` train the model; the rest validate it.
    train_fold = train[train.kfold != fold].reset_index(drop=True)
    valid_fold = train[train.kfold == fold].reset_index(drop=True)

    ytrain = train_fold.target
    yvalid = valid_fold.target

    # Explicit copies so the column assignments below do not trigger pandas'
    # SettingWithCopyWarning; the test features restart from the log-transformed
    # frame on every fold so they receive exactly this fold's transformations.
    xtrain = train_fold[features].copy()
    xvalid = valid_fold[features].copy()
    xtest = test_features.copy()

    # Fit the encoder on the training fold only and reuse that mapping for
    # validation and test, so a category has the same code everywhere.
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Scale with the training fold's statistics. The test features must be
    # scaled too, otherwise the model predicts on inputs that are on a
    # different scale from the ones it was trained on.
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    model = XGBRegressor(colsample_bytree=0.2,
                         learning_rate=0.6, max_depth=4, alpha=30,
                         n_estimators=200, tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor')

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)

    # RMSE computed as sqrt(MSE): the `squared=False` keyword is deprecated
    # and removed in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    rmse_valid.append(rmse)
    print(fold, rmse)

# Mean and spread of the validation RMSE across the folds.
print(np.mean(rmse_valid), np.std(rmse_valid))

In [27]:
# Save predictions to a CSV file
# NOTE(review): `preds` is whatever the fold-averaging cell last computed; the
# experiment cells overwrite `final_preds` without recomputing it, so re-run
# the averaging cell after the experiment whose predictions should be submitted.
# Item assignment is used because attribute assignment only updates a column
# that already exists and would otherwise set a plain Python attribute.
submission_data["target"] = preds
# index=False keeps the DataFrame's integer index out of the written file.
submission_data.to_csv('submission.csv', index=False)