In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw train and test splits into DataFrames (train first, then test).
df_train, df_test = map(pd.read_csv, ('../data/raw/train.csv', '../data/raw/test.csv'))


In [3]:
# Work with log(SalePrice): the log brings the target closer to a normal
# distribution, and it is also the scale on which RMSE is measured.
# The test split has no SalePrice column (only Kaggle holds it), so only the
# training frame is transformed.
# 'Id' is not a useful feature, so it is removed from both frames.
df_train = df_train.assign(SalePrice=np.log(df_train['SalePrice'])).drop(columns='Id')
df_test = df_test.drop(columns='Id')


In [4]:
# Fill missing values with a constant, one column at a time.
# No fill_value is passed, so scikit-learn's default constant is used.
my_imputer = SimpleImputer(strategy='constant')

for col in df_train.columns:
    # SalePrice only exists in the training frame; every other column is
    # imputed in both frames, with the imputer re-fitted on each of them.
    targets = [df_train] if col == 'SalePrice' else [df_train, df_test]
    for frame in targets:
        frame[col] = my_imputer.fit_transform(frame[[col]]).ravel()

In [5]:
# Apply one-hot encoding.
# The encoder is fitted on the training data only and then reused for test,
# so both frames end up with the same one-hot columns.
# Categories that appear only in test get no column of their own
# (handle_unknown='ignore'), so that information is lost for those rows.

# Identify categorical columns (taken from the training frame).
categorical_cols = df_train.select_dtypes(include=['object']).columns

# drop='first' to avoid multicollinearity.
# NOTE(review): per the scikit-learn docs, combining drop='first' with
# handle_unknown='ignore' encodes an unseen category as all zeros, i.e. the
# same as the dropped first category — confirm this is acceptable.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown = 'ignore')

# Fit on the training data only.
encoder.fit(df_train[categorical_cols])


def _one_hot_columns(frame):
    """Return the one-hot columns for `frame` using the fitted `encoder`.

    The result carries `frame`'s own index so that the column-wise concat
    below lines rows up correctly even when the index is not the default
    0..n-1 range (e.g. after rows were filtered out earlier).
    """
    return pd.DataFrame(
        encoder.transform(frame[categorical_cols]),
        columns=encoder.get_feature_names_out(categorical_cols),
        index=frame.index,
    )


# Swap the original categorical columns for their one-hot encoding.
# Only transform() is used here; the encoder is never re-fitted.
df_train_encoded = _one_hot_columns(df_train)
df_train = pd.concat([df_train.drop(columns=categorical_cols), df_train_encoded], axis=1)

df_test_encoded = _one_hot_columns(df_test)
df_test = pd.concat([df_test.drop(columns=categorical_cols), df_test_encoded], axis=1)

# Now, df_train and df_test have the same columns after one-hot encoding




In [10]:
def get_trained_xgboost(df_train, target_col='SalePrice', **xgb_params):
    """Fit an XGBoost regressor on a training frame and return the model.

    Args:
        df_train: DataFrame holding the feature columns plus the target
            column. Every column other than the target is used as a feature.
        target_col: Name of the target column. Defaults to 'SalePrice'.
        **xgb_params: Optional keyword overrides for the XGBRegressor
            settings listed below (e.g. n_estimators=500). With no
            overrides the settings are exactly the defaults below.

    Returns:
        The fitted xgb.XGBRegressor.

    Raises:
        KeyError: if target_col is not a column of df_train.
    """
    if target_col not in df_train.columns:
        raise KeyError(f"target column {target_col!r} not found in df_train")

    # Default hyperparameters; any of them can be overridden by the caller.
    # NOTE(review): 10 boosting rounds at learning_rate 0.1 is a very small
    # budget — presumably a placeholder; confirm before relying on the model.
    params = {
        'objective': 'reg:squarederror',
        'colsample_bytree': 0.3,
        'learning_rate': 0.1,
        'max_depth': 5,
        'alpha': 10,
        'n_estimators': 10,
    }
    params.update(xgb_params)

    # target and features
    X = df_train.drop(columns=[target_col])
    y = df_train[target_col]

    # train and return model
    model = xgb.XGBRegressor(**params)
    model.fit(X, y)
    return model

xg_reg = get_trained_xgboost(df_train)

In [33]:
# The target was log-transformed before training, so exponentiate the model
# output to get back to prices, then reshape into a single-column 2-D array.
y_pred = np.exp(xg_reg.predict(df_test)).reshape(-1,1)

In [34]:
# Build the submission frame, indexed by the Id of each test row.
# 'Id' was dropped from df_test earlier in the notebook, so read it back from
# the raw test file rather than assuming the Ids start at a fixed number and
# are contiguous.
test_ids = pd.read_csv('../data/raw/test.csv', usecols=['Id'])['Id']
if len(test_ids) != len(y_pred):
    raise ValueError(
        f'expected one prediction per test row, got {len(y_pred)} '
        f'predictions for {len(test_ids)} rows'
    )

df_pred = pd.DataFrame(
    y_pred,
    index=pd.Index(test_ids.to_numpy(), name='Id'),
    columns=['SalePrice'],
)

# Save the DataFrame to a CSV file
df_pred.to_csv('your_file_name.csv')

In [25]:
# Scratch check of a 1-D array's shape. The array is defined here so the cell
# runs on a fresh kernel instead of depending on a cell further down.
your_array = np.array([1, 2, 3, 4, 5])
your_array.shape

(5,)

In [24]:
# Small 1-D integer array (1 through 5) used for a scratch shape check.
your_array = np.arange(1, 6)


In [27]:
# NOTE(review): the output recorded below looks stale — the prediction cell
# applies reshape(-1, 1) to y_pred, which yields a 2-D single-column array.
# Re-run after Restart & Run All to confirm.
y_pred.shape

(1459,)