## Setup
---

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, make_scorer
from scipy import stats
from pandas.plotting import scatter_matrix
import subprocess
%matplotlib inline

## Save Kaggle submission file
---

In [2]:
def submission_df(y_pred):
    """Wrap predictions in a one-column ``SalePrice`` frame indexed like the test set.

    The index is taken from the frame returned by ``load_x_test()``, so
    ``y_pred`` must hold one prediction per row of that frame, in the same order.
    """
    test_index = load_x_test().index
    return pd.DataFrame(y_pred, index=test_index, columns=["SalePrice"])

def save_submission_file(y_pred, filename):
    """Write the predictions to ``./<filename>`` as a CSV submission file.

    Parameters
    ----------
    y_pred : array-like
        Predictions, passed unchanged to ``submission_df``.
    filename : str
        Name of the file to create in the current working directory.

    Saving is best-effort: a failure is reported on stdout (including the
    reason) instead of being raised, so the notebook keeps running.
    """
    df = submission_df(y_pred)
    path = "./" + filename

    try:
        df.to_csv(path)
    except Exception as error:
        # Report why the write failed; a bare "couldn't save" hides e.g. a
        # missing directory or a permission problem.
        print(f"Couldn’t save submission: {error}")
    else:
        print("Submission saved.")

## Submit score to Kaggle
---

In [3]:
def submit_score_to_kaggle(y_pred, filename, message):
    """Save the predictions to ``filename`` and submit that file with the ``kaggle`` CLI.

    Parameters
    ----------
    y_pred : array-like
        Predictions, forwarded to ``save_submission_file``.
    filename : str
        Submission file name; written first, then passed to ``kaggle ... -f``.
    message : str
        Submission message passed to ``kaggle ... -m``.

    The CLI's stdout is always printed. If the command exits with a non-zero
    status, the status and the captured stderr are printed as well so that a
    failed submission is not mistaken for a successful one.
    """
    save_submission_file(y_pred, filename)

    command = [
        "kaggle",
        "competitions",
        "submit",
        "-c",
        "house-prices-advanced-regression-techniques",
        "-f",
        filename,
        "-m",
        message,
    ]
    # Arguments are passed as a list (no shell), so filename/message need no quoting.
    completed_process = subprocess.run(command, capture_output=True, text=True)

    print(completed_process.stdout)

    # stderr is only surfaced on failure to keep successful runs as quiet as before.
    if completed_process.returncode != 0:
        print(f"kaggle exited with status {completed_process.returncode}")
        if completed_process.stderr:
            print(completed_process.stderr)

## Get the Data
---

### *Get training data*

In [4]:
def load_train_data(split=True):
    """Read ``./train.csv`` (indexed by ``Id``) and optionally split off the target.

    Parameters
    ----------
    split : bool, default True
        When true, return ``(features, target)`` where the target is the
        ``SalePrice`` column; otherwise return the full frame.
    """
    frame = pd.read_csv("./train.csv", index_col="Id")
    print("load_train_data: done")

    if not split:
        return frame

    target = "SalePrice"
    feature_names = [name for name in frame.columns if name != target]
    return frame[feature_names], frame[target]

In [5]:
X_train, y_train = load_train_data()

load_train_data: done


### *Get test data*

In [6]:
def load_x_test():
    """Read ``./test.csv`` and return it as a DataFrame indexed by ``Id``."""
    test_features = pd.read_csv("./test.csv", index_col="Id")
    return test_features

In [7]:
def load_y_true():
    """Read ``./solution.csv`` and return it as a DataFrame indexed by ``Id``."""
    return pd.read_csv("./solution.csv", index_col="Id")

In [8]:
def load_test_data(split=True):
    """Load the test features and the matching solution file.

    Parameters
    ----------
    split : bool, default True
        When true, return ``(X_test, y_test)``; otherwise return a single
        frame with the solution columns appended to the features.

    Both frames are indexed by ``Id``; the unsplit result is built by
    concatenating them column-wise.
    """
    # Reuse the dedicated loader rather than repeating the read_csv call.
    X_test = load_x_test()
    y_test = load_y_true()
    print("load_test_data: done")

    if split:
        return X_test, y_test
    else:
        return pd.concat([X_test, y_test], axis="columns")
    

In [9]:
X_test, y_test = load_test_data()

load_test_data: done


## Prepare the Data
---

In [10]:
def split_features_target(df):
    """Separate a frame into ``(features, target)``.

    The target is the ``SalePrice`` column; the features are every other
    column, in their original order.
    """
    target = "SalePrice"
    feature_names = list(filter(lambda name: name != target, df.columns))
    return df[feature_names], df[target]

In [11]:
def clean_data(X, threshold=0.05):
    """Drop sparsely populated columns, then fill the remaining missing values.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean.
    threshold : float, default 0.05
        Forwarded to ``drop_attributes``: columns whose share of missing
        values exceeds this fraction are removed.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``complete_missing_values`` after the drop step.
    """
    X = drop_attributes(X, threshold=threshold)
    X = complete_missing_values(X)
    return X

In [12]:
def prepped_train_data():
    """Load the training set, remove outliers, and clean the features.

    Returns ``(X, y)``: outlier rows are removed from both, and only the
    features go through ``clean_data``.
    """
    features, target = remove_outliers(*load_train_data())
    return clean_data(features), target

In [13]:
def prepped_data():
    """Build cleaned, one-hot encoded train and test sets.

    Steps:
      1. Load the unsplit train and test frames (features plus ``SalePrice``).
      2. Clean each frame separately with ``clean_data``.
      3. Tag the rows, stack both frames, and encode them together so both
         halves end up with the same dummy columns.
      4. Split the stacked frame back apart, separate features from target,
         and remove outliers from the training half only.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    raw_train = load_train_data(split=False)
    raw_test = load_test_data(split=False)

    # NOTE(review): each frame is cleaned on its own, so the set of dropped
    # columns is decided independently for train and test — confirm they agree.
    cleaned_train = clean_data(raw_train)
    cleaned_test = clean_data(raw_test)

    # A temporary "Label" column records where each row came from; after
    # encoding it shows up as the Label_Train / Label_Test indicator columns.
    stacked = pd.concat([
        cleaned_train.assign(Label="Train"),
        cleaned_test.assign(Label="Test"),
    ])
    encoded = encode_categorical_attributes(stacked)

    label_columns = ["Label_Train", "Label_Test"]
    train_rows = encoded[encoded["Label_Train"] == 1]
    test_rows = encoded[encoded["Label_Test"] == 1]
    train_rows = train_rows.drop(label_columns, axis="columns")
    test_rows = test_rows.drop(label_columns, axis="columns")

    X_train, y_train = split_features_target(train_rows)
    X_test, y_test = split_features_target(test_rows)

    # Outliers are only removed from the data the model is fitted on.
    X_train, y_train = remove_outliers(X_train, y_train)

    return X_train, y_train, X_test, y_test

In [14]:
def prepped_data_linear_regression():
    """Return the prepared data for the linear-regression experiments.

    This function was a statement-for-statement copy of ``prepped_data``; it
    now delegates to it so the two cannot drift apart. Keep it as the place
    to add linear-regression-specific preparation later.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``, exactly as ``prepped_data``
        returns them.
    """
    return prepped_data()

### 1. Data cleaning

In [15]:
def remove_outliers(X, y, column="GrLivArea", threshold=4000):
    """Drop the observations whose ``column`` value is unusually high.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain ``column``.
    y : pandas.Series or pandas.DataFrame
        Target sharing ``X``'s index.
    column : str, default "GrLivArea"
        Feature used to detect outliers.
    threshold : number, default 4000
        Rows with ``X[column]`` strictly greater than this value are removed.

    Returns
    -------
    tuple
        ``(X, y)`` with the outlier rows removed from both.
    """
    outlier_index = X.loc[X[column] > threshold].index
    # Drop by index label so X and y stay aligned.
    X = X.drop(outlier_index)
    y = y.drop(outlier_index)
    print("remove_outliers: done")
    return X, y

#### *Drop attributes that have too many missing values*

In [16]:
def na_info(df):
    """Summarise missing values per column.

    Returns a frame indexed by column name with two columns: ``Total`` (the
    number of missing entries) and ``Percentage`` (that count divided by the
    number of rows, i.e. a fraction rather than a 0-100 value). Rows are
    sorted by ``Total``, largest first.
    """
    missing_count = df.isna().sum()
    missing_share = missing_count / len(df)
    summary = pd.concat(
        [missing_count, missing_share],
        axis="columns",
        keys=["Total", "Percentage"],
    )
    return summary.sort_values(by="Total", ascending=False)

In [17]:
def drop_attributes(df, threshold=0.05):
    """Remove the columns whose share of missing values exceeds ``threshold``.

    The share is the ``Percentage`` column reported by ``na_info`` (a
    fraction of the row count). The names of the dropped columns are printed
    before the reduced frame is returned.
    """
    summary = na_info(df)
    too_sparse = summary["Percentage"] > threshold
    columns_to_drop = summary.loc[too_sparse].index
    print(f"Dropping {len(columns_to_drop)} columns: {columns_to_drop}")
    return df.drop(columns_to_drop, axis="columns")

#### *Fill in missing values or drop their rows*

In [18]:
def complete_missing_values(X):
    """Fill missing values: median for numeric columns, mode for object columns.

    Numeric and object-typed columns are imputed separately and then joined
    side by side, so the result lists the numeric columns first, followed by
    the object columns. Columns of any other dtype are not part of the result.
    """
    def _imputed(frame, strategy):
        # Impute one dtype group and restore the labels lost by the imputer.
        values = SimpleImputer(strategy=strategy).fit_transform(frame)
        return pd.DataFrame(values, columns=frame.columns, index=frame.index)

    numeric_part = _imputed(X.select_dtypes(np.number), "median")
    categorical_part = _imputed(X.select_dtypes(object), "most_frequent")

    X_transformed = pd.concat([numeric_part, categorical_part], axis="columns")
    print("complete_missing_values: done")
    return X_transformed

### 2. Feature selection

#### *Drop attributes that don’t provide useful information for house price prediction*

### 3. Feature engineering

#### *Check if any numerical continuous features should be discretized*

#### *Discretize continuous features if needed*

#### *Decompose categorical features*

In [19]:
def encode_categorical_attributes(X):
    """One-hot encode the categorical columns of ``X`` with ``pd.get_dummies``.

    Returns the encoded frame. The progress message is printed only after
    the encoding has actually run, matching the other preparation helpers.
    """
    encoded = pd.get_dummies(X)
    print("encode_categorical_attributes: done")
    return encoded

#### *Add promising transformations of features (e.g., $\log(x)$, $\sqrt{x}$, $x^2$, etc.).*

#### *Check if it would make sense to aggregate features into new features*

#### *Aggregate features into new features, if it makes sense*

### 4. Feature scaling

#### *If your model requires it, standardize or normalize features*

## Calculate score
---

In [20]:
def root_mean_squared_log_error(y_true, y_pred, transform_negative_predictions=False):
    """Root mean squared logarithmic error (RMSLE) between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.
    transform_negative_predictions : bool, default False
        When true, every prediction below zero is replaced by 0 before scoring.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    y_pred_tr = (
        [max(prediction, 0) for prediction in y_pred]
        if transform_negative_predictions
        else y_pred
    )

    # same as np.sqrt(np.mean(np.power(np.log(np.array(y_pred_tr) + 1) - np.log(np.array(y_true) + 1), 2)))
    squared_log_error = mean_squared_log_error(y_true, y_pred_tr)
    return np.sqrt(squared_log_error)

def kaggle_score(y_pred, transform_negative_predictions=False):
    """Score predictions against the solution file using RMSLE.

    The ground truth comes from ``load_y_true()``;
    ``transform_negative_predictions`` is forwarded to
    ``root_mean_squared_log_error``.
    """
    return root_mean_squared_log_error(
        load_y_true(),
        y_pred,
        transform_negative_predictions=transform_negative_predictions,
    )

def print_kaggle_score(y_pred, transform_negative_predictions=False):
    """Print the RMSLE of ``y_pred`` against the solution file, to 5 decimals.

    Parameters
    ----------
    y_pred : array-like
        Predictions to score.
    transform_negative_predictions : bool, default False
        Forwarded to ``kaggle_score``.
    """
    # kaggle_score loads the ground truth itself; no separate load is needed here.
    score = kaggle_score(y_pred, transform_negative_predictions=transform_negative_predictions)
    print("The score is %.5f" % score)

In [21]:
X_train, y_train, X_test, y_test = prepped_data()

load_train_data: done
load_test_data: done
Dropping 11 columns: Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage',
       'GarageYrBlt', 'GarageCond', 'GarageType', 'GarageFinish',
       'GarageQual'],
      dtype='object')
complete_missing_values: done
Dropping 11 columns: Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage',
       'GarageFinish', 'GarageQual', 'GarageCond', 'GarageYrBlt',
       'GarageType'],
      dtype='object')
complete_missing_values: done
encode_categorical_attributes: done
remove_outliers: done


## Shortlist Promising Models
---

- [todo] train many quick-and-dirty models using standard parameters
- [todo] compute mean and standard deviation of n-fold cross validation performance
- [todo] analyze the importances of attributes for each algorithm
- [todo] analyze the types of errors the models make
- [todo] do a quick round of feature selection and engineering
- [todo] run one or two iterations of the past five steps 
- [todo] shortlist the top three models, preferring models making different errors

### Linear Regression

In [22]:
# Baseline: plain linear regression on the numeric columns only.
data_train = load_train_data(split=False)

# Grab numerical attributes only
data_train = data_train.select_dtypes(np.number)
# Median-impute the whole numeric frame (SalePrice is still a column here;
# it is split off two lines below).
imp = SimpleImputer(strategy="median")
data_train_array = imp.fit_transform(data_train)
# Wrap the imputed values back into a DataFrame with the original labels.
data_train = pd.DataFrame(data_train_array, columns=data_train.columns, index=data_train.index)
X_train, y_train = split_features_target(data_train)

X_test, y_test = load_test_data()

X_test = X_test.select_dtypes(np.number)
# NOTE(review): a second imputer is fitted on the test features, so test rows
# are filled with test-set medians rather than the training medians held by
# `imp` — confirm this is intended.
imp_test = SimpleImputer(strategy="median")
X_test_array = imp_test.fit_transform(X_test)
X_test = pd.DataFrame(X_test_array, columns=X_test.columns, index=X_test.index)

# Fit on the training features and predict the test set; `regr` and `y_pred`
# are reused by the evaluation cells that follow.
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

load_train_data: done
load_test_data: done


In [23]:
(y_pred > 0).all()

True

In [24]:
y_pred_df = submission_df(y_pred)
# Largest predicted sale price. Take the max of the single column so int()
# receives a scalar; calling int() on a one-element Series is deprecated in pandas.
int(y_pred_df["SalePrice"].max())

634830

In [25]:
# Compute the estimator score using the estimator’s default metric, R2 score
regr.score(X_test, y_test)

0.8484533797390255

In [26]:
# Confirm that R2 score is the default scoring metric for LinearRegression
r2_score(y_test, y_pred)

0.8484533797390255

In [27]:
mean_absolute_error(y_test, y_pred)

20219.79505456269

In [28]:
mean_squared_error(y_test, y_pred)

981920717.258501

In [29]:
mean_squared_log_error(y_test, y_pred)

0.048715608332027115

In [30]:
root_mean_squared_log_error(y_test, y_pred)

0.2207161261259066

In [31]:
kaggle_score(y_pred)

0.2207161261259066

In [32]:
#submit_score_to_kaggle(y_pred, "linear_regression_submission_01", "Plain numerical-features linear regression")

### Training Linear Regression using Cross-Validation

In [33]:
def regr_data():
    """Build the numeric-only train/test sets used by the linear-regression cells.

    Only numeric columns are kept. Missing feature values are filled with the
    per-column median learned from the TRAINING features, and the same fitted
    imputer is applied to the test features, so both sets are transformed
    identically and share the same columns in the same order.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. ``y_train`` is returned as
        float64; ``y_test`` is whatever ``load_test_data`` returns.
    """
    # Get training data
    data_train = load_train_data(split=False)

    # Grab numerical attributes only
    data_train = data_train.select_dtypes(np.number)
    X_train, y_train = split_features_target(data_train)

    # Fit the imputer on the features alone so it can be reused on the test
    # features, which have no SalePrice column.
    imp = SimpleImputer(strategy="median")
    X_train_array = imp.fit_transform(X_train)
    X_train = pd.DataFrame(X_train_array, columns=X_train.columns, index=X_train.index)
    # Previously the target passed through the imputer together with the
    # features and came back as float64; keep that dtype for callers.
    y_train = y_train.astype(np.float64)

    # Get test data
    X_test, y_test = load_test_data()

    # Select exactly the training feature columns (same order), then fill
    # gaps with the training medians instead of refitting on the test set.
    X_test = X_test[X_train.columns]
    X_test_array = imp.transform(X_test)
    X_test = pd.DataFrame(X_test_array, columns=X_test.columns, index=X_test.index)

    return X_train, y_train, X_test, y_test

In [34]:
# If you don’t specify a scoring parameter, the estimator’s default scorer (if available) is used
# In the case of LinearRegression, that’s the R2 score
# https://scikit-learn.org/stable/modules/model_evaluation.html#r2-score-the-coefficient-of-determination
X_train, y_train, _, _ = regr_data()
regr = LinearRegression()
scores = cross_val_score(regr, X_train, y_train)
scores

load_train_data: done
load_test_data: done


array([0.84586559, 0.81392047, 0.82245699, 0.81772022, 0.6291012 ])

In [35]:
X_train, y_train, _, _ = regr_data()
regr = LinearRegression()
scores = cross_val_score(regr, X_train, y_train, scoring="neg_mean_absolute_error")
-scores

load_train_data: done
load_test_data: done


array([21734.00826107, 22179.79085408, 21384.20683081, 20465.68469093,
       24264.8332824 ])

In [36]:
X_train, y_train, _, _ = regr_data()
regr = LinearRegression()
scores = cross_val_score(regr, X_train, y_train, scoring="neg_mean_squared_error")
(-scores.astype(np.int64))

load_train_data: done
load_test_data: done


array([ 852768619, 1222667164, 1345385102,  923402939, 2487013526])

In [37]:
# Generate a scorer object with a built-in metric function
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

X_train, y_train, _, _ = regr_data()
regr = LinearRegression()
scores = cross_val_score(regr, X_train, y_train, scoring=mae_scorer)
-scores

load_train_data: done
load_test_data: done


array([21734.00826107, 22179.79085408, 21384.20683081, 20465.68469093,
       24264.8332824 ])

In [38]:
# Make your own RMSLE (root mean square log error) scorer
rmsle_scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False, transform_negative_predictions=True)

X_train, y_train, _, _ = regr_data()
regr = LinearRegression()
scores = cross_val_score(regr, X_train, y_train, scoring=rmsle_scorer)
-scores

load_train_data: done
load_test_data: done


array([0.17580936, 0.90945406, 0.18550033, 0.26147498, 0.19897854])

### Random Forest

In [39]:
# Random forest on the fully prepared (cleaned + one-hot encoded) data.
X_train, y_train, X_test, y_test = prepped_data()
rf = RandomForestRegressor(random_state=42)
# NOTE(review): this rebinds `rmsle_scorer` from the linear-regression section,
# this time without transform_negative_predictions=True.
rmsle_scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False)
scores = cross_val_score(rf, X_train, y_train, scoring=rmsle_scorer)
# The scorer is built with greater_is_better=False, so negate to show plain RMSLE values.
-scores

load_train_data: done
load_test_data: done
Dropping 11 columns: Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage',
       'GarageYrBlt', 'GarageCond', 'GarageType', 'GarageFinish',
       'GarageQual'],
      dtype='object')
complete_missing_values: done
Dropping 11 columns: Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage',
       'GarageFinish', 'GarageQual', 'GarageCond', 'GarageYrBlt',
       'GarageType'],
      dtype='object')
complete_missing_values: done
encode_categorical_attributes: done
remove_outliers: done


array([0.14050525, 0.1422873 , 0.1446216 , 0.13735748, 0.14099424])

In [40]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
kaggle_score(y_pred)

0.14645785296396835

In [41]:
y_pred_df = submission_df(y_pred)
y_pred_df.describe()

Unnamed: 0,SalePrice
count,1459.0
mean,179004.597087
std,73528.918675
min,58030.1
25%,130545.495
50%,157190.22
75%,209799.845
max,549460.12
