In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the training set from train.csv (path is relative to the notebook's
# working directory).
training_data = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first ten rows as the cell's rich output.
training_data.head(n=10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
# Read the test set from test.csv and preview its first ten rows.
test_data = pd.read_csv(filepath_or_buffer='test.csv')
test_data.head(n=10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
6,1467,20,RL,,7980,Pave,,IR1,Lvl,AllPub,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,1468,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
8,1469,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
9,1470,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [5]:
# Naive baseline: every test row is predicted as the mean training SalePrice.
mean_pred = training_data['SalePrice'].mean()

# One copy of the mean per test row.
mean_preds = [mean_pred] * len(test_data)

# Reference SalePrice values that all models below are scored against.
# NOTE(review): these are read from sample_submission.csv and treated as the
# true test targets — confirm that this file really holds ground truth and
# not placeholder/benchmark predictions.
submission_sample = pd.read_csv('sample_submission.csv')
true = submission_sample['SalePrice']

### Calculating Baseline Error (RMSE and RMSLE)

In [6]:
# Score the mean baseline with RMSE and RMSLE (lower is better).
# Arguments follow the (y_true, y_pred) order used by the model cells below;
# both metrics are symmetric in their arguments, so the values are unaffected.
baseline_mse = mean_squared_error(true, mean_preds)
baseline_msle = mean_squared_log_error(true, mean_preds)

baseline_rmse_accuracy = np.sqrt(baseline_mse)
baseline_rmsle_accuracy = np.sqrt(baseline_msle)

print('Mean Baseline  RMSE: {:.4f}'.format(baseline_rmse_accuracy))
print('Mean Baseline RMSLE: {:.4f}'.format(baseline_rmsle_accuracy))

Mean Baseline  RMSE: 16603.7783
Mean Baseline RMSLE: 0.0911


In [7]:
# Training set: the target column is SalePrice, everything else is a feature.
y_train = training_data['SalePrice']
X_train = training_data.drop(columns='SalePrice')

# Test set: the test frame has no SalePrice column, so it is used as-is;
# the reference values loaded earlier serve as the target.
X_test = test_data
y_test = true

### Pre Processing Pipeline

In [8]:
# Split the training frame by dtype: object columns are treated as
# categorical, everything else as numerical.
cat_data = training_data.select_dtypes(include="object")
num_data = training_data.select_dtypes(exclude="object")

# Column names for each group, as plain Python lists.
cat_features = list(cat_data.columns)
num_features = list(num_data.columns)

# SalePrice is the target, not a feature, so it is taken out of the list.
# NOTE(review): identifier-like columns such as Id remain in num_features —
# confirm that this is intended.
num_features.remove('SalePrice')

In [9]:
# Categorical columns: one-hot encode; categories not seen during fit are
# encoded as all zeros instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Numerical columns: fill missing values with the column mean, then
# standardise.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

### Features Selection

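Features are selected by the strength of their linear relationship with the target, using `SelectKBest` with `f_regression` as the scoring function. The 25 highest-scoring features are kept; because selection runs after preprocessing, each one-hot encoded column counts as a separate feature.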

### Linear Regression

In [10]:
# Linear regression pipeline: preprocess -> keep the 25 best features -> fit.
steps_linear_reg = [
    ('preprocessor', preprocessor),
    ('skb', SelectKBest(f_regression, k=25)),
    ('lin_reg', LinearRegression()),
]
lin_reg_pipeline = Pipeline(steps=steps_linear_reg)

# Fit on the training set, then predict the test set.
lin_reg_pipeline.fit(X_train, y_train)
y_pred_lr = lin_reg_pipeline.predict(X_test)

# Error metrics against the reference test targets.
mse_lr = mean_squared_error(y_test, y_pred_lr)
msle_lr = mean_squared_log_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
rmsle_lr = np.sqrt(msle_lr)

print('Linear Regression RMSE: {:.4f}'.format(rmse_lr))
print('Linear Regression RMSLE: {:.4f}'.format(rmsle_lr))

Linear Regression RMSE: 70339.9407
Linear Regression RMSLE: 0.3969


### SGD Regressor

In [11]:
# Hyper-parameter grid for the SGD regressor; the 'sgd_reg__' prefix routes
# each setting to the 'sgd_reg' step of the pipeline below.
sgd_param_grid = {
        'sgd_reg__max_iter':[100000, 1000000],
        'sgd_reg__tol':[1e-10, 1e-3],
        'sgd_reg__eta0':[0.001, 0.01]
    }

# SGD Regression Pipeline: preprocess -> keep the 25 best features -> SGD.
# random_state is fixed because SGD shuffles the training data between
# epochs; without a seed the fitted model, the grid-search winner and the
# reported metrics change on every run.
sgd_reg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('skb', SelectKBest(f_regression, k = 25)),
        ('sgd_reg', SGDRegressor(random_state=42)),
    ]
)

# NOTE: this changes NumPy's floating-point error handling for the whole
# kernel session, not just this cell. Presumably it is here to silence
# invalid-value RuntimeWarnings raised while fitting — TODO confirm.
np.seterr(invalid='ignore')

# Exhaustive search over the grid with 10-fold cross-validation.
sgd_grid_search = GridSearchCV(sgd_reg_pipeline, sgd_param_grid, cv=10)

# training on the data set (refits the best configuration on all of X_train)
sgd_grid_search.fit(X_train, y_train)

# predicting the test data with the best estimator found
y_pred_sgd = sgd_grid_search.predict(X_test)

# calculating rmse for SGD regression model
rmse_sgd = np.sqrt(mean_squared_error(y_test, y_pred_sgd))

# calculating rmsle for SGD regression model.
# RMSLE is undefined for negative values and a linear model can predict
# them. Negative predictions are clipped to 0 rather than passed through
# abs(): abs() would turn a badly wrong negative prediction into a
# plausible-looking positive one and understate the error.
y_pred_sgd_non_negative = np.clip(y_pred_sgd, 0, None)
rmsle_sgd = np.sqrt(mean_squared_log_error(y_test, y_pred_sgd_non_negative))

print('SGD Regression RMSE: {:.4f}'.format(rmse_sgd))
print('SGD Regression RMSLE: {:.4f}'.format(rmsle_sgd))

SGD Regression RMSE: 73776.4919
SGD Regression RMSLE: 0.4492


### Parameters Meaning
#### max_iter:
The maximum number of passes over the training data.

#### tol:
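The stopping criterion: training stops early once the loss has failed to improve by at least `tol` for several consecutive epochs (`n_iter_no_change`, 5 by default), even if `max_iter` has not been reached.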

#### eta0
The initial learning rate for the ‘constant’, ‘invscaling’ or ‘adaptive’ schedules.

### SVM Regressor

In [12]:
# Hyper-parameter grid for the support vector regressor; the 'svm_reg__'
# prefix routes each setting to the 'svm_reg' step of the pipeline below.
svm_param_grid = {
    'svm_reg__kernel': ['linear', 'poly', 'rbf'],
    'svm_reg__C': [1, 10, 100],
}

# SVM Regression Pipeline: preprocess -> keep the 25 best features -> SVR.
svm_steps = [
    ('preprocessor', preprocessor),
    ('skb', SelectKBest(f_regression, k=25)),
    ('svm_reg', SVR()),
]
svm_reg_pipeline = Pipeline(steps=svm_steps)

# Exhaustive search over the grid with 10-fold cross-validation, followed by
# prediction on the test set with the best configuration found.
svm_grid_search = GridSearchCV(svm_reg_pipeline, svm_param_grid, cv=10)
svm_grid_search.fit(X_train, y_train)
y_pred_svm = svm_grid_search.predict(X_test)

# Error metrics for the SVM regression model.
mse_svm = mean_squared_error(y_test, y_pred_svm)
msle_svm = mean_squared_log_error(y_test, y_pred_svm)
rmse_svm = np.sqrt(mse_svm)
rmsle_svm = np.sqrt(msle_svm)

print('SVM Regression RMSE: {:.4f}'.format(rmse_svm))
print('SVM Regression RMSLE: {:.4f}'.format(rmsle_svm))

SVM Regression RMSE: 58292.8424
SVM Regression RMSLE: 0.3488


#### Based on the RMSE and RMSLE of the three models above, the SVM regressor performs best, so its predictions are used for the final submission.

In [22]:
# Build the submission frame: the Id column from the sample submission next
# to the SVM predictions, named SalePrice. The two pieces are aligned on
# their row index.
submission = pd.concat(
    [
        submission_sample['Id'],
        pd.DataFrame(y_pred_svm, columns=['SalePrice']),
    ],
    axis=1,
)

# Write the result to a csv file for submission. index=False keeps the row
# index out of the file; otherwise it is written as an extra unnamed first
# column alongside Id and SalePrice.
submission.to_csv('submission.csv', index=False)