In [69]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV

In [70]:
# Load the training set and derive year/month/day columns from the opening date.
# 'Open Date' holds MM/DD/YYYY strings (e.g. 07/17/1999), so the format is
# given explicitly instead of leaving the day/month order to parser inference.
training_data = pd.read_csv('train.csv').assign(
    Open_Date_Converted=lambda df: pd.to_datetime(df['Open Date'], format='%m/%d/%Y'),
    year=lambda df: df['Open_Date_Converted'].dt.year,
    month=lambda df: df['Open_Date_Converted'].dt.month,
    day=lambda df: df['Open_Date_Converted'].dt.day,
)

# display the top 10 data rows
training_data.head(10)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P33,P34,P35,P36,P37,revenue,Open_Date_Converted,year,month,day
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,5,5,4,3,4,5653753.0,1999-07-17,1999,7,17
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,0,0,0,0,0,6923131.0,2008-02-14,2008,2,14
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,0,0,0,0,0,2055379.0,2013-03-09,2013,3,9
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,6,18,12,12,6,2675511.0,2012-02-02,2012,2,2
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,2,3,4,3,3,4316715.0,2009-05-09,2009,5,9
5,5,02/12/2010,Ankara,Big Cities,FC,6,6.0,4.5,7.5,8,...,0,0,0,0,0,5017319.0,2010-02-12,2010,2,12
6,6,10/11/2010,İstanbul,Big Cities,IL,2,3.0,4.0,4.0,1,...,2,3,5,4,4,5166635.0,2010-10-11,2010,10,11
7,7,06/21/2011,İstanbul,Big Cities,IL,4,5.0,4.0,5.0,2,...,0,0,0,0,0,4491607.0,2011-06-21,2011,6,21
8,8,08/28/2010,Afyonkarahisar,Other,IL,1,1.0,4.0,4.0,1,...,3,4,5,4,5,4952497.0,2010-08-28,2010,8,28
9,9,11/16/2011,Edirne,Other,IL,6,4.5,6.0,7.5,6,...,0,0,0,0,0,5444227.0,2011-11-16,2011,11,16


In [71]:
# Load the test set and derive the same year/month/day columns as for training.
# 'Open Date' holds MM/DD/YYYY strings (e.g. 01/22/2011), so the format is
# given explicitly instead of leaving the day/month order to parser inference.
test_data = pd.read_csv('test.csv').assign(
    Open_Date_Converted=lambda df: pd.to_datetime(df['Open Date'], format='%m/%d/%Y'),
    year=lambda df: df['Open_Date_Converted'].dt.year,
    month=lambda df: df['Open_Date_Converted'].dt.month,
    day=lambda df: df['Open_Date_Converted'].dt.day,
)

# display the top 10 data rows
test_data.head(10)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P32,P33,P34,P35,P36,P37,Open_Date_Converted,year,month,day
0,0,01/22/2011,Niğde,Other,FC,1,4.0,4.0,4.0,1,...,0,0,0,0,0,0,2011-01-22,2011,1,22
1,1,03/18/2011,Konya,Other,IL,3,4.0,4.0,4.0,2,...,0,0,0,0,0,0,2011-03-18,2011,3,18
2,2,10/30/2013,Ankara,Big Cities,FC,3,4.0,4.0,4.0,2,...,0,0,0,0,0,0,2013-10-30,2013,10,30
3,3,05/06/2013,Kocaeli,Other,IL,2,4.0,4.0,4.0,2,...,0,0,0,0,0,0,2013-05-06,2013,5,6
4,4,07/31/2013,Afyonkarahisar,Other,FC,2,4.0,4.0,4.0,1,...,0,0,0,0,0,0,2013-07-31,2013,7,31
5,5,11/29/2007,Eskişehir,Other,FC,4,4.5,5.0,4.0,2,...,0,0,0,0,0,0,2007-11-29,2007,11,29
6,6,11/05/2008,Rize,Other,FC,4,2.0,4.0,4.0,2,...,4,1,2,0,0,0,2008-11-05,2008,11,5
7,7,12/03/2013,Ankara,Big Cities,IL,2,5.0,4.0,5.0,2,...,0,0,3,0,4,4,2013-12-03,2013,12,3
8,8,05/30/2006,İstanbul,Big Cities,IL,5,4.0,4.0,4.0,1,...,0,0,3,5,0,0,2006-05-30,2006,5,30
9,9,01/20/2012,İstanbul,Big Cities,IL,15,7.5,6.0,3.0,4,...,0,2,24,4,0,3,2012-01-20,2012,1,20


In [72]:
# Mean-revenue baseline: every test row is predicted as the average training revenue.
mean_pred = training_data['revenue'].mean()
mean_preds = [mean_pred] * len(test_data)

# Reference values the predictions are scored against, taken from the
# 'Prediction' column of sampleSubmission.csv.
# NOTE(review): this is a sample-submission file; confirm its 'Prediction'
# column really holds ground-truth revenue and not placeholder values.
submission_sample = pd.read_csv('sampleSubmission.csv')
true = submission_sample['Prediction']

### Calculating Baseline RMSE

In [73]:
# Baseline RMSE of the constant mean prediction, measured against `true`.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the
# model-evaluation cells further down; the printed value is unchanged
# because squared error is symmetric in its two arguments.
baseline_rmse_accuracy = np.sqrt(mean_squared_error(true, mean_preds))

print('Mean Baseline  RMSE: {:.4f}'.format(baseline_rmse_accuracy))

Mean Baseline  RMSE: 0.0131


In [79]:
# Training features: everything except the identifier, the raw and parsed
# opening date, the city name and the target itself.
X_train = training_data.drop(
    columns=['Id', 'Open Date', 'City', 'Open_Date_Converted', 'revenue']
)
y_train = training_data['revenue']

# Test features use the same column selection, minus the target column.
X_test = test_data.drop(
    columns=['Id', 'Open Date', 'City', 'Open_Date_Converted']
)
# reference values loaded earlier from the sample-submission file
y_test = true

### PreProcessing Pipeline

In [81]:
# Split the feature columns by dtype: object columns are treated as
# categorical, every other dtype as numerical.
cat_data = X_train.select_dtypes(include="object")
num_data = X_train.select_dtypes(exclude="object")

# Column-name lists handed to the ColumnTransformer.
# 'revenue' was already dropped when X_train was built, so nothing has to be
# removed from the numerical list here.
num_features = list(num_data.columns)
cat_features = list(cat_data.columns)

In [82]:
# Numerical columns: fill missing values with the column mean, then standardise.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising an error.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column list through its matching transformer.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

### Linear Regression Pipeline

In [83]:
# Linear regression: preprocess, keep the 25 best-scoring features according
# to f_regression, then fit a LinearRegression model.
steps_linear_reg = [
    ('preprocessor', preprocessor),
    ('skb', SelectKBest(score_func=f_regression, k=25)),
    ('lin_reg', LinearRegression()),
]

lin_reg_pipeline = Pipeline(steps=steps_linear_reg)

# fit on the training set; the fitted pipeline is the cell's displayed output
lin_reg_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['P1', 'P2', 'P3', 'P4', 'P5',
                                                   'P6', 'P7', 'P8', 'P9',
                                                   'P10', 'P11', 'P12', 'P13',
                                                   'P14', 'P15', 'P16', 'P17',
                                                   'P18', 'P19', 'P20', 'P21',
                                                   'P22', 'P23', 'P24', 'P25',
                                                   'P26', 'P27', 'P28', 'P29',
                               

### SGD Regression Pipeline

In [86]:
# SGD regression: same preprocessing and top-25 feature selection as the
# linear regression pipeline, with the model fitted by stochastic gradient
# descent. SGDRegressor shuffles the training data between epochs, so the
# seed is fixed to make the fit repeatable after a kernel restart.
sgd_reg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('skb', SelectKBest(f_regression, k = 25)),
        ('sgd_reg', SGDRegressor(random_state=42)),
    ]
)

# training on the data set
sgd_reg_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['P1', 'P2', 'P3', 'P4', 'P5',
                                                   'P6', 'P7', 'P8', 'P9',
                                                   'P10', 'P11', 'P12', 'P13',
                                                   'P14', 'P15', 'P16', 'P17',
                                                   'P18', 'P19', 'P20', 'P21',
                                                   'P22', 'P23', 'P24', 'P25',
                                                   'P26', 'P27', 'P28', 'P29',
                               

### SVM Regression Pipeline

In [88]:
# Support-vector regression: shared preprocessing, top-25 feature selection
# by f_regression, then an SVR with its default settings.
svm_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('skb', SelectKBest(score_func=f_regression, k=25)),
    ('svm_reg', SVR()),
])

# fit on the training set; the fitted pipeline is the cell's displayed output
svm_reg_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['P1', 'P2', 'P3', 'P4', 'P5',
                                                   'P6', 'P7', 'P8', 'P9',
                                                   'P10', 'P11', 'P12', 'P13',
                                                   'P14', 'P15', 'P16', 'P17',
                                                   'P18', 'P19', 'P20', 'P21',
                                                   'P22', 'P23', 'P24', 'P25',
                                                   'P26', 'P27', 'P28', 'P29',
                               

## VotingRegressor

In [92]:
# Voting ensemble over the linear, SVM and SGD pipelines, fitted on the
# training set in the same statement (fit returns the estimator itself).
voting_reg = VotingRegressor(
    estimators=[
        ('lin', lin_reg_pipeline),
        ('svm', svm_reg_pipeline),
        ('sgd', sgd_reg_pipeline),
    ]
).fit(X_train, y_train)

# predictions of the ensemble for the test rows
y_pred_vr = voting_reg.predict(X_test)

# RMSE of the voting regressor's predictions against y_test
# NOTE(review): y_test comes from the sample-submission file — confirm it is
# ground truth before reading this number as a test error.
rmse_vr = np.sqrt(mean_squared_error(y_test, y_pred_vr))

print('Voting Regressor RMSE: {:.4f}'.format(rmse_vr))

Voting Regressor RMSE: 1243098.6975


### Prediction Results from Voting Regressor

In [107]:
# Submission frame: the Id column of the sample submission side by side with
# the voting regressor's predictions in a 'Prediction' column.
submission = pd.concat(
    [submission_sample['Id'], pd.DataFrame({'Prediction': y_pred_vr})],
    axis='columns',
)

# write the result without the index column, ready for submission
submission.to_csv('submission_vr.csv', index=False)

## Random Forest Regressor

In [103]:
# Random forest pipeline: shared preprocessing followed by the forest.
# Random forests are built with randomness, so the seed is fixed to keep the
# grid search and the reported RMSE repeatable after a kernel restart.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('rf', RandomForestRegressor(random_state=42)),
    ]
)

# hyper-parameter grid for GridSearchCV (4 x 5 = 20 combinations)
param_grid = [
    {
        'rf__n_estimators': [50, 100, 200, 300],
        'rf__max_depth': [2, 3, 5, 10, 20],
    }
]

# 10-fold cross-validated grid search over the random forest pipeline
rf_reg = GridSearchCV(rf_pipeline, param_grid, cv=10)

# training on the data set
rf_reg.fit(X_train, y_train)

# predicting the test data with the best configuration found by the search
y_pred_rf = rf_reg.predict(X_test)

# RMSE of the random forest predictions against y_test
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print('Random Forest Regressor RMSE: {:.4f}'.format(rmse_rf))

Random Forest Regressor RMSE: 1014335.9112


### Prediction Results from Random Forest Regressor

In [108]:
# Submission frame: the Id column of the sample submission side by side with
# the random forest's predictions in a 'Prediction' column.
submission = pd.concat(
    [submission_sample['Id'], pd.DataFrame({'Prediction': y_pred_rf})],
    axis='columns',
)

# write the result without the index column, ready for submission
submission.to_csv('submission_rf.csv', index=False)

## GBRT Regressor

In [104]:
# Gradient boosting pipeline: shared preprocessing followed by the booster.
# GradientBoostingRegressor takes a random_state that seeds its tree building;
# fixing it keeps the grid search and the reported RMSE repeatable.
gb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('gb', GradientBoostingRegressor(random_state=42)),
    ]
)

# hyper-parameter grid for GridSearchCV (4 x 5 = 20 combinations)
param_grid = [
    {
        'gb__n_estimators': [50, 100, 200, 300],
        'gb__max_depth': [2, 3, 5, 10, 20],
    }
]

# 10-fold cross-validated grid search over the gradient boosting pipeline
gb_reg = GridSearchCV(gb_pipeline, param_grid, cv=10)

# training on the data set
gb_reg.fit(X_train, y_train)

# predicting the test data with the best configuration found by the search
y_pred_gb = gb_reg.predict(X_test)

# RMSE of the gradient boosting predictions against y_test
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))

print('GBRT Regressor RMSE: {:.4f}'.format(rmse_gb))

GBRT Regressor RMSE: 1684977.7325


### Prediction Results from GBRT Regressor

In [106]:
# Submission frame: the Id column of the sample submission side by side with
# the gradient boosting predictions in a 'Prediction' column.
submission = pd.concat(
    [submission_sample['Id'], pd.DataFrame({'Prediction': y_pred_gb})],
    axis='columns',
)

# write the result without the index column, ready for submission
submission.to_csv('submission_gb.csv', index=False)