In [None]:
##################################################
# PROG8420 - Programming for Big Data           ##
# Final Project                                 ##
#                                               ##
# Submission Date: 20-Apr-2020                  ##
#                                               ##
# Team Members:                                 ##
#   1. Balamurugan Kalaiarasu (8680119)         ##
#   2. Aayush Vashistha (8681386)               ##
##################################################

# Model Selection

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Rules applied per column:
      * signed integers   -> smallest of int8/int16/int32/int64 whose range
                             contains the column's min and max (bounds inclusive)
      * unsigned integers -> smallest of uint8/uint16/uint32/uint64 likewise
      * floats            -> float32 when min and max are within float32 range
      * everything else (object, bool, datetime, categorical and other
        non-NumPy dtypes) is left untouched

    A column is never converted to a wider dtype than it already has.

    Note: float64 -> float32 keeps only about 7 significant decimal digits,
    so values may change slightly (e.g. 132.38 -> 132.380005).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are resized. Checking the
        # dtype kind (instead of the dtype's string prefix) keeps bool and
        # unsigned columns out of the float branch, where they would be
        # widened to float32, and skips dtypes whose min()/max() cannot be
        # compared against numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Default: keep the current dtype (also covers empty / all-NaN
        # columns, for which every range comparison below is False).
        target = col_type

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max fits.
                if limits.min <= c_min and c_max <= limits.max:
                    target = np.dtype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                target = np.dtype(np.float32)

        # Never widen a column: this function exists to save memory.
        if target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard against a zero-byte frame so the report cannot divide by zero.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease_pct))
    return df

In [3]:
# Read the prepared feature table from disk into a DataFrame.
# NOTE(review): the floorType_* column headers shown by dataset.head() look
# mis-decoded, so 'iso-8859-1' may not be the file's real encoding - confirm.
dataset = pd.read_csv(
    'final_data_cleaned_dummies.csv',
    encoding='iso-8859-1',
)

In [4]:
# Downcast numeric columns; reduce_mem_usage edits the frame it receives and
# returns that same object, so re-running this cell is harmless.
dataset = reduce_mem_usage(dataset)

Memory usage of dataframe is 160.55 MB --> 24.93 MB (Decreased by 84.5%)


In [5]:
# Preview the first five rows to sanity-check the loaded columns
dataset.head()

Unnamed: 0,followers,square,communityAverage,floorNumber,propertyAge,tradeTime_2003,tradeTime_2008,tradeTime_2009,tradeTime_2010,tradeTime_2011,...,district_10,district_11,district_12,district_13,floorType_µ×,floorType_¶¥,floorType_¸ß,floorType_Î´Öª,floorType_ÖÐ,totalPrice
0,106,131.0,56021.0,26.0,11.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,415.0
1,126,132.380005,71539.0,22.0,12.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,575.0
2,48,198.0,48160.0,4.0,11.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1030.0
3,138,134.0,51238.0,21.0,8.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,297.5
4,286,81.0,62588.0,6.0,56.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,392.0


In [6]:
# (number of rows, number of columns) of the loaded frame
dataset.shape

(318851, 66)

In [7]:
# Features are every column except the last; the target is the last column
# (totalPrice in the preview above). Both are pulled out as NumPy arrays.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [8]:
# Inspect the feature matrix (NumPy abbreviates the repr of large arrays)
X

array([[1.0600e+02, 1.3100e+02, 5.6021e+04, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.2600e+02, 1.3238e+02, 7.1539e+04, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [4.8000e+01, 1.9800e+02, 4.8160e+04, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       ...,
       [2.0000e+00, 1.0217e+02, 4.6927e+04, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [4.0000e+00, 1.7834e+02, 5.4842e+04, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [0.0000e+00, 9.2450e+01, 5.4282e+04, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]], dtype=float32)

In [9]:
# Inspect the target vector (NumPy abbreviates the repr of large arrays)
y

array([ 415.,  575., 1030., ...,  359.,  720.,  450.], dtype=float32)

In [10]:
# Hold out 10% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

### Using Pipeline Method

In [11]:
# Importing required libraries
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [12]:
# Creating Pipeline for SVR, Decision Tree Regression, Random Forest Regression and XG Boost Regression
#
# * Only the SVR pipeline standardises its inputs; the other three pipelines
#   contain the bare estimator with no scaling step.
# * `criterion` is left at its default (squared error). The explicit 'mse'
#   spelling was deprecated in scikit-learn 1.0 and removed in 1.2, whereas the
#   default selects the same criterion on both old and new versions.
# * random_state is fixed for the tree and the forest so repeated runs give
#   the same models (the train/test split already uses random_state=0).
# NOTE(review): SVR(kernel='linear') scales poorly with the number of samples;
# scikit-learn suggests LinearSVR for large datasets - consider switching.
pipeline_svr = Pipeline([('scale', StandardScaler()), ('svr', SVR(kernel='linear'))])
pipeline_dec = Pipeline([('decision', DecisionTreeRegressor(random_state=0))])
pipeline_ran = Pipeline([('random', RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0))])
pipeline_xg = Pipeline([('xgboost', XGBRegressor(n_estimators=100, n_jobs=-1))])

In [13]:
# Order matters: each position must match the integer key used for that
# model's label in pipe_dict.
pipelines = [
    pipeline_svr,
    pipeline_dec,
    pipeline_ran,
    pipeline_xg,
]

In [14]:
# Placeholders for the winning model's score, label and pipeline; the
# evaluation loop overwrites them.
best_score, best_regressor, best_pipeline = 0.0, 0, ""

In [15]:
%%time
# Fitting Training data
# Dictionary of pipelines and regressor types for ease of reference
pipe_dict = {0: 'Support Vector Regression', 1: 'Decision Tree Regression', 2: 'Random Forest Regression', 3: 'XgBoost'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

Wall time: 2h 35min 45s


In [16]:
%%time
# Models Evaluation
best_score=0.0
for i,regressor in enumerate(pipelines):
    score = regressor.score(X_test,y_test)
    print("{} Test Accuracy: {}".format(pipe_dict[i],score))
    if best_score < score:
        best_score = score 
        best_pipeline = regressor
        best_regressor = pipe_dict[i]
print('\n--------------------------------------\n')
print('Best Regressor - ', best_regressor)
print('Best Score - ', best_score)

Support Vector Regression Test Accuracy: 0.7880154354787211
Decision Tree Regression Test Accuracy: 0.8523314480946618
Random Forest Regression Test Accuracy: 0.9269710134397035
XgBoost Test Accuracy: 0.9254096946611579

--------------------------------------

Best Regressor -  Random Forest Regression
Best Score -  0.9269710134397035
Wall time: 8min 41s
