In [1]:
import os

import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

In [5]:
# check missing values
def check_nan(df, k):
    """Print the ``k`` columns of ``df`` that have the most missing values.

    The printed table has one row per column of ``df`` with three fields:
    ``Name`` (the column label), ``Total`` (number of null cells) and
    ``Percent`` (null cells as a percentage of the row count), ordered from
    most to least missing. Nothing is returned.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    cell_counts = null_mask.count()

    by_total = null_counts.sort_values(ascending=False)
    by_percent = ((null_counts / cell_counts) * 100).sort_values(ascending=False)

    report = pd.concat([by_total, by_percent], axis=1, keys=['Total', 'Percent'])
    # Move the column labels out of the index into a regular 'Name' column.
    report = report.reset_index()
    report.columns = ['Name', 'Total', 'Percent']
    print(report.iloc[:k])

In [7]:
# concat different aspects into DataFrame
# Feature-file stems that make up each aspect, listed in the order their
# columns are appended. Training rows are read from '<stem>.csv' and the
# matching test rows from '<stem>_test.csv'.
_ASPECT_FEATURE_STEMS = {
    'schools': ['primary_school_01', 'primary_school_02', 'primary_school_03',
                'sec_school_01', 'sec_school_02', 'sec_school_03'],
    'amenities': ['shopping_01', 'shopping_02', 'shopping_03',
                  'hawker_01', 'hawker_02', 'hawker_03'],
    'location': ['cc_01', 'cc_02', 'cc_03'],
    'transportation': ['train_01', 'train_02', 'train_03'],
}


def _read_feature_frames(data_dir, stems, suffix=''):
    """Read '<data_dir>/<stem><suffix>.csv' for every stem, preserving order."""
    return [pd.read_csv(f'{data_dir}/{stem}{suffix}.csv') for stem in stems]


def concat_aspect(df_train, df_test, aspect, data_dir='data'):
    """Append one aspect's feature files to the base tables and save the result.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Base training table; must contain a 'price' column.
    df_test : pandas.DataFrame
        Base test table.
    aspect : str
        One of 'schools', 'amenities', 'location' or 'transportation'.
    data_dir : str, default 'data'
        Directory holding the per-aspect feature CSVs; the combined tables
        are written there as 'train_concat_<aspect>.csv' and
        'test_concat_<aspect>.csv'.

    Returns
    -------
    None
        The combined tables are only written to disk. For an unknown aspect
        a message is printed and no file is read or written.

    Notes
    -----
    Frames are joined column-wise with ``pd.concat(axis=1)``, which aligns on
    the index, so the base tables and the feature files are expected to share
    the same row index.
    """
    stems = _ASPECT_FEATURE_STEMS.get(aspect) if isinstance(aspect, str) else None
    if stems is None:
        print('Invalid InputError: please check the aspect name.')
        return

    train_features = _read_feature_frames(data_dir, stems)
    test_features = _read_feature_frames(data_dir, stems, suffix='_test')

    # Select the label by name rather than by position so that 'price' ends
    # up as the last column wherever it sits in df_train, and no other column
    # is dropped by mistake.
    train_data = pd.concat(
        [df_train.drop(columns='price'), *train_features, df_train['price']],
        axis=1,
    )
    test_data = pd.concat([df_test, *test_features], axis=1)

    train_data.to_csv(f'{data_dir}/train_concat_{aspect}.csv', index=False)
    test_data.to_csv(f'{data_dir}/test_concat_{aspect}.csv', index=False)

In [8]:
# Load the cleaned base tables (comma-separated, pandas' default delimiter).
df_train = pd.read_csv('data/train_data_cleaned.csv')
df_test = pd.read_csv('data/test_data_cleaned.csv')
# Shown as the cell output: (train shape, test shape).
tuple(frame.shape for frame in (df_train, df_test))

((25714, 16), (7500, 15))

In [9]:
# Write the combined train/test CSVs for every aspect, one aspect at a time.
for aspect_name in ('schools', 'amenities', 'location', 'transportation'):
    concat_aspect(df_train, df_test, aspect_name)

In [10]:
# read train and test data
def read_data(aspect):
    """Load the train and test tables for one aspect as TabularDatasets.

    Parameters
    ----------
    aspect : str
        'basic' loads the cleaned base tables; 'schools', 'amenities',
        'location' and 'transportation' load the corresponding
        'data/train_concat_<aspect>.csv' / 'data/test_concat_<aspect>.csv'.

    Returns
    -------
    tuple
        ``(train_data, test_data)``.

    Raises
    ------
    ValueError
        If ``aspect`` is not one of the names above. Callers unpack the
        result into two variables, so returning None here would only surface
        later as an unrelated-looking TypeError.
    """
    valid_aspects = ['basic', 'schools', 'amenities', 'location', 'transportation']
    if aspect not in valid_aspects:
        raise ValueError(
            'Invalid InputError: please check the aspect name. '
            'Got {!r}; expected one of {}.'.format(aspect, valid_aspects)
        )

    if aspect == 'basic':
        train_path = 'data/train_data_cleaned.csv'
        test_path = 'data/test_data_cleaned.csv'
    else:
        train_path = 'data/train_concat_' + aspect + '.csv'
        test_path = 'data/test_concat_' + aspect + '.csv'

    return TabularDataset(train_path), TabularDataset(test_path)

In [32]:
# fit and predict
aspect = 'basic' # 'basic', 'schools', 'amenities','location','transportation'
# True: fit on log(price) and exponentiate the predictions before saving, so
# the RMSE AutoGluon reports is measured in log space. False: fit on raw price.
log_tf = False
label = 'price'

# Columns removed for each aspect before fitting; 'basic' drops nothing.
columns_to_drop = {
    # keep primary_school_0to1, sec_school_0to3
    'schools': ['primary_school_0to2', 'primary_school_0to3', 'sec_school_0to1', 'sec_school_0to2'],
    # keep shopping_0to2, hawker_0to3
    'amenities': ['shopping_0to1', 'shopping_0to3', 'hawker_0to1', 'hawker_0to2'],
    # keep cc_0to3
    'location': ['cc_0to1', 'cc_0to2'],
    # keep train_0to3
    'transportation': ['train_0to1', 'train_0to2'],
}

train_data, test_data = read_data(aspect)
redundant_columns = columns_to_drop.get(aspect, [])
if redundant_columns:
    train_data = train_data.drop(columns=redundant_columns)
    test_data = test_data.drop(columns=redundant_columns)

# incorporate new feature
for frame in (train_data, test_data):
    bed_bath_ratio = frame['bedrooms'] / frame['bathrooms']
    # A zero bathroom count yields +/-inf; store it as missing instead so the
    # models see NaN rather than an infinite feature value.
    frame['bed_bath_ratio'] = bed_bath_ratio.replace([np.inf, -np.inf], np.nan)

print(train_data.shape, test_data.shape)

if log_tf:
    train_data[label] = np.log(train_data[label])
predictor = TabularPredictor(label=label).fit(train_data)

preds = predictor.predict(test_data)
# Predictions are in log space when log_tf is set; map them back to prices.
predicted = np.exp(preds) if log_tf else preds

df_id = pd.DataFrame({'Id': range(test_data.shape[0])})
submission = pd.DataFrame({'Id': df_id['Id'], 'Predicted': predicted})

# Create the output directory up front so the fitted predictions are not
# lost to a missing folder.
os.makedirs('results', exist_ok=True)
suffix = '_newf_log' if log_tf else '_newf'
submission.to_csv('results/submission_autogluon_' + aspect + suffix + '.csv', index = False)

Loaded data from: data/train_data_cleaned.csv | Columns = 16 / 16 | Rows = 25714 -> 25714
Loaded data from: data/test_data_cleaned.csv | Columns = 15 / 15 | Rows = 7500 -> 7500
No path specified. Models will be saved in: "AutogluonModels/ag-20220407_104203/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220407_104203/"
AutoGluon Version:  0.3.1
Train Data Rows:    25714
Train Data Columns: 16
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (74800000.0, 556600.0, 2992606.73952, 4255803.28904)
	If 'regression' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:           

(25714, 17) (7500, 16)


	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
		Fitting TextSpecialFeatureGenerator...
			Fitting BinnedFeatureGenerator...
			Fitting DropDuplicatesFeatureGenerator...
		Fitting TextNgramFeatureGenerator...
			Fitting CountVectorizer for text features: ['street']
			CountVectorizer fit with vocabulary size = 96
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])        : 8 | ['bedrooms', 'bathrooms', 'lat', 'lng', 'built_year', ...]
		('int', [])          : 2 | ['district', 'tenure']
		('object', [])       : 5 | ['name', 'model', 'region', 'planning_area', 'subszone']
		('object', ['text']) : 1 | ['street']
	Types of features in processed data (raw dtype, special dtypes):
		('category', [])                    :  5 | ['name', 'model', 'region', 'planning_area', 'subszone']
		('category', ['text_

[1000]	train_set's rmse: 509018	valid_set's rmse: 891219
[2000]	train_set's rmse: 407114	valid_set's rmse: 843994
[3000]	train_set's rmse: 346240	valid_set's rmse: 808886
[4000]	train_set's rmse: 315416	valid_set's rmse: 792994
[5000]	train_set's rmse: 293219	valid_set's rmse: 776120
[6000]	train_set's rmse: 275580	valid_set's rmse: 761878
[7000]	train_set's rmse: 262513	valid_set's rmse: 753790
[8000]	train_set's rmse: 251586	valid_set's rmse: 746145
[9000]	train_set's rmse: 242583	valid_set's rmse: 738841
[10000]	train_set's rmse: 235352	valid_set's rmse: 734022


	-733945.5397	 = Validation score   (root_mean_squared_error)
	14.86s	 = Training   runtime
	0.81s	 = Validation runtime
Fitting model: LightGBM ...
	-744021.2754	 = Validation score   (root_mean_squared_error)
	1.69s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-851406.1279	 = Validation score   (root_mean_squared_error)
	12.94s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost ...
	-612777.254	 = Validation score   (root_mean_squared_error)
	23.59s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-848284.0469	 = Validation score   (root_mean_squared_error)
	8.57s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-738020.1363	 = Validation score   (root_mean_squared_error)
	30.38s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: XGBoost ...
	-935253.1804	 = Validation score   (root_mean_squared_error)
	1.4s	 = Training   runtime
	