In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the cleaned train/test splits.
train_set = pd.read_csv('./Datasets/cleaned_train.csv')
test_set = pd.read_csv('./Datasets/cleaned_test.csv')

# Drop the columns that are excluded from the feature matrix.
columns_to_drop = ['town', 'region', 'mrt_nearest', 'mall_nearest', 'mrt_planned_nearest']
X_train = train_set.drop(columns=columns_to_drop)
X_test = test_set.drop(columns=columns_to_drop)

# Categorical columns that remain after the drop.
remaining_categorical_columns = ['flat_model', 'subzone', 'planning_area']

# 'flat_model' and 'planning_area' are one-hot encoded; 'subzone' is
# integer-encoded further down.
one_hot_columns = ['flat_model', 'planning_area']

# Reinitialize the encoders. handle_unknown='ignore' makes categories that
# appear only in the test set encode as all-zero rows instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
label_encoder = LabelEncoder()

# Fit the one-hot encoder on train only, then apply it to both splits.
# The resulting frames reuse the source frames' index so the column-wise
# concat below lines rows up by label even if the index is not a default
# RangeIndex.
X_train_one_hot = encoder.fit_transform(X_train[one_hot_columns])
X_test_one_hot = encoder.transform(X_test[one_hot_columns])
one_hot_feature_names = encoder.get_feature_names_out(one_hot_columns)
X_train_one_hot = pd.DataFrame(
    X_train_one_hot, columns=one_hot_feature_names, index=X_train.index
)
X_test_one_hot = pd.DataFrame(
    X_test_one_hot, columns=one_hot_feature_names, index=X_test.index
)

# Integer-encode 'subzone' using the categories learned from train.
# LabelEncoder.transform raises ValueError on labels it has not seen, so the
# test split is mapped through the fitted classes_ with pd.Categorical
# instead: known subzones get exactly the code LabelEncoder assigned them,
# and subzones absent from train become -1 rather than aborting the cell.
X_train['subzone'] = label_encoder.fit_transform(X_train['subzone'])
X_test['subzone'] = pd.Categorical(
    X_test['subzone'], categories=label_encoder.classes_
).codes.astype('int64')

# Swap the raw categorical columns for their one-hot expansion.
X_train = pd.concat([X_train.drop(columns=one_hot_columns), X_train_one_hot], axis=1)
X_test = pd.concat([X_test.drop(columns=one_hot_columns), X_test_one_hot], axis=1)

# Give the test set exactly the training columns, in training order.
# NOTE(review): join='left' drops test-only columns and adds any train-only
# column to the test set filled with 0 -- confirm that is intended if the
# train CSV carries columns (e.g. a target) that the test CSV lacks.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

X_train.shape, X_test.shape

((60000, 70), (30000, 70))

In [11]:
# Persist the encoded feature matrices; index=False keeps the row index out
# of the written files.
for encoded_frame, csv_path in (
    (X_train, './Datasets/train_set_v1.csv'),
    (X_test, './Datasets/test_set_v1.csv'),
):
    encoded_frame.to_csv(csv_path, index=False)