In [12]:
# Import libs
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np


In [13]:
# Load the raw Ames Housing table from the CSV file (path is relative to the notebook)
csv_path = '../data/AmesHousing.csv'
data = pd.read_csv(csv_path)

In [14]:
# One-hot encode the categorical columns into numeric indicator columns.
# drop_first=True keeps k-1 indicator columns for a category with k levels.
data_encoded = pd.get_dummies(data=data, drop_first=True)

In [15]:
# Pairwise correlations between all encoded columns
# NOTE(review): this is computed on every row, before the train/test split
# further down -- confirm that using held-out rows for feature selection is acceptable.
correlation_matrix = data_encoded.corr()

# Strength of each column's relation to SalePrice; the sign is discarded so
# strongly negative relations qualify as well
saleprice_corr = correlation_matrix.loc[:, 'SalePrice'].abs()

# Keep the labels whose absolute correlation is strictly greater than 0.5
# (SalePrice itself is part of this selection and is removed later)
is_strong = saleprice_corr.gt(0.5)
significant_corr_columns = saleprice_corr.index[is_strong]

# Show the selected column labels
print("Columns with salesprice strong relation (>|0.5|):", significant_corr_columns, sep="\n")

Columns with salesprice strong relation (>|0.5|):
Index(['Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'Total Bsmt SF', '1st Flr SF', 'Gr Liv Area', 'Full Bath',
       'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'SalePrice',
       'Exter Qual_TA', 'Foundation_PConc', 'Kitchen Qual_TA'],
      dtype='object')


In [16]:
# Target variable: the sale price column of the encoded frame
y = data_encoded['SalePrice']

# Predictor variables: every strongly correlated column except the target itself
features = significant_corr_columns.drop('SalePrice')
X = data_encoded.loc[:, features]


In [20]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each predictor split
for split_name, split_frame in (('training', X_train), ('test', X_test)):
    print(f'Size of {split_name} dataset: {split_frame.shape}')


Size of training dataset: (2344, 14)
Size of test dataset: (586, 14)


In [18]:
# Init Scaler
scaler = StandardScaler()

# StandardScaler ignores NaN while fitting and passes it straight through
# transform(), so any missing value in the selected columns would end up in the
# scaled arrays and break most downstream estimators. The selected columns may
# contain missing values, so fill them first, using medians learned from the
# training rows only (no test-set statistic leaks into preprocessing).
X_train_numeric = X_train.astype('float64')  # indicator columns may be bool/uint8
X_test_numeric = X_test.astype('float64')
train_medians = X_train_numeric.median()
X_train_filled = X_train_numeric.fillna(train_medians)
X_test_filled = X_test_numeric.fillna(train_medians)

# Fit the scaler on the training split only, then apply that same
# transformation to both the training and the test split
X_train_scaled = scaler.fit_transform(X_train_filled)
X_test_scaled = scaler.transform(X_test_filled)


In [19]:
# Persist the scaled predictors and the targets as .npy files
# (paths are relative, so they land in the current working directory)
outputs = (
    ('X_train_scaled.npy', X_train_scaled),
    ('X_test_scaled.npy', X_test_scaled),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
)
for filename, values in outputs:
    np.save(filename, values)