In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error

# Load data
df = pd.read_csv('data/housePricing.csv')

In [95]:
# Examine data  
print(df.info())
print(df.describe().T) # Alley, FireplaceQu, PoolQC, Fence, MiscFeature should be dropped ( too much nulls )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Preprocessing

In [96]:
# Handle missing values

# Alley, FireplaceQu, PoolQC, Fence, MiscFeature should be dropped ( too much nulls )
df = df.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

# Fill NaN values with mean for numeric columns
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill NaN values with the most frequent label for object columns
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    most_frequent_label = df[col].mode().iloc[0]
    df[col] = df[col].fillna(most_frequent_label)

In [97]:
# number of unique values for object columns ( to do onehot encoding )
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    num_unique = df[col].nunique()
    print(f"Column: {col}, Number of Unique Values: {num_unique}")

Column: MSZoning, Number of Unique Values: 5
Column: Street, Number of Unique Values: 2
Column: LotShape, Number of Unique Values: 4
Column: LandContour, Number of Unique Values: 4
Column: Utilities, Number of Unique Values: 2
Column: LotConfig, Number of Unique Values: 5
Column: LandSlope, Number of Unique Values: 3
Column: Neighborhood, Number of Unique Values: 25
Column: Condition1, Number of Unique Values: 9
Column: Condition2, Number of Unique Values: 8
Column: BldgType, Number of Unique Values: 5
Column: HouseStyle, Number of Unique Values: 8
Column: RoofStyle, Number of Unique Values: 6
Column: RoofMatl, Number of Unique Values: 8
Column: Exterior1st, Number of Unique Values: 15
Column: Exterior2nd, Number of Unique Values: 16
Column: MasVnrType, Number of Unique Values: 4
Column: ExterQual, Number of Unique Values: 4
Column: ExterCond, Number of Unique Values: 5
Column: Foundation, Number of Unique Values: 6
Column: BsmtQual, Number of Unique Values: 4
Column: BsmtCond, Number 

In [98]:
# One-hot encode object columns
df = pd.get_dummies(df, columns=df.select_dtypes(include='object').columns)

In [99]:
# Feature selection
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

selector = SelectKBest(f_regression, k=50)
X_new = selector.fit_transform(X, y)

In [100]:
# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=101)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=101)

# Training

In [101]:
svr = SVR(kernel='linear')
svr.fit(X_train, y_train)

y_pred_train = svr.predict(X_train)
y_pred_valid = svr.predict(X_valid)
y_pred_test = svr.predict(X_test)


# Evaluation

In [102]:
# Print scores

mse_train = mean_squared_error(y_train, y_pred_train)
mse_valid = mean_squared_error(y_valid, y_pred_valid)
mse_test = mean_squared_error(y_test, y_pred_test)
print('Train MSE:', mse_train)
print('Validation MSE:', mse_valid)
print('Test MSE:', mse_test)

print('Train score:', svr.score(X_train, y_train))
print('Validation score:', svr.score(X_valid, y_valid)) 
print('Test score:', svr.score(X_test, y_test))

Train MSE: 1327773948.606564
Validation MSE: 3369829751.4331703
Test MSE: 1341967138.2586517
Train score: 0.789616066504691
Validation score: 0.4856514221358539
Test score: 0.7746956371240564
