# Model Training Module

Now that we have a cleaned dataset, we can finally start training models to predict real estate prices.

Import the required libraries.


In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Import specified linear algorithms
from sklearn.linear_model import ElasticNet, Ridge, Lasso

# Import specified ensemble algorithms 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Notebook display settings: render floats with three decimals, lift the
# console width limit, and show up to 100 columns of a wide frame.
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', 100)

%matplotlib inline

In [36]:
# Read the training data and discard the 'хаяг' column in a single step,
# then show the resulting (rows, columns) shape.
df = pd.read_csv('training_data.csv').drop(columns=['хаяг'])
df.shape

(5330, 34)

# Split the Dataset into Training and Test Sets

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# Target variable: the 'үнэ' column
y = df['үнэ']

# Input features: a separate frame holding every column except the target
X = df.drop(columns=['үнэ'])

In [39]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

# Sanity check: number of rows in each of the four pieces
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(4264, 1066, 4264, 1066)

# Model Pipeline


In [40]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [41]:
# One pipeline per candidate model: a fresh StandardScaler followed by the
# estimator, with every estimator seeded identically (random_state=123).
pipelines = {
    name: make_pipeline(StandardScaler(), estimator_cls(random_state=123))
    for name, estimator_cls in [
        ('lasso', Lasso),
        ('ridge', Ridge),
        ('enet', ElasticNet),
        ('rf', RandomForestRegressor),
        ('gb', GradientBoostingRegressor),
    ]
}

### Linear Regression

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Baseline model: ordinary least squares fit directly on X_train
# (no scaling step is applied in this cell).
linear = LinearRegression()
linear.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = linear.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error = ", mae)

# NOTE: for a regressor, .score() returns the coefficient of determination
# (R^2), not a classification accuracy; the labels below are kept as-is so
# they match the recorded output.
print("Training Accuracy = ", linear.score(X_train, y_train))
print("Test Accuracy     = ", linear.score(X_test, y_test))

Mean Absolute Error =  68.54502674159437
Training Accuracy =  0.7610697757545168
Test Accuracy     =  0.7466088275296585


### Decision Tree Regressor

In [43]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree: scikit-learn permutes the features at every split, so
# without random_state ties between equally good splits can resolve
# differently from run to run and the scores below would not be reproducible.
dt = DecisionTreeRegressor(min_samples_split=2, random_state=12)
dt.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = dt.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error = ", mae)

# .score() is R^2 for regressors. No depth limit is set on this tree, so
# compare the training score against the test score to judge overfitting.
print("Training Accuracy = ", dt.score(X_train, y_train))
print("Test Accuracy     = ", dt.score(X_test, y_test))

Mean Absolute Error =  54.10540978142589
Training Accuracy =  1.0
Test Accuracy     =  0.748508717361396


### Random Forest Regressor

In [44]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 1000 trees, each capped at depth 5; the fixed seed makes the
# fitted forest reproducible.
rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=5,
    random_state=12,
)
rf.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error = ", mae)
print("Training Accuracy = ", rf.score(X_train, y_train))
print("Test Accuracy     = ", rf.score(X_test, y_test))

Mean Absolute Error =  55.2154540675011
Training Accuracy =  0.8361042638903161
Test Accuracy     =  0.8058338493907393


### Random Forest with Polynomial Features

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Relies on X_train, y_train, X_test and y_test from the split cell above.

# Two-stage model: expand the inputs with degree-2 polynomial features,
# then fit the same random forest configuration on the expanded inputs.
poly_model = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=2)),
        ('rf', RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=12)),
    ]
)
poly_model.fit(X_train, y_train)

# Test-set predictions and their mean absolute error
y_pred = poly_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error =", mae)

# Pipeline score on the training and test sets
print("Training Accuracy =", poly_model.score(X_train, y_train))
print("Test Accuracy     =", poly_model.score(X_test, y_test))

Mean Absolute Error = 53.42224695359294
Training Accuracy = 0.8523215311322855
Test Accuracy     = 0.8225036517570986
