In [340]:
#importing data from kaggel
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
df = pd.read_csv("kalodatahousing.csv", header = 0)
df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


###### finding missing values if any

In [346]:
any_missing = df.isnull().values.any()

print(any_missing)

False


##### checking for outliers 

In [349]:
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

##### Feature Engineering 
##### One hot encoding to change yes and no to 0 and 1 

In [352]:
one_hot_encoded_data = pd.get_dummies(df, columns = ['mainroad', 
                                                     'guestroom', 
                                                     'basement', 
                                                     'hotwaterheating', 
                                                     'airconditioning', 
                                                     'prefarea', 
                                                     'furnishingstatus'], dtype=int)
one_hot_encoded_data

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,0,1,1,0,...,0,1,0,0,1,0,1,1,0,0
1,12250000,8960,4,4,4,3,0,1,1,0,...,0,1,0,0,1,1,0,1,0,0
2,12250000,9960,3,2,2,2,0,1,1,0,...,1,1,0,1,0,0,1,0,1,0
3,12215000,7500,4,2,2,3,0,1,1,0,...,1,1,0,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,0,1,0,1,...,1,1,0,0,1,1,0,1,0,0
5,10850000,7500,3,3,1,2,0,1,1,0,...,1,1,0,0,1,0,1,0,1,0
6,10150000,8580,4,3,4,2,0,1,1,0,...,0,1,0,0,1,0,1,0,1,0
7,10150000,16200,5,3,2,0,0,1,1,0,...,0,1,0,1,0,1,0,0,0,1
8,9870000,8100,4,1,2,2,0,1,0,1,...,1,1,0,0,1,0,1,1,0,0
9,9800000,5750,3,2,4,1,0,1,0,1,...,0,1,0,0,1,0,1,0,0,1


##### Correctly splitting the data into training and testing sets with a clear rationale.

In [355]:
X = one_hot_encoded_data.drop('price', axis=1)
y = one_hot_encoded_data['price']

In [357]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [359]:
print("80% in training "f'X_train : {X_train.shape}')
print("80% in training "f'y_train : {y_train.shape}')
print("20% in testing "f'X_test : {X_test.shape}')
print("20% in testing "f'y_test : {y_test.shape}')

80% in training X_train : (436, 20)
80% in training y_train : (436,)
20% in testing X_test : (109, 20)
20% in testing y_test : (109,)


In [361]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

##### Selection of appropriate regression models and justification for choices.

##### Linear Regression Algorithm

In [365]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [367]:
y_pred_lr = lr_model.predict(X_test)

In [386]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred)
rmse_lr = np.sqrt(mse_lr)
print(f"Mean Squared Error Linear: {mse_lr}")
print(f"R-squared Value Linear: {r2_lr}")


Mean Squared Error Linear: 1754318687330.661
R-squared Value Linear: 0.06049227307260152


In [390]:
y_pred_lr_class = [1 if i >= 0.5 else 0 for i in y_pred_lr]
accuracy_lr = accuracy_score(y_test, y_pred_lr_class)
print(f"Linear Regression Accuracy: {accuracy_lr:.2f}")

Linear Regression Accuracy: 0.00


##### Random Forest Algorithm

In [374]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [376]:
# Predict using the random forest model
y_pred_rf = rf_model.predict(X_test)

In [378]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred)
rmse_rf = np.sqrt(mse_rf)
print(f"Mean Squared Error RandomF {mse_rf}")
print(f"R-squared Value RandomF: {r2_rf}")

Mean Squared Error RandomF 2827551405963.3027
R-squared Value RandomF: 0.06049227307260152


In [380]:
# Evaluate Random Forest Model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

Random Forest Accuracy: 0.00


##### Efforts to refine and improve the model's performance through iterative tuning.

In [394]:
from sklearn.ensemble import RandomForestRegressor
rfr_model_optimized = RandomForestRegressor(n_estimators=150, random_state=42)
rfr_model_optimized.fit(X_train, y_train)

In [400]:
y_pred_rfr_optimized = rfr_model_optimized.predict(X_test)
mse_rfr_optimized = mean_squared_error(y_test, y_pred_rf_optimized)
rmse_rfr_optimized = np.sqrt(mse_rfr_optimized)
r2_rfr_optimized = r2_score(y_test, y_pred_rfr_optimized)
y_pred_rfr_optimized_class = [1 if i >= 0.5 else 0 for i in y_pred_rfr_optimized]
accuracy_rfr_optimized = accuracy_score(y_test, y_pred_rfr_optimized_class)
print(f"Optimized Random Forest - RMSE: {rmse_rfr_optimized}, R2: {r2_rfr_optimized}")
print(f"Accuracy : {accuracy_rfr_optimized}")

Optimized Random Forest - RMSE: 1402178.930682703, R2: 0.6110241508530889
Accuracy : 0.0


##### SVM 

In [208]:
from sklearn.svm import SVC

# Train an SVM classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predict using the SVM model
y_pred_svm = svm_model.predict(X_test_scaled)

# Evaluate SVM Model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

SVM Accuracy: 0.02


##### Comparing Linear, RandomForest and SVM

In [211]:
print("Model Comparison:")
print(f"Linear Regression Accuracy: {accuracy_lr:.2f}")
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")
print(f"SVM Accuracy: {accuracy_svm:.2f}")


Model Comparison:
Linear Regression Accuracy: 0.00
Random Forest Accuracy: 0.00
SVM Accuracy: 0.02
