# Task 1

In [142]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [143]:
data = pd.read_csv('heart.csv')

In [144]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [145]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [146]:
print(data.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [147]:
X = data.drop('target', axis=1)
y = data['target']


scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

results = []


# L1 regularization

model_l1 = LogisticRegression(penalty='l1', solver='liblinear')

model_l1.fit(X_train, y_train)
train_acc_l1 = accuracy_score(y_train, model_l1.predict(X_train))
test_acc_l1 = accuracy_score(y_test, model_l1.predict(X_test))

results.append({'Penalty': 'L1', 'Training Accuracy': train_acc_l1, 'Testing Accuracy': test_acc_l1})


# L2 regularization

model_l2 = LogisticRegression(penalty='l2', solver='lbfgs')

model_l2.fit(X_train, y_train)
train_acc_l2 = accuracy_score(y_train, model_l2.predict(X_train))
test_acc_l2 = accuracy_score(y_test, model_l2.predict(X_test))

results.append({'Penalty': 'L2', 'Training Accuracy': train_acc_l2, 'Testing Accuracy': test_acc_l2})


# Elastic Net regularization

model_elasticnet = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)

model_elasticnet.fit(X_train, y_train)
train_acc_elasticnet = accuracy_score(y_train, model_elasticnet.predict(X_train))
test_acc_elasticnet = accuracy_score(y_test, model_elasticnet.predict(X_test))
results.append({'Penalty': 'Elastic Net', 'Training Accuracy': train_acc_elasticnet, 'Testing Accuracy': test_acc_elasticnet})


In [148]:
results_df = pd.DataFrame(results)
print("\nAll Training and Testing Accuracies:")
print(results_df)


All Training and Testing Accuracies:
       Penalty  Training Accuracy  Testing Accuracy
0           L1           0.871951          0.795122
1           L2           0.871951          0.795122
2  Elastic Net           0.871951          0.795122


In [149]:
# Erors:
# Not all solvers support all penalties. e.g For L1: liblinear works well, while lbfgs does not.
# For Elastic Net: saga is required and l1_ratio must also be specified.

# Parameter Changed:
# For Elastic Net, the l1_ratio can be tuned as it handles the balance between L1 and L2 regularization.


# Task 2

In [150]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

iris = load_iris()

X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
results = []

for s in solvers:
    model = LogisticRegression(solver=s)
    model.fit(X_train, y_train)
    
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))

    results.append({'Solver': s, 'Training Accuracy': train_accuracy, 'Testing Accuracy': test_accuracy})
    
results_df = pd.DataFrame(results)
print(results_df)


            Solver  Training Accuracy  Testing Accuracy
0            lbfgs           0.975000          1.000000
1        liblinear           0.958333          1.000000
2        newton-cg           0.975000          1.000000
3  newton-cholesky           0.950000          0.966667
4              sag           0.983333          1.000000
5             saga           0.975000          1.000000




In [151]:
# Effect of solver:

# The training and testing accuracy vary depending on the solver used.
# liblinear is better for smaller datasets, lbfgs and newton-cg perform well with medium sized datasets, while saga and sag are designed for large datasets.
# sag is the best in my case, giving the highest training and testing accuracy.



In [152]:
# Testing on heart dataset

from sklearn.preprocessing import StandardScaler

data = pd.read_csv('heart.csv')
X = data.drop('target', axis=1)
y = data['target']


scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

results_heart = []

for s in solvers:
    
    model = LogisticRegression(solver=s, penalty='l2', max_iter=1000)
    model.fit(X_train, y_train)

    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))

    results_heart.append({'Solver': s, 'Training Accuracy': train_accuracy, 'Testing Accuracy': test_accuracy})


results_heart_df = pd.DataFrame(results_heart)
print(results_heart_df)



            Solver  Training Accuracy  Testing Accuracy
0            lbfgs           0.871951          0.795122
1        liblinear           0.871951          0.795122
2        newton-cg           0.871951          0.795122
3  newton-cholesky           0.871951          0.795122
4              sag           0.871951          0.795122
5             saga           0.871951          0.795122


In [153]:
# Effect on heart dataset

# All solvers give the same training and testing accuracy on heart dataset so yes it is affected by the size of datasets


# Task 3

In [154]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Model
lr_model = LogisticRegression(solver='lbfgs')
lr_model.fit(X_train, y_train)


lr_train_acc = accuracy_score(y_train, lr_model.predict(X_train))
lr_test_acc = accuracy_score(y_test, lr_model.predict(X_test))
print("Logistic Regression Result:")
print(f"Training Accuracy: {lr_train_acc:}")
print(f"Testing Accuracy: {lr_test_acc:}")

# Perceptron Model
perc_model = Perceptron(max_iter=1000, random_state=42)
perc_model.fit(X_train, y_train)

perc_train_acc = accuracy_score(y_train, perc_model.predict(X_train))
perc_test_acc = accuracy_score(y_test, perc_model.predict(X_test))
print("\nPerceptron Result:")
print(f"Training Accuracy: {perc_train_acc:}")
print(f"Testing Accuracy: {perc_test_acc:}")




Logistic Regression Result:
Training Accuracy: 0.975
Testing Accuracy: 1.0

Perceptron Result:
Training Accuracy: 0.675
Testing Accuracy: 0.6333333333333333


In [155]:
# Difference between Perceptron and Logistic Regression

# Logistic Regressio performs better on multiclass classification problems like this Iris dataset,
# Perceptron is a linear classifier so it doesnt perform well with data that isn't linearly separable.
# This results in Perceptron giving lower training and testing accuracy on this dataset.

# Task 4

In [156]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [157]:
data = pd.read_csv('fraud_detection.csv')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Profession          10000 non-null  object
 1   Income              10000 non-null  int64 
 2   Credit_card_number  10000 non-null  int64 
 3   Expiry              10000 non-null  object
 4   Security_code       10000 non-null  int64 
 5   Fraud               10000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 468.9+ KB


In [158]:
data.head()

Unnamed: 0,Profession,Income,Credit_card_number,Expiry,Security_code,Fraud
0,DOCTOR,42509,3515418493460774,07/25,251,1
1,DOCTOR,80334,213134223583196,05/32,858,1
2,LAWYER,91552,4869615013764888,03/30,755,1
3,LAWYER,43623,341063356109385,01/29,160,1
4,DOCTOR,22962,4707418777543978402,11/30,102,0


In [159]:
data.describe()

Unnamed: 0,Income,Credit_card_number,Security_code,Fraud
count,10000.0,10000.0,10000.0,10000.0
mean,49761.206,3.851363e+17,863.5878,0.5016
std,28837.72928,1.25795e+18,1484.424959,0.500022
min,1.0,60402960000.0,0.0,0.0
25%,24863.75,180013700000000.0,275.0,0.0
50%,49483.0,3512440000000000.0,539.5,1.0
75%,74483.0,4594779000000000.0,813.25,1.0
max,99986.0,4.999697e+18,9990.0,1.0


In [160]:
data = data.drop(['Credit_card_number', 'Expiry'], axis=1)

label_encoder = LabelEncoder()
data['Profession'] = label_encoder.fit_transform(data['Profession'])

X = data.drop('Fraud', axis=1)
y = data['Fraud']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)


In [161]:

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1])) # Hidden Layer 1
model.add(Dense(64, activation='tanh')) # Hidden Layer 2
model.add(Dense(32, activation='relu')) # Hidden Layer 3
model.add(Dense(1, activation='sigmoid')) # Output Layer (Binary)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)


print("\nModel Evaluation on Test Data:")
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


y_pred = (model.predict(X_test) > 0.5).astype("int32")

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4995 - loss: 0.6971 - val_accuracy: 0.4844 - val_loss: 0.6944
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step - accuracy: 0.5089 - loss: 0.6931 - val_accuracy: 0.5075 - val_loss: 0.6942
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 825us/step - accuracy: 0.5131 - loss: 0.6928 - val_accuracy: 0.4894 - val_loss: 0.6958
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5197 - loss: 0.6925 - val_accuracy: 0.5200 - val_loss: 0.6939
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5156 - loss: 0.6930 - val_accuracy: 0.5125 - val_loss: 0.6940
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 871us/step - accuracy: 0.5189 - loss: 0.6915 - val_accuracy: 0.5006 - val_loss: 0.6940
Epoch 7/50
[1m200/200

# Task 5

## Dataset 1

In [162]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [163]:
wine_data = pd.read_csv("WineQT.csv")

wine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
 12  Id                    1143 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 116.2 KB


In [164]:
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


In [165]:
wine_data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043,804.969379
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824,463.997116
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0,411.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0,794.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0,1209.5
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0,1597.0


In [166]:
wine_data = wine_data.drop(columns=["Id"]) 
wine_features = wine_data.drop(columns=["quality"])
wine_target = wine_data["quality"]


In [167]:
# One hot encoding for multi class classification
wine_target = to_categorical(wine_target)

In [168]:
scaler = StandardScaler()
wine_features = scaler.fit_transform(wine_features)

In [169]:
X_train, X_test, y_train, y_test = train_test_split(wine_features, wine_target, test_size=0.2, random_state=42)


In [170]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))  # Hidden Layer 1
model.add(Dense(64, activation='tanh'))  # Hidden Layer 2
model.add(Dense(32, activation='relu'))  # Hidden Layer 3
model.add(Dense(y_train.shape[1], activation='softmax'))  # Output Layer (Multi-class)


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

print("\nModel Evaluation on Test Data:")
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

accuracy = accuracy_score(y_test_classes, y_pred)
precision = precision_score(y_test_classes, y_pred, average='weighted')
recall = recall_score(y_test_classes, y_pred, average='weighted')
f1 = f1_score(y_test_classes, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.3313 - loss: 1.9119 - val_accuracy: 0.5574 - val_loss: 1.2115
Epoch 2/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5530 - loss: 1.1885 - val_accuracy: 0.5956 - val_loss: 1.0197
Epoch 3/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5894 - loss: 1.1010 - val_accuracy: 0.5410 - val_loss: 1.0147
Epoch 4/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6501 - loss: 0.9353 - val_accuracy: 0.5738 - val_loss: 0.9923
Epoch 5/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6086 - loss: 0.9503 - val_accuracy: 0.5738 - val_loss: 0.9841
Epoch 6/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6481 - loss: 0.9212 - val_accuracy: 0.5464 - val_loss: 0.9885
Epoch 7/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Dataset 2

In [171]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [172]:
data = pd.read_csv("data.csv")

In [173]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [174]:
data.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [175]:
data.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,2.160815,2139.346957,14852.52,1.512065,0.007174,0.240652,3.451739,1827.265435,312.081522,1970.786304,808.608261
std,563834.7,0.908848,0.783781,963.206916,35884.44,0.538288,0.084404,0.778405,0.67723,862.168977,464.137228,29.731848,979.414536
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,322875.0,3.0,1.75,1460.0,5000.75,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,460943.5,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0,1976.0,0.0
75%,654962.5,4.0,2.5,2620.0,11001.25,2.0,0.0,0.0,4.0,2300.0,610.0,1997.0,1999.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0,2014.0,2014.0


In [176]:
data = data.drop(columns=["date", "street", "city", "statezip", "country"])

data_features = data.drop(columns=["price"])
data_target = data["price"]

In [177]:
scaler = StandardScaler()
data_features = scaler.fit_transform(data_features)


In [178]:
X_train, X_test, y_train, y_test = train_test_split(data_features, data_target, test_size=0.2, random_state=42)


In [179]:

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))  # Hidden Layer 1
model.add(Dense(64, activation='tanh'))  # Hidden Layer 2
model.add(Dense(32, activation='relu'))  # Hidden Layer 3
model.add(Dense(1, activation='linear'))  # Output Layer for Regression

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

print("\nModel Evaluation on Test Data:")
test_loss, test_mse = model.evaluate(X_test, y_test)
print(f"Test Loss (MSE): {test_loss}")

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R2 Score: {r2}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 446169088000.0000 - mean_squared_error: 446169088000.0000 - val_loss: 401454661632.0000 - val_mean_squared_error: 401454661632.0000
Epoch 2/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 891us/step - loss: 428027805696.0000 - mean_squared_error: 428027805696.0000 - val_loss: 401345413120.0000 - val_mean_squared_error: 401345413120.0000
Epoch 3/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step - loss: 450973171712.0000 - mean_squared_error: 450973171712.0000 - val_loss: 401185079296.0000 - val_mean_squared_error: 401185079296.0000
Epoch 4/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 968us/step - loss: 449342439424.0000 - mean_squared_error: 449342439424.0000 - val_loss: 400971038720.0000 - val_mean_squared_error: 400971038720.0000
Epoch 5/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 914us/step - loss: 466372034560.00