1. Implement logistic regression algorithm from scratch (preferably in python) and test your code on a tiny
dataset of your choice. You might want to break down the full algorithm into small functions.

In [2]:
from math import exp
# Make a prediction with coefficients
def predict(row, w):
    """Return the logistic (sigmoid) prediction for one row.

    row: sequence whose last element is the class label; only the
         elements before it are used as features.
    w:   coefficients; w[i] multiplies row[i] and the LAST element,
         w[-1], is the bias/intercept.

    Returns a float in [0, 1].
    """
    yhat = w[-1]  # bias is stored in the last position
    for i in range(len(row) - 1):
        yhat += w[i] * row[i]
    # Numerically stable sigmoid: never exponentiate a large positive
    # number, which would make math.exp raise OverflowError.
    if yhat >= 0:
        return 1.0 / (1.0 + exp(-yhat))
    e = exp(yhat)
    return e / (1.0 + e)

# Test step 1: run predict() on a small two-feature dataset with known weights.
# Each row is [x1, x2, label]; the label (last element) is 0 or 1.
dataset = [[2.7810836,2.550537003,0], [1.465489372,2.362125076,0],
 [3.396561688,4.400293529,0], [1.38807019,1.850220317,0],
 [3.06407232,3.005305973,0], [7.627531214,2.759262235,1],
 [5.332441248,2.088626775,1], [6.922596716,1.77106367,1],
 [8.675418651,-0.242068655,1], [7.673756466,3.508563011,1]
 ]

weights = [0.6067, -0.8446,-0.2826] # fixed weights, ordered [w_x1, w_x2, bias] (predict reads the bias from w[-1])
for row in dataset:
  yhat = predict(row, weights)
  # print the true label, the predicted probability and the probability rounded to a class
  print("Expected=%.3f, Predicted=%.3f [%d]"% (row[-1], yhat, round(yhat)))

Expected=0.000, Predicted=0.321 [0]
Expected=0.000, Predicted=0.200 [0]
Expected=0.000, Predicted=0.126 [0]
Expected=0.000, Predicted=0.268 [0]
Expected=0.000, Predicted=0.276 [0]
Expected=1.000, Predicted=0.882 [1]
Expected=1.000, Predicted=0.766 [1]
Expected=1.000, Predicted=0.918 [1]
Expected=1.000, Predicted=0.994 [1]
Expected=1.000, Predicted=0.804 [1]


In [3]:
# Estimate logistic regression coefficients
#using stochastic gradient descent
def weights_sgd(train, l_rate, n_epoch):
    """Estimate coefficients with stochastic gradient descent.

    train:   rows whose last element is the target value.
    l_rate:  step size applied to every update.
    n_epoch: number of full passes over train.

    The returned list has one entry per row element: w[i] pairs with
    row[i] and w[-1] is the bias. One progress line with the summed
    squared error is printed per epoch.
    """
    w = [0.0] * len(train[0])
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, w)
            error = row[-1] - yhat
            sum_error += error**2
            # Factor shared by the bias and every feature weight; the
            # yhat * (1 - yhat) term assumes predict() returns a sigmoid output.
            step = l_rate * error * yhat * (1.0 - yhat)
            w[-1] = w[-1] + step
            for i, feature in enumerate(row[:-1]):
                w[i] = w[i] + step * feature
        print('>epoch=%d, lrate=%.3f, error=%.3f'% (epoch, l_rate, sum_error))
    return w

# Learn the coefficients on the toy dataset, then show the fitted predictions.
l_rate = 0.03
n_epoch = 50
w = weights_sgd(dataset, l_rate, n_epoch)
print(w)

# Evaluate on the same rows that were used for training.
for row in dataset:
  yhat = predict(row, w)
  print("Expected=%.3f, Predicted=%.3f [%d]" % (row[-1], yhat, round(yhat)))

>epoch=0, lrate=0.030, error=2.371
>epoch=1, lrate=0.030, error=2.083
>epoch=2, lrate=0.030, error=1.948
>epoch=3, lrate=0.030, error=1.841
>epoch=4, lrate=0.030, error=1.740
>epoch=5, lrate=0.030, error=1.646
>epoch=6, lrate=0.030, error=1.559
>epoch=7, lrate=0.030, error=1.479
>epoch=8, lrate=0.030, error=1.405
>epoch=9, lrate=0.030, error=1.338
>epoch=10, lrate=0.030, error=1.276
>epoch=11, lrate=0.030, error=1.219
>epoch=12, lrate=0.030, error=1.166
>epoch=13, lrate=0.030, error=1.118
>epoch=14, lrate=0.030, error=1.073
>epoch=15, lrate=0.030, error=1.031
>epoch=16, lrate=0.030, error=0.993
>epoch=17, lrate=0.030, error=0.956
>epoch=18, lrate=0.030, error=0.923
>epoch=19, lrate=0.030, error=0.891
>epoch=20, lrate=0.030, error=0.862
>epoch=21, lrate=0.030, error=0.834
>epoch=22, lrate=0.030, error=0.808
>epoch=23, lrate=0.030, error=0.783
>epoch=24, lrate=0.030, error=0.760
>epoch=25, lrate=0.030, error=0.738
>epoch=26, lrate=0.030, error=0.718
>epoch=27, lrate=0.030, error=0.698
>e

In [4]:
# Find the min and max values for each column
def dataset_minmax(dataset):
    """Return [min, max] for every column of dataset.

    The number of columns is taken from the first row. The result is a
    list with one two-element list per column, in column order.
    """
    n_columns = len(dataset[0])
    columns = ([row[c] for row in dataset] for c in range(n_columns))
    return [[min(values), max(values)] for values in columns]

def normalize_dataset(dataset, minmax):
    """Rescale every column of dataset to the range [0, 1], in place.

    dataset: list of mutable rows; every element of each row is rescaled,
             including the last one.
    minmax:  per-column [min, max] pairs, indexed like the row elements.

    A column whose min equals its max carries no information and would
    divide by zero, so its values are set to 0.0 instead.
    Returns None; the rows are modified directly.
    """
    for row in dataset:
        for i in range(len(row)):
            low = minmax[i][0]
            span = minmax[i][1] - low
            row[i] = (row[i] - low) / span if span else 0.0

def cross_validation_split(dataset, n_folds):
    """Randomly partition dataset into n_folds equally sized folds.

    Rows are drawn without replacement from a shallow copy, so dataset
    itself is not modified. Each fold holds int(len(dataset) / n_folds)
    rows; any leftover rows are not placed in a fold.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        folds.append([
            remaining.pop(random.randrange(len(remaining)))
            for _ in range(rows_per_fold)
        ])
    return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where predicted equals actual."""
    total = len(actual)
    hits = sum(1 for i in range(total) if actual[i] == predicted[i])
    return hits / float(total) * 100.0

# Calculate root mean squared error
def rmse_metric(actual, predicted):
    """Return the root mean squared error between predicted and actual."""
    squared_total = 0.0
    for i, target in enumerate(actual):
        residual = predicted[i] - target
        squared_total += residual ** 2
    return sqrt(squared_total / float(len(actual)))

def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score algorithm with k-fold cross-validation.

    dataset:   rows whose last element is the target.
    algorithm: callable invoked as algorithm(train_rows, test_rows, *args)
               that returns one prediction per test row.
    n_folds:   number of folds passed to cross_validation_split.

    Returns a list with one accuracy_metric score (a percentage) per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        # Every fold except the held-out one, flattened into a single row list.
        others = list(folds)
        others.remove(fold)
        train_set = [row for other in others for row in other]

        # Copy the held-out rows and blank the target so it cannot leak.
        test_set = []
        for row in fold:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

def logistic_regression(train, test, l_rate, n_epoch):
    """Fit coefficients on train with weights_sgd and classify every row of test.

    Returns a list with round(predict(row, w)) for each test row.
    """
    w = weights_sgd(train, l_rate, n_epoch)
    return [round(predict(row, w)) for row in test]



In [5]:
import random
# Rescale the dataset in place using its per-column min/max
# (normalize_dataset touches every element of each row, label included).
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

# evaluate algorithm with k-fold cross-validation; extra arguments after
# n_folds are forwarded to logistic_regression
n_folds = 5
l_rate = 0.1
n_epoch = 50
scores = evaluate_algorithm(dataset, logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
# scores are accuracy percentages, one per fold; report their mean
print('Mean Accuracy: %.3f%%'% (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.100, error=2.009
>epoch=1, lrate=0.100, error=1.984
>epoch=2, lrate=0.100, error=1.960
>epoch=3, lrate=0.100, error=1.936
>epoch=4, lrate=0.100, error=1.913
>epoch=5, lrate=0.100, error=1.890
>epoch=6, lrate=0.100, error=1.868
>epoch=7, lrate=0.100, error=1.846
>epoch=8, lrate=0.100, error=1.824
>epoch=9, lrate=0.100, error=1.803
>epoch=10, lrate=0.100, error=1.782
>epoch=11, lrate=0.100, error=1.762
>epoch=12, lrate=0.100, error=1.742
>epoch=13, lrate=0.100, error=1.723
>epoch=14, lrate=0.100, error=1.703
>epoch=15, lrate=0.100, error=1.684
>epoch=16, lrate=0.100, error=1.666
>epoch=17, lrate=0.100, error=1.648
>epoch=18, lrate=0.100, error=1.630
>epoch=19, lrate=0.100, error=1.612
>epoch=20, lrate=0.100, error=1.595
>epoch=21, lrate=0.100, error=1.578
>epoch=22, lrate=0.100, error=1.561
>epoch=23, lrate=0.100, error=1.545
>epoch=24, lrate=0.100, error=1.528
>epoch=25, lrate=0.100, error=1.513
>epoch=26, lrate=0.100, error=1.497
>epoch=27, lrate=0.100, error=1.482
>e

2. Implement linear regression algorithm from scratch (preferably in python) and test your code on a tiny dataset
of your choice. You might want to break down the full algorithm into small functions.

In [6]:
from math import exp
# Make a prediction with coefficients
def predict(row, w):
    """Return the linear prediction w[-1] + sum(w[i] * row[i]) for one row.

    The last element of row is the target and is not used; the bias is
    the last element of w.
    """
    total = w[-1]
    for i, feature in enumerate(row[:-1]):
        total += w[i] * feature
    return total

In [7]:
# Test step 1: run predict() on a tiny dataset with known weights.
# Each row is [x, y]; the last element is the target.
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
weights = [0.84, 0.24] # fixed weights, ordered [slope, bias] (predict reads the bias from w[-1])
for row in dataset:
 yhat = predict(row, weights)
 print("Expected=%.3f, Predicted=%.3f" % (row[-1], yhat))

Expected=1.000, Predicted=1.080
Expected=3.000, Predicted=1.920
Expected=3.000, Predicted=3.600
Expected=2.000, Predicted=2.760
Expected=5.000, Predicted=4.440


In [8]:
 # Estimate linear regression coefficients using stochastic gradient descent
def weights_sgd(train, l_rate, n_epoch):
    """Estimate linear-regression coefficients with stochastic gradient descent.

    train:   rows whose last element is the target value.
    l_rate:  step size applied to every update.
    n_epoch: number of full passes over train.

    Returns a list with one coefficient per row element: w[i] pairs with
    row[i] and w[-1] is the bias. One progress line with the summed
    squared error is printed per epoch.
    """
    w = [0.0] * len(train[0])
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            error = row[-1] - predict(row, w)
            sum_error += error**2
            # l_rate * error is common to the bias and every feature update.
            step = l_rate * error
            w[-1] = w[-1] + step
            for i, feature in enumerate(row[:-1]):
                w[i] = w[i] + step * feature
        print('>epoch=%d, lrate=%.3f, error=%.3f'% (epoch, l_rate, sum_error))
    return w

In [9]:
# Learn the coefficients on the toy dataset, then show the fitted predictions.
l_rate = 0.001
n_epoch = 50
w = weights_sgd(dataset, l_rate, n_epoch)
print(w)
# Evaluate on the same rows that were used for training; the bracketed
# value is the prediction rounded to the nearest integer.
for row in dataset:
  yhat = predict(row, w)
  print("Expected=%.3f, Predicted=%.3f [%d]" % (row[-1], yhat, round(yhat)))

>epoch=0, lrate=0.001, error=46.236
>epoch=1, lrate=0.001, error=41.305
>epoch=2, lrate=0.001, error=36.930
>epoch=3, lrate=0.001, error=33.047
>epoch=4, lrate=0.001, error=29.601
>epoch=5, lrate=0.001, error=26.543
>epoch=6, lrate=0.001, error=23.830
>epoch=7, lrate=0.001, error=21.422
>epoch=8, lrate=0.001, error=19.285
>epoch=9, lrate=0.001, error=17.389
>epoch=10, lrate=0.001, error=15.706
>epoch=11, lrate=0.001, error=14.213
>epoch=12, lrate=0.001, error=12.888
>epoch=13, lrate=0.001, error=11.712
>epoch=14, lrate=0.001, error=10.668
>epoch=15, lrate=0.001, error=9.742
>epoch=16, lrate=0.001, error=8.921
>epoch=17, lrate=0.001, error=8.191
>epoch=18, lrate=0.001, error=7.544
>epoch=19, lrate=0.001, error=6.970
>epoch=20, lrate=0.001, error=6.461
>epoch=21, lrate=0.001, error=6.009
>epoch=22, lrate=0.001, error=5.607
>epoch=23, lrate=0.001, error=5.251
>epoch=24, lrate=0.001, error=4.935
>epoch=25, lrate=0.001, error=4.655
>epoch=26, lrate=0.001, error=4.406
>epoch=27, lrate=0.001,

In [10]:
from random import seed
from math import sqrt
# linear Regression Algorithm With Stochastic Gradient Descent
def linear_regression(train, test, l_rate, n_epoch):
    """Fit coefficients on train with weights_sgd and predict every row of test.

    Returns a list with predict(row, w) for each test row.
    """
    w = weights_sgd(train, l_rate, n_epoch)
    return [predict(row, w) for row in test]




In [11]:
seed(1)
# Rescale every column (features and target) to [0, 1] in place.
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5
l_rate = 0.001
n_epoch = 50
# evaluate_algorithm() scores folds with accuracy_metric (exact equality),
# which is meaningless for real-valued predictions. Run the k-fold loop
# here and score each held-out fold with rmse_metric, so the number
# printed as "RMSE" really is one (in normalized target units).
folds = cross_validation_split(dataset, n_folds)
scores = list()
for held_out, fold in enumerate(folds):
  train_set = [row for k, other in enumerate(folds) if k != held_out
               for row in other]
  # Blank the target in the test copies so it cannot leak into prediction.
  test_set = [list(row[:-1]) + [None] for row in fold]
  predicted = linear_regression(train_set, test_set, l_rate, n_epoch)
  actual = [row[-1] for row in fold]
  scores.append(rmse_metric(actual, predicted))
print('Scores: %s' % scores)
print('Mean RMSE: %.3f'
 % (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.001, error=1.310
>epoch=1, lrate=0.001, error=1.299
>epoch=2, lrate=0.001, error=1.289
>epoch=3, lrate=0.001, error=1.278
>epoch=4, lrate=0.001, error=1.268
>epoch=5, lrate=0.001, error=1.258
>epoch=6, lrate=0.001, error=1.248
>epoch=7, lrate=0.001, error=1.238
>epoch=8, lrate=0.001, error=1.228
>epoch=9, lrate=0.001, error=1.219
>epoch=10, lrate=0.001, error=1.209
>epoch=11, lrate=0.001, error=1.200
>epoch=12, lrate=0.001, error=1.190
>epoch=13, lrate=0.001, error=1.181
>epoch=14, lrate=0.001, error=1.172
>epoch=15, lrate=0.001, error=1.163
>epoch=16, lrate=0.001, error=1.154
>epoch=17, lrate=0.001, error=1.145
>epoch=18, lrate=0.001, error=1.136
>epoch=19, lrate=0.001, error=1.127
>epoch=20, lrate=0.001, error=1.119
>epoch=21, lrate=0.001, error=1.110
>epoch=22, lrate=0.001, error=1.102
>epoch=23, lrate=0.001, error=1.093
>epoch=24, lrate=0.001, error=1.085
>epoch=25, lrate=0.001, error=1.077
>epoch=26, lrate=0.001, error=1.069
>epoch=27, lrate=0.001, error=1.061
>e

3. Please see the house prices dataset (https://www.kaggle.com/competitions/house-prices-advanced-
regression-techniques/data) on Kaggle. Use the appropriate algorithm (from question 1 or 2) to learn a model
from the training set and predict the prices for the test set.
  1. Report your average error in the prediction.


I have decided to use linear regression to predict the prices for the test set. Logistic regression is used when the goal is to estimate the probability of a binary outcome, while linear regression is used when the goal is to predict a continuous numerical value, which is the case here: the sale price is a continuous quantity.

#

In [12]:
import numpy as np
import pandas as pd


In [13]:
# Load the test and train CSV files from the current working directory
# (pd.read_csv raises FileNotFoundError if they are not there).
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
train.head()

FileNotFoundError: ignored

In [None]:
# Percentage of missing (null) values in each column of the training data
train.isnull().sum()*100/train.shape[0]

In [None]:
varlist =  ['CentralAir']

# Defining the map function
def binary_map(x):
    """Map a yes/no column to 1/0.

    x: a pandas Series of flags. The House Prices data encodes
       CentralAir as 'Y'/'N'; the spelled-out 'yes'/'no' forms are
       still accepted. Any other value becomes NaN.
    """
    return x.map({'Y': 1, 'N': 0, 'yes': 1, 'no': 0})

# Applying the function to the columns named in varlist
train[varlist] = train[varlist].apply(binary_map)

# Correlation of every numeric column with SalePrice.
# The frame still holds string columns, and DataFrame.corr() raises on
# those in pandas >= 2.0, so restrict it to numeric columns first.
corr_y = train.select_dtypes(include='number').corr()
# Sorted in descending order; [1:] skips the first entry (SalePrice with itself).
corr_y['SalePrice'].sort_values(ascending=False).abs()[1:]


In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Select the integer-typed (int64) columns only; float and object columns are left out
num_columns = [col for col in train.columns if train[col].dtype == 'int64']

corr = train[num_columns].corr().abs()

# Hide the upper triangle (and diagonal) so each pair is shown once.
# Use the builtin bool: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(15,8))

# plot heatmap
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1)
plt.show()

In [None]:
# Drop the 'Id' and 'MoSold' columns from the integer feature list; the author
# judged them to have no impact on the sale price.
num_col2 = [x for x in num_columns if x!='Id' and x!='MoSold' ]
# Display the remaining column names and how many there are
num_col2 , len(num_col2)


In [None]:
# Plot SalePrice against every column of train, five columns per figure.
for i in range(0, len(train.columns), 5):
        sns.pairplot(data=train,
                    x_vars=train.columns[i:i+5],
                    y_vars=['SalePrice'])

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Assumes the data is already loaded into the DataFrame `train` and that `num_col2` is defined
# NOTE(review): `train` is overwritten below with only the num_col2 columns.

# Separate the features (x_train) and target variable (y_train)
train = train[num_col2]
x_train = train.iloc[:, :-1].values  # Features: every column except the last
y_train = np.log1p(train.iloc[:, -1:].values)  # Target: log1p of the last column (presumably SalePrice - verify column order); kept 2-D

# Standardize (scale) the features manually
mean = x_train.mean(axis=0)
std_dev = x_train.std(axis=0)
# NOTE(review): a constant column has std_dev 0 and would divide by zero here
x_train = (x_train - mean) / std_dev

# Linear Regression
def _with_intercept(x):
    """Prepend a column of ones to x so the first coefficient is the intercept."""
    return np.hstack((np.ones((x.shape[0], 1)), x))


def linear_regression(x_train, y_train, x_val=None, y_val=None):
    """Fit ordinary least squares on the training data and return an RMSE.

    x_train: 2-D array of features, one row per sample.
    y_train: targets for x_train (same number of rows).
    x_val, y_val: optional held-out features/targets. When both are
        given the RMSE is computed on them; otherwise it is computed on
        the training data itself.

    Previously the coefficients were initialised to zero and never
    fitted, so the returned RMSE was that of an all-zero prediction.
    """
    # lstsq returns the minimum-norm least-squares solution, so it also
    # copes with collinear (rank-deficient) feature matrices.
    coefficients = np.linalg.lstsq(_with_intercept(x_train), y_train,
                                   rcond=None)[0]

    if x_val is None or y_val is None:
        x_val, y_val = x_train, y_train
    y_pred = np.dot(_with_intercept(x_val), coefficients)

    # Calculate RMSE (Root Mean Squared Error)
    return np.sqrt(np.mean((y_pred - y_val) ** 2))

# Cross-validation
def cross_validation(x_train, y_train, cv=4):
    """Return the held-out RMSE of linear_regression for each of cv folds.

    x_train, y_train: 2-D arrays with one row per sample.
    cv: number of contiguous folds (at least 2). Each fold has
        n // cv rows; trailing rows that do not fill a fold are only
        ever used for training.

    Raises ValueError when cv < 2 or when there are fewer rows than folds.
    """
    if cv < 2:
        raise ValueError("cv must be at least 2")
    n = x_train.shape[0]
    fold_size = n // cv
    if fold_size == 0:
        raise ValueError("cv cannot exceed the number of samples")

    rmses = []
    for i in range(cv):
        start = i * fold_size
        end = start + fold_size

        x_val = x_train[start:end, :]
        y_val = y_train[start:end, :]

        x_train_fold = np.vstack((x_train[:start, :], x_train[end:, :]))
        y_train_fold = np.vstack((y_train[:start, :], y_train[end:, :]))

        # Score on the held-out fold, not on the rows the model was fitted to.
        rmses.append(linear_regression(x_train_fold, y_train_fold,
                                       x_val, y_val))

    return rmses

# Run 4-fold cross-validation on the prepared features and target
cv_scores = cross_validation(x_train, y_train, cv=4)
print("Cross-validation RMSE scores:", cv_scores)

# Calculate Average RMSE across folds (y_train was log1p-transformed, so this is on the log scale)
average_rmse = np.mean(cv_scores)
print("Average RMSE:", average_rmse)


4. Please modify the above dataset (in question 3) to answer whether the house will sell for 180000 or not and
use the appropriate algorithm (from question 1 or 2) to learn a model from the training set and answer
whether the prices > 180000 or not for the test set. 30 pts
  1. Report your accuracy. 10 pts


This question uses logistic regression, since the target is binary: whether the house's sale price is above 180,000 (1) or not (0). Logistic regression is definitely the appropriate choice for this question and is implemented below.

In [None]:
import numpy as np
import pandas as pd

# Define the logistic regression function
def logistic_regression(train, weights):
    """Classify every row with the given weights.

    Returns a list with round(predict(row, weights)) for each row of train.
    """
    return [round(predict(row, weights)) for row in train]

# Define the weights_sgd function
def weights_sgd(train, l_rate, n_epoch):
    """Estimate coefficients with stochastic gradient descent.

    train:   rows whose last element is treated as the target.
    l_rate:  step size applied to every update.
    n_epoch: number of full passes over train.

    Returns a list with one coefficient per row element: weights[i] is
    updated with row[i] and weights[-1] is updated as the bias. One
    progress line with the summed squared error is printed per epoch.
    """
    weights = [0.0] * len(train[0])
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            error = row[-1] - predict(row, weights)
            sum_error += error ** 2
            # l_rate * error is common to the bias and every feature update.
            step = l_rate * error
            weights[-1] = weights[-1] + step
            for i, feature in enumerate(row[:-1]):
                weights[i] = weights[i] + step * feature
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return weights

# Define the updated predict function
def predict(row, weights):
    """Return the logistic (sigmoid) prediction for one row.

    row:     sequence whose last element is the label slot; only the
             elements before it are used as features.
    weights: coefficients laid out the way weights_sgd writes them:
             weights[i] multiplies row[i] and weights[-1] is the bias.
             (Reading the bias from weights[0] and features from
             weights[i + 1] paired every feature with the wrong
             coefficient.)

    Returns a value in [0, 1].
    """
    activation = weights[-1]
    for i in range(len(row) - 1):
        activation += weights[i] * row[i]
    # Numerically stable sigmoid: only exponentiate non-positive values
    # so np.exp never overflows.
    if activation >= 0:
        return 1.0 / (1.0 + np.exp(-activation))
    e = np.exp(activation)
    return e / (1.0 + e)

df = pd.read_csv('train.csv')

#1 if SalePrice > 180,000, else 0
df['target_class'] = (df['SalePrice'] > 180000).astype(int)

# Define features (X) and the binary target variable (y)
X = df.drop(columns=['Id', 'SalePrice', 'target_class'])  # Drop unnecessary columns
y = df['target_class']  # Binary target variable

# Perform one-hot encoding for categorical features
X_encoded = pd.get_dummies(X, columns=['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
                                       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
                                       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                                       'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
                                       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
                                       'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
                                       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
                                       'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'])

# Drop rows with missing values
X_encoded.dropna(axis=0, inplace=True)
y = y[X_encoded.index]  # Adjust the target variable accordingly

# Split the dataset into training and testing sets (first 80% / last 20%, no shuffling)
split_ratio = 0.8
split_index = int(len(X_encoded) * split_ratio)

X_train = X_encoded.iloc[:split_index]
y_train = y.iloc[:split_index]
X_test = X_encoded.iloc[split_index:]
y_test = y.iloc[split_index:]

# Convert the data to numpy arrays (floats, so dummy columns mix cleanly
# with the numeric ones)
X_train_array = X_train.values.astype(float)
X_test_array = X_test.values.astype(float)
y_train_array = y_train.values
y_test_array = y_test.values

# Standardize the features with statistics from the training split only.
# Raw values such as lot areas are large enough to saturate the sigmoid.
feature_mean = X_train_array.mean(axis=0)
feature_std = X_train_array.std(axis=0)
feature_std[feature_std == 0] = 1.0  # constant columns: avoid dividing by zero
X_train_array = (X_train_array - feature_mean) / feature_std
X_test_array = (X_test_array - feature_mean) / feature_std

# weights_sgd() and predict() treat the LAST element of every row as the
# label slot. Passing the bare feature matrix made the model learn the
# last one-hot column instead of target_class, so append the label.
train_rows = np.column_stack((X_train_array, y_train_array))
# predict() ignores the last element; a zero placeholder keeps the row
# layout without exposing the true test labels.
test_rows = np.column_stack((X_test_array, np.zeros(len(X_test_array))))

# Train a Logistic Regression classifier using custom functions
learning_rate = 0.01
n_epochs = 100
weights = weights_sgd(train_rows, learning_rate, n_epochs)

# Predict whether house prices in the test set are greater than $180,000 or not
y_pred = logistic_regression(test_rows, weights)

# Calculate accuracy (fraction of test rows classified correctly)
accuracy = np.mean(np.asarray(y_pred) == y_test_array)
print("Accuracy:", accuracy)