# Regression Methods Final Project

In [69]:
import csv
import numpy as np
from math import sqrt
from numpy import linalg as LA
from sklearn.impute import SimpleImputer, KNNImputer
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [70]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Linear Regression:

In [71]:
class LinearRegression1:
    """Ordinary least-squares linear regression with an intercept.

    After ``fit``:
        weights -- 1-D array with one coefficient per feature
        bias    -- the intercept
    Both are ``None`` before ``fit`` is called.
    """

    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Estimate the intercept and coefficients from training data.

        Parameters:
            X -- 2-D array-like of shape (n_samples, n_features); values must be
                 convertible to float64
            y -- 1-D array-like of n_samples targets, convertible to float64
        """
        X = np.asarray(X, dtype='float64')
        y = np.asarray(y, dtype='float64')
        # Prepend a column of ones so the intercept is estimated with the weights.
        design = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        # lstsq solves the least-squares problem without forming and inverting
        # X^T X, so collinear (rank-deficient) features yield the minimum-norm
        # solution instead of a LinAlgError or numerically meaningless weights.
        coefficients = np.linalg.lstsq(design, y, rcond=None)[0]
        self.bias = coefficients[0]
        self.weights = coefficients[1:]

    def predict(self, X):
        """Return the predicted targets for X as a float64 array.

        X is cast to float64 first, so object-dtype inputs do not produce an
        object-dtype result.
        """
        X = np.asarray(X, dtype='float64')
        return np.dot(X, self.weights) + self.bias


Logistic Regression:

In [72]:
class My_LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters:
        learning_rate -- step size of each gradient update
        n_iterations  -- number of gradient steps performed by ``fit``

    ``weights`` and ``bias`` are ``None`` until ``fit`` is called.
    """

    def __init__(self, learning_rate=0.25, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights, self.bias = None, None

    @staticmethod
    def _sigmoid(x):
        """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

        exp(-|x|) is always in (0, 1], so neither branch can overflow:
        for x >= 0 the value is 1 / (1 + exp(-x)), for x < 0 it is the
        algebraically equal exp(x) / (1 + exp(x)).
        """
        x = np.asarray(x, dtype=np.float64)
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    @staticmethod
    def _binary_cross_entropy(y, y_hat):
        """Mean binary cross-entropy between labels y and probabilities y_hat.

        Probabilities are clipped away from exactly 0 and 1 so that log() stays
        finite; a confidently wrong prediction therefore gets a large loss
        rather than being counted as zero.
        """
        y = np.asarray(y, dtype=np.float64)
        eps = np.finfo(np.float64).eps
        y_hat = np.clip(np.asarray(y_hat, dtype=np.float64), eps, 1 - eps)
        return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

    def fit(self, X, y):
        """Learn weights and bias by gradient descent, starting from zeros.

        Parameters:
            X -- 2-D array-like (n_samples, n_features), convertible to float64
            y -- 1-D array-like of n_samples labels (0 or 1), convertible to float64
        """
        X = np.asarray(X, dtype=np.float64)
        # Cast y too: an object-dtype y would make the gradients object arrays,
        # which cannot be subtracted in place from the float weights.
        y = np.asarray(y, dtype=np.float64)
        n_samples = X.shape[0]
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        for _ in range(self.n_iterations):
            probability = self._sigmoid(np.dot(X, self.weights) + self.bias)
            error = probability - y
            # Gradients of the mean cross-entropy w.r.t. weights and bias
            partial_w = (1 / n_samples) * np.dot(X.T, error)
            partial_d = (1 / n_samples) * np.sum(error)

            self.weights -= self.learning_rate * partial_w
            self.bias -= self.learning_rate * partial_d

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X."""
        X = np.asarray(X, dtype=np.float64)
        return self._sigmoid(np.dot(X, self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
        """Return a list of 0/1 labels: 1 where the probability exceeds threshold."""
        probabilities = self.predict_proba(X)
        return (probabilities > threshold).astype(int).tolist()


PCA:

In [73]:
class MY_PCA:
    """Principal component analysis on standardized data.

    Parameters:
        k -- number of leading components to keep

    After ``fit``:
        normalized_matrix -- the standardized input
        cov               -- covariance matrix of the standardized input
        values            -- the k largest eigenvalues, in descending order
        vectors           -- the matching eigenvectors, one per row
    """

    def __init__(self, k):
        self.k = k
        self.normalized_matrix = None
        self.cov = None
        self.values = None
        self.vectors = None

    def standartization(self, data):
        """Centre each column and scale it by its sample standard deviation."""
        data = np.asarray(data, dtype=float)
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0, ddof=1)
        # A constant column has std 0; dividing by 1 instead maps it to all
        # zeros rather than filling the matrix with NaN.
        std = np.where(std == 0, 1.0, std)
        self.normalized_matrix = (data - mean) / std

    def cov_var_matrix(self):
        """Compute the covariance matrix of the standardized columns."""
        cov_mat = np.cov(self.normalized_matrix, rowvar=False, bias=False)
        # np.cov returns a 0-d array for a single feature; keep it 2-D.
        self.cov = np.atleast_2d(cov_mat)

    def eigen(self):
        """Store the k largest eigenvalues and their eigenvectors.

        The covariance matrix is symmetric, so eigh is used: it returns real
        eigenvalues and orthonormal eigenvectors, whereas the general eig
        solver may return complex values. Each eigenvector is only defined up
        to sign; the sign is fixed here so that its largest-magnitude entry is
        positive, which makes the output deterministic.
        """
        w, v = LA.eigh(self.cov)
        order = np.argsort(w)[::-1]
        selected_w = w[order][:self.k]
        selected_V = v[:, order].T[:self.k]

        pivots = np.argmax(np.abs(selected_V), axis=1)
        signs = np.sign(selected_V[np.arange(selected_V.shape[0]), pivots])
        signs[signs == 0] = 1.0
        selected_V = selected_V * signs[:, np.newaxis]

        self.values = selected_w
        self.vectors = selected_V

    def fit(self, data):
        """Standardize data, then compute its covariance and leading eigenpairs."""
        self.standartization(data)
        self.cov_var_matrix()
        self.eigen()

    def transformation(self):
        """Project the standardized data passed to ``fit`` onto the kept components."""
        new_X = np.matmul(self.normalized_matrix, self.vectors.T)
        return new_X



Open the data:

In [74]:
def open_data(path):
    """Read a UTF-8 CSV file into a 2-D NumPy array of strings.

    Parameters:
        path -- location of the CSV file

    Returns:
        An array holding every row of the file, header row included.
    """
    # newline="" lets the csv module do its own line-ending handling, so line
    # breaks embedded in quoted fields are preserved as written.
    with open(path, "r", encoding="utf-8", newline="") as file:
        return np.array(list(csv.reader(file)))


Data Pre-Processing:

In [75]:
# Year against which host_since dates are measured ("years as a host").
_REFERENCE_YEAR = 2023
# Number of neighbours used when KNN-imputing the training set.
_KNN_NEIGHBORS = 70
# Ordinal encoding of host_response_time: larger means faster.
_RESPONSE_TIME_RANK = {
    'within an hour': 4,  # BETTER
    'within a few hours': 3,
    'within a day': 2,
    'a few days or more': 1,  # WORST
}
# Raw-file column positions that are parsed as plain floats.
_FLOAT_COLUMNS = (7, 8, 12, 13, 16, 18, 19,
                  *range(21, 29), *range(30, 37), *range(39, 46), *range(48, 53))
# Raw-file column positions holding "t"/"f" flags.
_FLAG_COLUMNS = (6, 10, 11, 29, 47)
# Columns removed before modelling.
_DROPPED_COLUMNS = frozenset([
    'id', 'host_id', 'property_type', 'room_type',
    'host_verifications', 'bathrooms_text', 'amenities',
    'license', 'last_review',
    'first_review', "host_identity_verified",
    "has_availability", "host_has_profile_pic",
    'minimum_minimum_nights', 'maximum_maximum_nights',
    'minimum_maximum_nights', 'maximum_minimum_nights',
    'minimum_nights', 'availability_60',
    'availability_365',
])
# Review-score columns that are averaged into the single 'review' feature.
_REVIEW_COLUMNS = frozenset([
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
])


def _flag_to_int(column, false_token="f"):
    """Encode a flag column: false_token becomes 0, every other value becomes 1."""
    return np.where(np.asarray(column) == false_token, 0, 1)


def _years_hosting(column):
    """Convert date strings ending in '/<year>' to years before _REFERENCE_YEAR.

    Empty entries become NaN instead of raising.
    """
    ages = [
        np.nan if date == '' else _REFERENCE_YEAR - int(date.split('/')[-1])
        for date in column
    ]
    return np.array(ages, dtype=object)


def _percent_to_fraction(column):
    """Convert 'NN%' strings to fractions; '' and 'N/A' take the column mean."""
    fractions = np.array(
        [np.nan if value in ('', 'N/A') else float(value.rstrip('%')) / 100
         for value in column],
        dtype=float,
    )
    return np.where(np.isnan(fractions), np.nanmean(fractions), fractions)


def _to_float(column):
    """Parse a column of numeric strings as floats; empty strings become NaN."""
    return np.array([np.nan if value == '' else float(value) for value in column],
                    dtype=float)


def _drop_columns(table, names):
    """Return table without the columns whose header (row 0) is in names."""
    drop_indices = [i for i, title in enumerate(table[0]) if title in names]
    return np.delete(table, drop_indices, axis=1)


def clean_data(data, fill_nulls=None, test=0):
    """Turn the raw listings table into a numeric feature table.

    The input array is not modified.

    Parameters:
        data       -- 2-D array of strings whose first row is the header.
                      Columns are addressed by position, so the layout must
                      match the project's CSV files; a column at index 53, if
                      present, is treated as the 'expensive' label.
        fill_nulls -- per-column fill values, indexed by column position after
                      the non-feature columns are dropped; used when test == 1
        test       -- 0: training data, missing values are KNN-imputed;
                      1: test data, missing values are taken from fill_nulls

    Returns:
        (new_data, column_means):
        new_data     -- object array with a header row; the review-score
                        columns are replaced by their row mean in a 'review'
                        column, and 'expensive' (if present) is the last column
        column_means -- for test == 0 the per-column means of the imputed
                        training data (computed before the 'review' column is
                        added); for test == 1 the fill_nulls argument

    Raises:
        ValueError -- if test is neither 0 nor 1, or if test == 1, values are
                      missing and fill_nulls was not supplied
    """
    if test not in (0, 1):
        raise ValueError("test must be 0 (training data) or 1 (test data)")

    raw = np.asarray(data)
    raw_rows = raw[1:]
    # Every conversion reads from `raw` and writes into this copy, so the
    # caller's array is left untouched.
    new_data = raw.copy().astype(object)

    if raw.shape[1] > 53:  # only train has 53- "expensive"
        # "0" -> 0, anything else (including a missing label) -> 1
        new_data[1:, 53] = _flag_to_int(raw_rows[:, 53], false_token="0")

    # host_since 2
    new_data[1:, 2] = _years_hosting(raw_rows[:, 2])

    # 3 host_response_time; unknown or missing categories become NaN
    new_data[1:, 3] = np.array(
        [_RESPONSE_TIME_RANK.get(value, np.nan) for value in new_data[1:, 3]])

    # host_response_rate 4, host_acceptance_rate 5
    for index in (4, 5):
        new_data[1:, index] = _percent_to_fraction(raw_rows[:, index])

    # "f" -> 0, anything else (including a missing value) -> 1
    for index in _FLAG_COLUMNS:
        new_data[1:, index] = _flag_to_int(raw_rows[:, index])

    for index in _FLOAT_COLUMNS:
        new_data[1:, index] = _to_float(raw_rows[:, index])

    new_data = _drop_columns(new_data, _DROPPED_COLUMNS)

    if test == 0:
        # filling nulls in train
        imputer = KNNImputer(n_neighbors=_KNN_NEIGHBORS)
        new_data[1:] = imputer.fit_transform(new_data[1:])
        column_means = new_data[1:].mean(axis=0)
    else:
        # filling nulls in test by train values
        rows = new_data[1:]  # view: writes below land in new_data
        nan_rows, nan_cols = np.nonzero(np.isnan(rows.astype(float)))
        if nan_rows.size and fill_nulls is None:
            raise ValueError("fill_nulls is required to fill missing test values")
        for row, col in zip(nan_rows, nan_cols):
            rows[row, col] = fill_nulls[col]
        column_means = fill_nulls

    # new column review - aggregation of the review columns
    column_names = new_data[0]
    review_mask = np.array([name in _REVIEW_COLUMNS for name in column_names],
                           dtype=bool)
    review_column = np.mean(new_data[1:, review_mask].astype(float), axis=1)
    all_data_with_review = np.column_stack((new_data[1:], review_column))
    titles = np.concatenate((column_names, ['review']))
    new_data = np.vstack((titles, all_data_with_review))
    new_data = _drop_columns(new_data, _REVIEW_COLUMNS)

    if 'expensive' in new_data[0]:
        expensive_column_index = np.where(new_data[0] == 'expensive')[0][0]
        # Swap with the last column so the label ends up in the final position
        new_data[:, [expensive_column_index, -1]] = new_data[:, [-1, expensive_column_index]]

    return new_data, column_means

Normalization:

Min-Max on train data

In [76]:
def minmax_on_train(data):
    """Min-max scale every column of a table whose first row is a header.

    The input array is not modified.

    Parameters:
        data -- 2-D array; row 0 holds the column names, the remaining rows
                hold values convertible to float

    Returns:
        (data_after_min_max, min_max_values):
        data_after_min_max -- copy of data with each value column scaled to
                              [0, 1]; a constant column becomes all zeros
        min_max_values     -- list of (min, max) tuples, one per column
    """
    values = np.asarray(data[1:], dtype=float)
    min_values = values.min(axis=0)
    max_values = values.max(axis=0)
    span = max_values - min_values
    # A constant column has zero span; divide by 1 so it scales to 0 instead
    # of raising or producing NaN.
    safe_span = np.where(span == 0, 1.0, span)

    data_after_min_max = np.array(data, copy=True)
    data_after_min_max[1:] = (values - min_values) / safe_span
    min_max_values = list(zip(min_values.tolist(), max_values.tolist()))
    return data_after_min_max, min_max_values



Min-Max on test data:

The function normalizes the test data using the min-max values computed on the training data.

The second argument is a list of (min, max) tuples, one for each column of the training data.

In [77]:

def min_max_on_test(data, min_max_by_train):
    """Scale a test table with the (min, max) pairs learned on the training data.

    The input array is not modified.

    Parameters:
        data             -- 2-D array; row 0 holds the column names, the
                            remaining rows hold values convertible to float
        min_max_by_train -- sequence of (min, max) pairs, one per column, in
                            column order; entries beyond the number of columns
                            in data are ignored

    Returns:
        A copy of data with each value scaled as (value - min) / (max - min).
        Where the training min and max are equal the value is only shifted by
        min, which avoids a division by zero.
    """
    values = np.asarray(data[1:], dtype=float)
    n_columns = values.shape[1]
    bounds = np.asarray(min_max_by_train[:n_columns], dtype=float).reshape(-1, 2)
    min_values, max_values = bounds[:, 0], bounds[:, 1]
    span = max_values - min_values
    safe_span = np.where(span == 0, 1.0, span)

    test_min_max = np.array(data, copy=True)
    test_min_max[1:] = (values - min_values) / safe_span
    return test_min_max


Fitting the model on Training set and predicting on Testing set

In [78]:
# Directory that holds the project CSV files; change this one constant to
# read the data from somewhere else.
DATA_DIR = "/content/sample_data"
data_train = open_data(f"{DATA_DIR}/train.csv")
data_test = open_data(f"{DATA_DIR}/test.csv")

In [79]:
# Clean the training data once: the same call returns both the cleaned table
# and the per-column training means used to fill missing test values.
clean_train, columns_means_train = clean_data(data_train)
clean_test = clean_data(data_test, columns_means_train, 1)[0]
X_train = clean_train[:, :-1]  # Input features (excluding the last column)
y_train = clean_train[:, -1]  # Target variable (last column)
X_test = clean_test  # the test is only the features


In [80]:
# Scale the training features, then apply the training (min, max) pairs to the test set.
min_max_train = minmax_on_train(X_train)
min_max_train_data, min_max_train_values = min_max_train  # scaled table, per-column (min, max)
min_max_test = min_max_on_test(X_test, min_max_train_values)

In [81]:
# Fit the custom logistic regression on the scaled training rows (header row
# skipped), then predict classes and probabilities for the scaled test rows.
model_rl = My_LogisticRegression()
y_train1 = y_train[1:].astype(np.float64)  # class labels
model_rl.fit(min_max_train_data[1:], y_train1)
y_pred_model_rl = model_rl.predict(min_max_test[1:])  # predicted class
y_prob = model_rl.predict_proba(min_max_test[1:])  # predicted probability
print("my predicted probabilities:", y_prob, sep="\n")

# Save the predicted probabilities as a single 'prediction' column.
y_pred = pd.Series(y_prob, name='prediction')
y_pred.to_csv('prediction.csv', index=False)


my predicted probabilities:
[0.77816021 0.81601169 0.89502025 ... 0.9621787  0.86932429 0.86435212]


Comparison to sklearn models:


In [82]:
# Project the scaled training features onto 5 components with both implementations.
# NOTE(review): MY_PCA.fit standardizes every column before projecting, while
# the same unstandardized `d` is handed to sklearn's PCA here, so the two
# printed matrices are not expected to match - confirm whether the sklearn
# input was meant to be standardized first.
d = min_max_train_data[1:].astype(float)

pca = MY_PCA(5)
pca.fit(d)
data_after_pca = pca.transformation()
print(data_after_pca)

print("------------------")

sk_pca = PCA(5, random_state=2023)
data_after_pca_sk = sk_pca.fit_transform(d)
print(data_after_pca_sk)

[[-1.32070006  1.05589145  0.95841549 -1.15955979  1.95113887]
 [-1.72217631 -0.31214451  2.04296912 -0.0661073   0.1101341 ]
 [-2.80703065 -1.18509898  2.72961886 -0.39761819  1.12719859]
 ...
 [ 2.64115865 -4.7352117  -0.95374057  0.15569734  1.19182884]
 [ 3.73319598 -4.06974013 -0.36720657 -0.4785281   0.71064052]
 [ 1.6142069   0.02003486  0.91944912  2.16315359 -1.21317153]]
------------------
[[ 0.87173865 -0.16813202 -0.29813838  0.02426754 -0.1342138 ]
 [-0.4366185  -0.34751394  0.28557803 -0.30819223 -0.14345212]
 [ 0.4802321  -0.32342062  0.46861815 -0.54151679 -0.06952668]
 ...
 [-0.39457014  0.57414193  0.36848856  0.33054134  0.69087735]
 [-0.39560035  0.54811973  0.39279652  0.33194863  0.74979791]
 [-0.4815363  -0.09370243  0.95385107  0.50385257  0.14254565]]


In [83]:
# Fit the custom and the sklearn linear regression on the same scaled training
# rows and print their predictions for the scaled test rows.
d = min_max_train_data[1:]

lr = LinearRegression1()
lr.fit(d, y_train1)
prediction = lr.predict(min_max_test[1:])
print(prediction)

print("------------------")

lr_sk = LinearRegression()
lr_sk.fit(d, y_train1)
prediction = lr_sk.predict(min_max_test[1:])
print(prediction)

[0.7772387094757583 0.9364974440450813 0.9615421270580387 ...
 1.1630503623231918 0.9319726009077961 0.9266228554437942]
------------------
[0.77723871 0.93649744 0.96154213 ... 1.16305036 0.9319726  0.92662286]


In [84]:
# Fit the custom and the sklearn logistic regression on the same scaled
# training rows and print their predicted probabilities for the test rows.
d = min_max_train_data[1:]

lr = My_LogisticRegression()
lr.fit(d, y_train1)
prediction = lr.predict(min_max_test[1:])
probs = lr.predict_proba(min_max_test[1:])
print(probs)

print("------------------")

lr_sk = LogisticRegression(max_iter=1000, random_state=2023)
lr_sk.fit(d, y_train1)
prediction = lr_sk.predict(min_max_test[1:])
probs = lr_sk.predict_proba(min_max_test[1:])
print(probs)

[0.77816021 0.81601169 0.89502025 ... 0.9621787  0.86932429 0.86435212]
------------------
[[0.16074484 0.83925516]
 [0.04021555 0.95978445]
 [0.0834169  0.9165831 ]
 ...
 [0.00731717 0.99268283]
 [0.10291227 0.89708773]
 [0.10710063 0.89289937]]


In [85]:
# Final predictions: the probabilities returned by model_rl.predict_proba for the scaled test rows.
print(y_prob)

[0.77816021 0.81601169 0.89502025 ... 0.9621787  0.86932429 0.86435212]
