### LINEAR REGRESSION
In this assignment we try to model the 'Estimated Price' as a linear relation of the other elements

In [21]:
# Importing libraries here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### CLASSES IN PYTHON
Although this might look scary to implement, go about it one function at a time.  
Using classes help with keeping track of multiple models and makes your overall code much tidier.

In [22]:
class LinearRegression:
    def __init__(self) -> None:
       
        self.weights: np.ndarray | None = None
        self.bias: float | None = None
    


    ### TODO 1
    def predict(self, X: np.ndarray) -> np.ndarray:
        
        Y_pred = np.ndarray( np.dot(X,self.weights) + self.bias)
        return Y_pred



    ### TODO 2 
    def __loss(self, X: np.ndarray, y: np.ndarray, norm: int) -> tuple:
       
        predicted = np.dot(X, self.weights) + self.bias

        
        errors = np.abs(predicted - y) ** norm
        loss = (1 / X.shape[0]) * np.sum(errors)

        
        gradient_base = norm * np.sign(predicted - y) * (np.abs(predicted - y) ** (norm - 1))

        dw = (1 / X.shape[0]) * np.dot(X.T, gradient_base)
        db = (1 / X.shape[0]) * np.sum(gradient_base)

        return loss, dw, db

    


    
    def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 500, learning_rate: float = 0.01, norm: int = 2, threshold: float = 0.0001) -> None:
        

        self.weights = np.random.randn(X.shape[1])
        self.bias=0
        prev_loss=float('inf')

        for _ in range(epochs):
            y_pred=self.predict(X)
            loss = self.__loss(X, y_pred, norm)
            
            current_loss, dw, db = self.__loss(X, y, norm)
            self.weights -= learning_rate * dw
            self.bias -= learning_rate * db

            if abs(current_loss-prev_loss) <threshold:
                break
            prev_loss=current_loss
        return None

### Importing and Converting Data
Some features in a dataset are not of numerical type and are either categorical or boolean.  
To get past this, we convert the columns by using one-hot encoding.

In [23]:
# importing the data
df = pd.read_csv('linear_data.csv')

### TODO 4
df_onehot=pd.get_dummies(df)

X=df_onehot.drop(columns='Estimated Price')
y=df_onehot['Estimated Price']

X = X.to_numpy() 
y = y.to_numpy() 

### Test-train split
Overfitting is one of the biggest problems in machine learning. Overfitting occurs when the model is trained to be very accurate on the given dataset but performs very poorly on a different but similar dataset.
To check for overfitting, we split our dataset into test and train sets and check the accuracy/loss of the model.

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Z-Score Standardization
Since some features might have much higher values than the others, for weights of similar magnitude, the model will mainly focus only on features with large values.  
To overcome this, we standardize each feature using Z-Score Standardization so that all features are treated equally

In [25]:
# z-score standardization
### TODO 5
def z_score(X: np.ndarray) -> tuple:
    '''
    The Z-Score scales data such that its mean is 0 and standard deviation is 1
    z-score for a value x in the dataset is (x - mean) / std_dev
    (z-score normalization is done over a feature and NOT an entry)
    Return the z-score value of all the elements in the set along with the mean and standard deviation of the original set
    '''

    x_mean = np.mean(X,axis=0)

    x_std = np.std(X,mean=0)

    x = (X-x_mean)/x_std
    return x, x_mean, x_std

In [28]:
# Normalizing the data
x_train, x_mean, x_std = z_score(X_train)
x_test = (X_test - x_mean) / x_std

In [27]:
model = LinearRegression()
model.fit(x_train, y_train, epochs=, learning_rate=, norm=, threshold=)
y_pred = model.predict(x_test)
print("MSE loss: ", np.mean((y_pred - y_test) ** 2))

indices = np.arange(len(y_test))
plt.figure(figsize=(10, 6))
plt.plot(indices, y_test, label='True Values', color='blue', marker='o')
plt.plot(indices, y_pred, label='Predicted Values', color='red', marker='x')

plt.xlabel('Data Points')
plt.ylabel('Values')
plt.title('True vs Predicted Values')

plt.legend()
plt.grid(True)
plt.show()

SyntaxError: invalid syntax (3073035637.py, line 2)