# Linear Regression

### 1. Gather Data

In [None]:
import pandas as pd


# Relative paths to the two input CSV files
csv_file = '../../data/test_sample.csv'
csv_file_bow = '../../data/bow_features.csv'
# Read each CSV into its own DataFrame
df = pd.read_csv(csv_file)

df_bow= pd.read_csv(csv_file_bow)
# Bare expression as the last statement: in a notebook cell this displays df
df

### 2. Sample Training Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Step 1: toy training set, one entry per house (x: size, y: price)
x = np.array([1000, 1500, 2000, 2500, 3000])  # in square feet
y = np.array([200, 300, 400, 500, 600])       # in thousands $
m = x.shape[0]  # number of training examples

# Echo the raw data so the cell output shows what will be fitted
print("houses in square feet :",x)
print("prices in thousands :",y)

### 3. Build the design matrix X (n+1 features) and initialize theta

In [None]:
# Step 2: put a column of ones (x0 = 1) in front of the feature column so
# that theta_0 acts as the intercept; X then has n+1 columns
X = np.column_stack((np.ones(m), x))  # shape: (m, 2)
print("x0 , x1")
print(X)

# Step 3: start the search from the zero vector
theta = np.zeros(2)  # [theta_0, theta_1]
print("theta value : ",theta)

### 4. Define hypothesis function

In [10]:
def h_theta(X, theta):
    """Evaluate the linear hypothesis: the dot product of X and theta.

    Args:
        X: Design matrix (one row per example) or a single feature vector.
        theta: Parameter vector.

    Returns:
        The prediction(s) produced by ``np.dot(X, theta)``.
    """
    predictions = np.dot(X, theta)
    return predictions

### 5. Define cost function J(theta)

In [21]:
def compute_cost(X, y, theta):
    """Return the squared-error cost J(theta) = 1/(2m) * sum((h - y)^2).

    Args:
        X: Design matrix, one row per training example.
        y: Target values, one per row of X.
        theta: Parameter vector.

    Returns:
        The scalar cost of ``theta`` on the given data.
    """
    # Take the example count from the data passed in rather than from a
    # notebook-level global, so the cost is right for any dataset.
    n_examples = len(y)
    errors = h_theta(X, theta) - y
    return (1 / (2 * n_examples)) * np.dot(errors, errors)

### 6. Gradient Descent

In [22]:
def gradient_descent(X, y, theta, alpha, num_iters):
    """Fit theta by batch gradient descent on the squared-error cost.

    Args:
        X: Design matrix, one row per training example.
        y: Target values, one per row of X.
        theta: Initial parameter vector; it is copied, never modified.
        alpha: Learning rate (step size).
        num_iters: Number of update steps to perform.

    Returns:
        A tuple ``(theta, cost_history)``: the fitted parameters and a list
        holding the cost after each update.
    """
    # Work on a float copy. Updating in place would overwrite the caller's
    # initial theta (so re-running training would resume from the previous
    # result) and would raise a casting error for an integer-typed theta.
    theta = np.array(theta, dtype=float)
    # Use the example count of the data passed in, not a notebook global.
    n_examples = len(y)
    cost_history = []

    for _ in range(num_iters):
        error = h_theta(X, theta) - y
        gradient = (1 / n_examples) * np.dot(X.T, error)
        theta -= alpha * gradient

        cost_history.append(compute_cost(X, y, theta))

    return theta, cost_history

### 7. Run Gradient Descent and Plot the Cost History

In [None]:
# Hyperparameters for the training run
num_iters = 1000
alpha = 1e-7  # learning rate

theta_opt, cost_history = gradient_descent(
    X, y, theta, alpha=alpha, num_iters=num_iters
)

print("Optimized theta:", theta_opt)
print("Final cost:", cost_history[-1])

# One cost value was recorded per iteration; plot them in order
plt.plot(cost_history)
plt.title("Cost Function Convergence")
plt.xlabel("Iteration")
plt.ylabel("Cost J(θ)")
plt.grid(True)
plt.show()

### 8. Plotting the Linear Regression Fit

In [None]:
# Training points first, then the fitted line evaluated at those same sizes
plt.scatter(x, y, label='Training Data')
plt.plot(x, h_theta(X, theta_opt), label='Linear Regression', color='red')

# Axis decoration
plt.title("Linear Regression Fit")
plt.xlabel("House Size (sq ft)")
plt.ylabel("Price ($1000s)")
plt.grid(True)
plt.legend()
plt.show()
