In [2]:
import pandas as pd
from sklearn.datasets import make_regression

#  synthetic regression dataset
X, y = make_regression(n_samples=500, n_features=13, noise=10.0, random_state=42)

#  DataFrame with 13 features like boston housing
feature_names = [f'Feature_{i+1}' for i in range(13)]
df = pd.DataFrame(X, columns=feature_names)

# Add the target column (house prices)
df['PRICE'] = y

# Show the first few rows
print(df.head())


   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0   0.446982  -2.153731  -0.638586   1.223856   0.239858  -0.592241   
1  -0.230401  -0.758495  -0.924233   0.260281  -1.846188  -0.957151   
2   0.281009  -1.315816  -2.132596   0.307613   3.243093   1.352203   
3  -0.389924  -1.751829   0.158053   0.313184  -0.945746   0.455904   
4  -1.571152   1.006730   1.081514  -0.350778  -0.070220   0.715493   

   Feature_7  Feature_8  Feature_9  Feature_10  Feature_11  Feature_12  \
0   0.486310   1.653310   0.337956    0.000756    0.623087    0.975713   
1  -0.929511   0.890198  -0.963759   -0.048652    1.035249    0.343788   
2   2.307916   1.012637  -0.524567    0.039447   -0.158154    0.424067   
3   0.608246  -0.096624  -0.606503   -0.694600   -0.415967   -0.459090   
4  -2.522278   0.037542  -0.227720   -1.520287    1.091805   -1.399078   

   Feature_13       PRICE  
0    0.685858  107.145884  
1    0.032797 -105.290296  
2   -1.435910 -107.635261  
3   -1.154363 -4

In [3]:
from sklearn.preprocessing import StandardScaler


X = df.drop('PRICE', axis=1)
y = df['PRICE']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert back to DataFrame for readability
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

print(X_scaled.head())


   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0   0.442752  -2.066341  -0.525118   1.206881   0.274816  -0.598714   
1  -0.245502  -0.700016  -0.800215   0.220096  -1.765479  -0.963312   
2   0.274116  -1.245788  -1.963945   0.268568   3.212185   1.344068   
3  -0.407586  -1.672767   0.242097   0.274274  -0.884786   0.448535   
4  -1.607773   1.028633   1.131448  -0.405682  -0.028462   0.707902   

   Feature_7  Feature_8  Feature_9  Feature_10  Feature_11  Feature_12  \
0   0.550403   1.613891   0.309583   -0.061330    0.593682    1.046821   
1  -0.902937   0.875005  -1.067124   -0.110160    1.008923    0.381229   
2   2.420281   0.993557  -0.602630   -0.023092   -0.193395    0.465785   
3   0.675570  -0.080491  -0.689286   -0.748545   -0.453134   -0.464424   
4  -2.537912   0.049416  -0.288681   -1.564565    1.065902   -1.454492   

   Feature_13  
0    0.690171  
1    0.039674  
2   -1.423265  
3   -1.142824  
4    0.456198  


In [4]:
import numpy as np

# Convert DataFrames to numpy arrays
X = X_scaled.values  # Features (500, 13)
y = y.values.reshape(-1, 1)  # Target (500, 1)

# Add a bias term (X0 = 1) for the intercept
X = np.c_[np.ones((X.shape[0], 1)), X]  # Shape (500, 14)

# Initialize weights randomly
theta = np.random.randn(X.shape[1], 1)  # (14, 1)
learning_rate = 0.01
epochs = 1000

# Gradient Descent
for i in range(epochs):
    # Predictions: X * theta
    y_pred = np.dot(X, theta)
    
    # Compute error
    error = y_pred - y
    
    # Compute gradient
    gradients = (2 / X.shape[0]) * np.dot(X.T, error)
    
    # Update weights
    theta -= learning_rate * gradients
    
    # Loss every 100 epochs
    if i % 100 == 0:
        mse = np.mean(error ** 2)
        print(f'Epoch {i}: MSE = {mse:.4f}')

# Final weights
print("\nFinal Weights (Theta):\n", theta)


Epoch 0: MSE = 41473.8717
Epoch 100: MSE = 1204.4911
Epoch 200: MSE = 146.4870
Epoch 300: MSE = 110.7564
Epoch 400: MSE = 109.3804
Epoch 500: MSE = 109.3241
Epoch 600: MSE = 109.3217
Epoch 700: MSE = 109.3216
Epoch 800: MSE = 109.3216
Epoch 900: MSE = 109.3216

Final Weights (Theta):
 [[ 4.60894130e+00]
 [ 9.57796055e+01]
 [ 9.50214375e+01]
 [ 1.20075366e+00]
 [ 3.28556056e+01]
 [ 5.62937713e-02]
 [ 2.21277684e+01]
 [-4.04290447e-01]
 [ 6.40174881e+01]
 [ 6.35028673e+01]
 [ 9.29062109e+01]
 [-3.69751607e-01]
 [ 5.61370946e+01]
 [ 7.99015708e+01]]


In [5]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions
y_pred = np.dot(X, theta)

# RMSE
rmse = np.sqrt(mean_squared_error(y, y_pred))
# R² Score
r2 = r2_score(y, y_pred)

print(f"📊 RMSE: {rmse:.4f}")
print(f"📈 R² Score: {r2:.4f}")


📊 RMSE: 10.4557
📈 R² Score: 0.9974


In [10]:
import numpy as np

class SimpleRandomForest:
    def __init__(self, n_trees=10, max_samples=0.8):
        self.n_trees = n_trees
        self.max_samples = max_samples  # Fraction of data for bootstrap
        self.trees = []

    def bootstrap_sample(self, X, y):
        # Randomly sample a fraction of the data with replacement
        n_samples = int(X.shape[0] * self.max_samples)
        indices = np.random.choice(X.shape[0], n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            # Create bootstrap sample
            X_sample, y_sample = self.bootstrap_sample(X, y)
            # Create a simple decision stump (one-level tree)
            tree = self.decision_stump(X_sample, y_sample)
            self.trees.append(tree)

    def decision_stump(self, X, y):
        # Find the best feature and threshold to split on
        best_feature, best_threshold = None, None
        best_mse = float('inf')
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_mask = X[:, feature] < threshold
                right_mask = ~left_mask

                # Split the data
                if sum(left_mask) == 0 or sum(right_mask) == 0:
                    continue  # Skip empty splits

                # Calculate MSE for left and right splits
                mse_left = np.mean((y[left_mask] - np.mean(y[left_mask])) ** 2)
                mse_right = np.mean((y[right_mask] - np.mean(y[right_mask])) ** 2)
                mse = (mse_left * sum(left_mask) + mse_right * sum(right_mask)) / len(y)

                # Keep track of the best split
                if mse < best_mse:
                    best_feature, best_threshold, best_mse = feature, threshold, mse

        return (best_feature, best_threshold)

    def predict_single(self, x, tree):
        # Predict based on a single decision stump
        feature, threshold = tree
        return 1 if x[feature] < threshold else 0

    def predict(self, X):
        # Collect predictions from all trees
        predictions = []
        for i in range(X.shape[0]):
            tree_preds = [self.predict_single(X[i], tree) for tree in self.trees]
            # Average of predictions (majority voting)
            predictions.append(np.mean(tree_preds))
        return np.array(predictions)

# Initialize and train the model
rf_model = SimpleRandomForest(n_trees=10)
rf_model.fit(X, y)

# Make predictions
y_pred_rf = rf_model.predict(X)

print("✅ Random Forest (Minimal) model trained successfully!")


✅ Random Forest (Minimal) model trained successfully!


In [11]:
from sklearn.metrics import mean_squared_error, r2_score

# Reshape predictions for compatibility if needed
y_pred_rf = y_pred_rf.flatten()

# RMSE
rmse_rf = np.sqrt(mean_squared_error(y, y_pred_rf))

# R² Score
r2_rf = r2_score(y, y_pred_rf)

print(f"📊 RMSE (Random Forest): {rmse_rf:.4f}")
print(f"📈 R² Score (Random Forest): {r2_rf:.4f}")


📊 RMSE (Random Forest): 206.1930
📈 R² Score (Random Forest): -0.0019


In [24]:
import numpy as np

class SimpleRandomForest:
    def __init__(self, n_estimators=10, max_features=5):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.models = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        for _ in range(self.n_estimators):
            # (Random Rows)
            idx = np.random.choice(n_samples, n_samples, replace=True)
            # Random Feature Selection
            features = np.random.choice(n_features, self.max_features, replace=False)
            
            # Fit a simple linear model on selected features
            X_sample = X[idx][:, features]
            y_sample = y[idx]
            coef = np.linalg.pinv(X_sample) @ y_sample  # Linear regression
            
            # Store model and features
            self.models.append((features, coef))
    
    def predict(self, X):
        preds = []
        for features, coef in self.models:
            # Get predictions for each tree
            preds.append(X[:, features] @ coef)  # Fixed this line
        return np.mean(preds, axis=0)  # Average predictions

# Initialize and train the model
rf_model = SimpleRandomForest(n_estimators=10, max_features=5)
rf_model.fit(X, y)

# Make predictions
y_pred_rf = rf_model.predict(X)

# Evaluate
from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y, y_pred_rf))
r2 = r2_score(y, y_pred_rf)

print(f"🌳 Random Forest - RMSE: {rmse:.4f}")
print(f"🌳 Random Forest - R² Score: {r2:.4f}")


🌳 Random Forest - RMSE: 139.4406
🌳 Random Forest - R² Score: 0.5418


In [19]:
# Assuming 'theta' is the final weights from Linear Regression
y_pred_linear = X.dot(theta)

# Evaluate Linear Regression
from sklearn.metrics import mean_squared_error, r2_score
rmse_linear = np.sqrt(mean_squared_error(y, y_pred_linear))
r2_linear = r2_score(y, y_pred_linear)

print(f"📈 Linear Regression - RMSE: {rmse_linear:.4f}")
print(f"📈 Linear Regression - R² Score: {r2_linear:.4f}")



📈 Linear Regression - RMSE: 10.4557
📈 Linear Regression - R² Score: 0.9974


In [20]:
print("📊 Model Performance Comparison:")
print(f"Linear Regression - RMSE: {rmse_linear:.4f}, R²: {r2_linear:.4f}")
print(f"Random Forest - RMSE: {rmse:.4f}, R²: {r2:.4f}")


📊 Model Performance Comparison:
Linear Regression - RMSE: 10.4557, R²: 0.9974
Random Forest - RMSE: 140.3760, R²: 0.5356
