In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset and prepare data


In [2]:
df = pd.read_csv('dataset/manufacturing.csv')
target = df.columns[-1]
features = df.columns[:-1]

X = df[features]
y = df[target]


# Helper/Computational Functions

In [3]:
def create_poly_features(X, degree=2):
    poly = PolynomialFeatures(degree=degree)
    return poly.fit_transform(X), poly.get_feature_names_out()

def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def calculate_metrics(X_poly, y, selected_features, method_name):
    model = LinearRegression()
    model.fit(X_poly[:, selected_features], y)
    y_pred = model.predict(X_poly[:, selected_features])
    
    r2 = r2_score(y, y_pred)
    adj_r2 = adjusted_r2(r2, len(y), len(selected_features))
    
    print(f"\n{method_name} Metrics:")
    print(f"Number of features selected: {len(selected_features)}")
    print(f"R²: {r2:.4f}")
    print(f"Adjusted R²: {adj_r2:.4f}")
    
    return r2, adj_r2

# Backward Elimination

In [5]:
def run_backward_elimination():
    print("\n=== Running Backward Elimination ===")
    X_poly, feature_names = create_poly_features(X)
    selected_features = list(range(len(feature_names)))
    
    while len(selected_features) > 1:
        best_r2 = -np.inf
        worst_feature = None
        
        for feature in selected_features:
            current_features = [f for f in selected_features if f != feature]
            model = LinearRegression()
            model.fit(X_poly[:, current_features], y)
            r2 = r2_score(y, model.predict(X_poly[:, current_features]))
            
            if r2 > best_r2:
                best_r2 = r2
                worst_feature = feature
        
        if best_r2 > r2_score(y, LinearRegression().fit(X_poly[:, selected_features], y).predict(X_poly[:, selected_features])):
            selected_features.remove(worst_feature)
        else:
            break
    
    return calculate_metrics(X_poly, y, selected_features, "Backward Elimination")

run_backward_elimination()


=== Running Backward Elimination ===

Backward Elimination Metrics:
Number of features selected: 18
R²: 0.8745
Adjusted R²: 0.8740


(0.8745362596743205, 0.8739627839694291)

# Forward Elimination

In [6]:
def run_forward_selection():
    print("\n=== Running Forward Selection ===")
    X_poly, feature_names = create_poly_features(X)
    n_features = X_poly.shape[1]
    selected_features = []
    
    while len(selected_features) < n_features:
        best_r2 = -np.inf
        best_feature = None
        
        for feature in range(n_features):
            if feature not in selected_features:
                current_features = selected_features + [feature]
                model = LinearRegression()
                model.fit(X_poly[:, current_features], y)
                r2 = r2_score(y, model.predict(X_poly[:, current_features]))
                
                if r2 > best_r2:
                    best_r2 = r2
                    best_feature = feature
        
        if best_feature is not None:
            if len(selected_features) > 0:
                current_r2 = r2_score(y, LinearRegression().fit(X_poly[:, selected_features], y).predict(X_poly[:, selected_features]))
                if best_r2 - current_r2 < 0.01:
                    break
            selected_features.append(best_feature)
        else:
            break
    
    return calculate_metrics(X_poly, y, selected_features, "Forward Selection")

run_forward_selection()


=== Running Forward Selection ===

Forward Selection Metrics:
Number of features selected: 3
R²: 0.8045
Adjusted R²: 0.8044


(0.8045066153463661, 0.8043582520390145)

# Bidirectional Elimination

In [7]:
def run_bidirectional_selection():
    print("\n=== Running Bidirectional Selection ===")
    X_poly, feature_names = create_poly_features(X)
    n_features = X_poly.shape[1]
    selected_features = []
    
    while len(selected_features) < n_features:
        # Forward step
        best_addition = None
        best_add_r2 = -np.inf
        
        for feature in range(n_features):
            if feature not in selected_features:
                current_features = selected_features + [feature]
                model = LinearRegression()
                model.fit(X_poly[:, current_features], y)
                r2 = r2_score(y, model.predict(X_poly[:, current_features]))
                
                if r2 > best_add_r2:
                    best_add_r2 = r2
                    best_addition = feature
        
        # Backward step
        if len(selected_features) > 1:
            worst_removal = None
            best_remove_r2 = -np.inf
            
            for feature in selected_features:
                current_features = [f for f in selected_features if f != feature]
                model = LinearRegression()
                model.fit(X_poly[:, current_features], y)
                r2 = r2_score(y, model.predict(X_poly[:, current_features]))
                
                if r2 > best_remove_r2:
                    best_remove_r2 = r2
                    worst_removal = feature
        
        # Update features
        if best_addition is not None:
            if len(selected_features) > 0:
                current_r2 = r2_score(y, LinearRegression().fit(X_poly[:, selected_features], y).predict(X_poly[:, selected_features]))
                if best_add_r2 - current_r2 < 0.01:
                    break
            selected_features.append(best_addition)
        else:
            break
    
    return calculate_metrics(X_poly, y, selected_features, "Bidirectional Selection")

run_bidirectional_selection()


=== Running Bidirectional Selection ===

Bidirectional Selection Metrics:
Number of features selected: 3
R²: 0.8045
Adjusted R²: 0.8044


(0.8045066153463661, 0.8043582520390145)

# Keep All Variables

In [8]:
def run_keep_all_variables():
    print("\n=== Running with All Variables ===")
    X_poly, feature_names = create_poly_features(X)
    selected_features = list(range(X_poly.shape[1]))
    
    return calculate_metrics(X_poly, y, selected_features, "Keep All Variables")

run_keep_all_variables()


=== Running with All Variables ===

Keep All Variables Metrics:
Number of features selected: 21
R²: 0.8568
Adjusted R²: 0.8561


(0.8568162644970339, 0.8560521327446673)