In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from itertools import combinations
import pandas as pd

In [2]:
# Location of the source workbook.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location) —
# confirm, and consider making it configurable so the notebook runs elsewhere.
DATA_PATH = '/content/OriginalData.xlsx'
original_data = pd.read_excel(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the loaded columns and values.
original_data.head(n=5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100%_Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100%_Natural_Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran_with_Extra_Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond_Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [4]:
# DataFrame.shape is a (row count, column count) pair.
rows = original_data.shape[0]
columns = original_data.shape[1]
print(f"Number of rows: {rows}", f"Number of columns: {columns}", sep="\n")

Number of rows: 77
Number of columns: 16


In [8]:
# Numeric columns offered to the regression as candidate predictors.
# NOTE(review): the head() preview shows potass == -1 for one row, which looks
# like a missing-value sentinel — confirm whether such rows need cleaning
# before they are used for fitting.
feature_columns = [
    'calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo',
    'sugars', 'potass', 'vitamins', 'weight', 'cups',
]
X_new = original_data[feature_columns]

# Regression target.
y_new = original_data['rating']

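Feature Selection using adjusted $R^2$ (exhaustive best-subset search over every feature combination, rather than stepwise forward selection)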

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=1,
)

# adjusted R-squared calc.
def adjusted_r2(r2, n, p):
    """Return the adjusted R-squared of a model with ``p`` predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)``, which penalises the
    plain R-squared for the number of predictors used.

    Parameters
    ----------
    r2 : float
        Plain R-squared of the model.
    n : int
        Number of observations the R-squared was computed on.
    p : int
        Number of predictors in the model.

    Returns
    -------
    float
        The adjusted R-squared. When ``n - p - 1 <= 0`` the statistic is
        undefined (no residual degrees of freedom), so ``nan`` is returned
        instead of raising ``ZeroDivisionError`` or producing a meaningless
        number that can exceed 1. NaN never compares greater than another
        number, so an undefined score cannot win a "highest score" comparison.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        # Too few observations for this many predictors: undefined.
        return float('nan')
    return 1 - (1 - r2) * ((n - 1) / residual_dof)

# Score of every subset tried, keyed by the tuple of column names.
feature_adj_r2 = {}

# Best subset seen so far and its score; -inf guarantees the first subset wins.
best_features_adj_r2 = None
highest_adj_r2 = float('-inf')

# Exhaustive (best-subset) search: every non-empty combination of the training
# columns is fitted and scored, i.e. 2**k - 1 fits for k columns. This is not
# stepwise forward selection.
# NOTE(review): subsets are ranked by their score on the held-out split, so the
# winning score is an optimistic estimate — confirm this is intended.
n_test = len(y_test)
for i in range(1, len(X_train.columns) + 1):
    for combo in combinations(X_train.columns, i):
        # Restrict both splits to the candidate columns.
        selected = list(combo)
        X_train_subset = X_train[selected]
        X_test_subset = X_test[selected]

        # Fit ordinary least squares on the training rows (fit returns the estimator).
        model = LinearRegression().fit(X_train_subset, y_train)

        # R-squared on the held-out rows, then adjusted for the subset size.
        r2 = model.score(X_test_subset, y_test)
        adj_r2 = adjusted_r2(r2, n_test, len(combo))
        feature_adj_r2[combo] = adj_r2

        # Strictly-greater comparison: on ties the earliest subset is kept.
        if adj_r2 > highest_adj_r2:
            highest_adj_r2, best_features_adj_r2 = adj_r2, combo

print("Best Feature Subset:", best_features_adj_r2)
print("Highest Adjusted R-squared:", highest_adj_r2)


Best Feature Subset: ('calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins')
Highest Adjusted R-squared: 0.9999999999999987
