In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import pickle


In [2]:
# Function to perform Sequential Feature Selection
def perform_sfs_feature_selection(X, y, k_features=6, cv=5):
    """Run forward Sequential Feature Selection once per selector estimator.

    Two estimators drive the selection, in this order: a ``LinearRegression``
    and a ``RandomForestRegressor(n_estimators=10, random_state=0)``. Each
    selector is scored with cross-validated R^2.

    Parameters
    ----------
    X : feature matrix handed unchanged to ``SFS.fit`` / ``SFS.transform``.
    y : target values handed unchanged to ``SFS.fit``.
    k_features : number of features each selector keeps (default 6, the
        value that was previously hard-coded).
    cv : number of cross-validation folds used for scoring (default 5, the
        value that was previously hard-coded).

    Returns
    -------
    list
        One entry per estimator (linear first, random forest second), each
        being the result of ``sfs.transform(X)`` for that selector.
    """
    selector_estimators = [
        LinearRegression(),
        RandomForestRegressor(n_estimators=10, random_state=0),
    ]

    # NOTE(review): main() in this notebook passes the full dataset here and
    # only splits afterwards, so selection sees rows that later end up in the
    # test split -- confirm whether that leakage is acceptable.
    sfs_features_list = []
    for model in selector_estimators:
        sfs = SFS(
            model,
            k_features=k_features,
            forward=True,
            floating=False,
            scoring='r2',
            cv=cv,
        )
        # fit() configures the selector in place; its return value is not needed.
        sfs.fit(X, y)
        sfs_features_list.append(sfs.transform(X))

    return sfs_features_list

In [3]:
# Function to evaluate a regressor
def evaluate_regressor(regressor, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Calls ``regressor.predict(X_test)`` and compares the predictions with
    ``y_test``.

    Returns
    -------
    tuple
        ``(mse, r2)`` as computed by ``mean_squared_error`` and ``r2_score``.
    """
    predictions = regressor.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

In [4]:
# Function to train Linear Regression model
def train_linear_regression(X_train, y_train, X_test, y_test):
    """Fit a ``LinearRegression`` on the training data and score it on the test data.

    Returns
    -------
    tuple
        ``(fitted_model, mse, r2)`` where the two metrics come from
        ``evaluate_regressor`` applied to ``X_test`` / ``y_test``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = LinearRegression().fit(X_train, y_train)
    test_mse, test_r2 = evaluate_regressor(model, X_test, y_test)
    return model, test_mse, test_r2

In [5]:
# Function to train Random Forest Regressor
def train_random_forest_regressor(X_train, y_train, X_test, y_test):
    """Fit a random forest on the training data and score it on the test data.

    The forest uses 10 trees and a fixed ``random_state`` of 0.

    Returns
    -------
    tuple
        ``(fitted_model, mse, r2)`` where the two metrics come from
        ``evaluate_regressor`` applied to ``X_test`` / ``y_test``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)
    test_mse, test_r2 = evaluate_regressor(model, X_test, y_test)
    return model, test_mse, test_r2

In [6]:
# Function to split and scale the dataset
def split_and_scale_data(X, y):
    """Split into train/test sets and standardize the features.

    A quarter of the rows are held out for testing (``test_size=0.25``) with
    a fixed ``random_state`` of 0. The ``StandardScaler`` is fitted on the
    training features only and then applied to the test features.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, scaler)`` with the feature sets
        scaled and ``scaler`` being the fitted ``StandardScaler``.
    """
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Scaling statistics come from the training split alone.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

In [7]:
def main():
    """Train regressors on SFS-selected features, persist the best one, and demo a prediction.

    Steps:
      1. Read ``prep.csv`` and expand it with ``pd.get_dummies(drop_first=True)``.
      2. Use ``classification_yes`` as the target and every other column as a feature.
      3. Build one feature set per selector via ``perform_sfs_feature_selection``.
      4. For each feature set, split/scale the data, then fit a linear regression
         and a random forest; keep whichever model has the highest test R^2,
         together with the scaler fitted for that feature set.
      5. Pickle the winning model to ``finalized_model_<name>.sav`` and its scaler
         to ``scaler.pkl``, reload the model, and predict on a hard-coded example.

    Raises
    ------
    RuntimeError
        If no model was selected (no feature sets, or no comparable R^2 score).
    """
    # Load dataset and preprocess
    dataset = pd.read_csv("prep.csv")
    df = pd.get_dummies(dataset, drop_first=True)

    X = df.drop('classification_yes', axis=1)
    y = df['classification_yes']

    # Perform feature selection
    selected_features_list = perform_sfs_feature_selection(X, y)

    # Per-feature-set R^2 scores for each model type (collected, not yet reported).
    r2_linear_regression = []
    r2_random_forest = []

    best_model = None
    best_r2 = -float('inf')
    best_model_name = ""
    best_scaler = None

    # Train and evaluate models for each selected feature set
    for features in selected_features_list:
        X_train, X_test, y_train, y_test, scaler = split_and_scale_data(features, y)

        # Only R^2 is used for model selection; the MSE values are ignored.
        lin_reg, _mse_lin, r2_lin = train_linear_regression(X_train, y_train, X_test, y_test)
        r2_linear_regression.append(r2_lin)

        rf_reg, _mse_rf, r2_rf = train_random_forest_regressor(X_train, y_train, X_test, y_test)
        r2_random_forest.append(r2_rf)

        # The scaler is tracked with the model because it was fitted on the
        # same feature set the model was trained on.
        if r2_lin > best_r2:
            best_r2 = r2_lin
            best_model = lin_reg
            best_model_name = "LinearRegression"
            best_scaler = scaler

        if r2_rf > best_r2:
            best_r2 = r2_rf
            best_model = rf_reg
            best_model_name = "RandomForestRegressor"
            best_scaler = scaler

    # Fail loudly instead of pickling None and crashing later on best_scaler.
    if best_model is None:
        raise RuntimeError(
            "No model was selected: feature selection returned no feature sets "
            "or no R^2 score was comparable."
        )

    print(f"Best Model: {best_model_name}")
    print(f"Best R^2 Score: {best_r2}")

    # Save the best model and scaler; `with` guarantees the files are flushed
    # and closed before the model is read back below.
    model_filename = f"finalized_model_{best_model_name.lower()}.sav"
    with open(model_filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    with open('scaler.pkl', 'wb') as scaler_file:
        pickle.dump(best_scaler, scaler_file)

    # Example input for prediction: six values, matching the default number of
    # selected features.
    # NOTE(review): which columns these six values correspond to is not
    # recorded anywhere (the selected feature names are not saved) -- confirm.
    example_input = best_scaler.transform([[5, 50, 0, 0, 148.1126761, 0.6]])

    # Load the saved model and make a prediction. The file was written just
    # above by this function, so unpickling it here is trusted.
    with open(model_filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    prediction_result = loaded_model.predict(example_input)

    print("Prediction result:", prediction_result)

# Run the full pipeline only when this code is executed directly (as a script
# or in a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()



Best Model: RandomForestRegressor
Best R^2 Score: 0.9717946982218957
Prediction result: [1.]
