In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('boston.csv')

In [10]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [6]:
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [9]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib


# Define input features and output variable
X = df.drop('MEDV', axis=1)
y = df['MEDV']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the features
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

# Create a dictionary to store the models and their scalers
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'SVR': SVR()
}

# Function to evaluate and save model performance
def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, model_name, scaler_name):
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Calculate performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    
    # Save the model and the scaler
    joblib.dump(model, f'{model_name}_{scaler_name}.pkl')
    joblib.dump(scalers[scaler_name], f'{scaler_name}.pkl')
    
    return {
        'Model': model_name,
        'Scaler': scaler_name,
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'CV Mean Score': np.mean(cv_scores),
        'CV Std Dev': np.std(cv_scores)
    }

# List to store evaluation results
evaluation_results = []

# Train and evaluate models with different scalers
for scaler_name, scaler in scalers.items():
    # Scale the data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    for model_name, model in models.items():
        results = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, model_name, scaler_name)
        evaluation_results.append(results)
print(evaluation_results)
# Convert evaluation results to a DataFrame for better visualization
results_df = pd.DataFrame(evaluation_results)

# Save the results to a CSV file
results_df.to_csv('model_evaluation_results.csv', index=False)

results_df


[{'Model': 'LinearRegression', 'Scaler': 'StandardScaler', 'MSE': 24.291119474973517, 'MAE': 3.1890919658878483, 'R2': 0.668759493535632, 'CV Mean Score': 0.724435734224026, 'CV Std Dev': 0.06353380118752597}, {'Model': 'Ridge', 'Scaler': 'StandardScaler', 'MSE': 24.31290383049162, 'MAE': 3.1857238072445964, 'R2': 0.6684624359643558, 'CV Mean Score': 0.724514155284955, 'CV Std Dev': 0.06351058831040231}, {'Model': 'Lasso', 'Scaler': 'StandardScaler', 'MSE': 27.57769177613475, 'MAE': 3.4737704435416634, 'R2': 0.6239428734251422, 'CV Mean Score': 0.6568148909692434, 'CV Std Dev': 0.04555527414245711}, {'Model': 'DecisionTreeRegressor', 'Scaler': 'StandardScaler', 'MSE': 10.442156862745097, 'MAE': 2.3588235294117648, 'R2': 0.8576078253058844, 'CV Mean Score': 0.6375796563473883, 'CV Std Dev': 0.05433643717373799}, {'Model': 'RandomForestRegressor', 'Scaler': 'StandardScaler', 'MSE': 8.823070754901972, 'MAE': 2.1200686274509812, 'R2': 0.879686136802558, 'CV Mean Score': 0.8167254337186517,

Unnamed: 0,Model,Scaler,MSE,MAE,R2,CV Mean Score,CV Std Dev
0,LinearRegression,StandardScaler,24.291119,3.189092,0.668759,0.724436,0.063534
1,Ridge,StandardScaler,24.312904,3.185724,0.668462,0.724514,0.063511
2,Lasso,StandardScaler,27.577692,3.47377,0.623943,0.656815,0.045555
3,DecisionTreeRegressor,StandardScaler,10.442157,2.358824,0.857608,0.63758,0.054336
4,RandomForestRegressor,StandardScaler,8.823071,2.120069,0.879686,0.816725,0.054876
5,GradientBoostingRegressor,StandardScaler,6.307704,1.924274,0.913986,0.850413,0.054162
6,SVR,StandardScaler,25.66854,2.731716,0.649977,0.626847,0.035058
7,LinearRegression,MinMaxScaler,24.291119,3.189092,0.668759,0.724436,0.063534
8,Ridge,MinMaxScaler,23.730076,3.115059,0.67641,0.722869,0.055356
9,Lasso,MinMaxScaler,54.458243,5.114499,0.257392,0.231054,0.01487
