In [19]:
import pandas as pd
import numpy as np

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [34]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [33]:
models = {
    "Linear Regressor": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Ada Boost": AdaBoostRegressor(),
    "SVM": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "XG Boost": XGBRegressor()
}

In [40]:
def evaluate_model(true, pred):
    mse = mean_squared_error(true, pred)
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, pred)
    return mse, mae, rmse, r2

In [25]:
data = pd.read_csv("data/50_Startups.csv")

In [26]:
features = data.drop("Profit", axis = 1)
target = data["Profit"]

In [27]:
x_train, x_test, y_train, y_test = train_test_split(features, target, shuffle = True)

In [30]:
print(f"Train data contains {x_train.shape[0]} records.")
print(f"Test data contains {x_test.shape[0]} records.")

Train data contains 37 records.
Test data contains 13 records.


In [37]:
model_list = []
train_mae_list = []
train_mse_list = []
train_rmse_list = []
train_r2_score_list = []
test_mae_list = []
test_mse_list = []
test_rmse_list = []
test_r2_score_list = []

In [41]:
for i in range(len(list(models))):
    # Train model
    model = list(models.values())[i]
    model.fit(x_train, y_train)

    # Make Predictions
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    # Evaluate Model
    mse_train, mae_train, rmse_train, r2_train = evaluate_model(y_train, train_pred)
    mse_test, mae_test, rmse_test, r2_test = evaluate_model(y_test, test_pred)

    # Append to the list
    train_mae_list.append(mae_train)
    train_mse_list.append(mse_train)
    train_rmse_list.append(rmse_train)
    train_r2_score_list.append(r2_train)
    test_mae_list.append(mae_test)
    test_mse_list.append(mse_test)
    test_rmse_list.append(rmse_test)
    test_r2_score_list.append(r2_test)
    model_list.append({list(models.keys())[i]: model})
    
    # Print
    print('Model Performance on Training Set')
    print("- Root Mean Squared Error: {:.4f}".format(rmse_train))
    print("- Mean Squared Error: {:.4f}".format(mse_train))
    print("- Mean Absolute Error: {:.4f}".format(mae_train))
    print("- R2 Score: {:.4f}".format(r2_train))

    print('Model Performance on Test Set')
    print("- Root Mean Squared Error: {:.4f}".format(rmse_test))
    print("- Mean Squared Error: {:.4f}".format(mse_test))
    print("- Mean Absolute Error: {:.4f}".format(mae_test))
    print("- R2 Score: {:.4f}".format(r2_test))

    print("="*40)
    print("\n")

Model Performance on Training Set
- Root Mean Squared Error: 9559.5213
- Mean Squared Error: 91384448.2387
- Mean Absolute Error: 7234.5268
- R2 Score: 0.9476
Model Performance on Test Set
- Root Mean Squared Error: 6741.4670
- Mean Squared Error: 45447376.9199
- Mean Absolute Error: 4817.8098
- R2 Score: 0.9562


Model Performance on Training Set
- Root Mean Squared Error: 0.0000
- Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
Model Performance on Test Set
- Root Mean Squared Error: 8182.3058
- Mean Squared Error: 66950128.5123
- Mean Absolute Error: 5888.9054
- R2 Score: 0.9355


Model Performance on Training Set
- Root Mean Squared Error: 4177.0652
- Mean Squared Error: 17447873.4331
- Mean Absolute Error: 3266.6665
- R2 Score: 0.9900
Model Performance on Test Set
- Root Mean Squared Error: 5804.6857
- Mean Squared Error: 33694376.3296
- Mean Absolute Error: 4676.2107
- R2 Score: 0.9676


Model Performance on Training Set
- Root Mean Squared Error: 4213

In [50]:
model_performance = pd.DataFrame({
    "Model Name": [str(list(m.keys())[0]) for m in model_list],
    "Train Mean Absolute Error": train_mae_list,
    "Train Mean Squared Error": train_mse_list,
    "Train Root Mean Square Error": train_rmse_list,
    "Train R2 Score": train_r2_score_list,
    "Test Mean Absolute Error": test_mae_list,
    "Test Mean Squared Error": test_mse_list,
    "Test Root Mean Square Error": test_rmse_list,
    "Test R2 Score": test_r2_score_list,
})

In [52]:
model_performance

Unnamed: 0,Model Name,Train Mean Absolute Error,Train Mean Squared Error,Train Root Mean Square Error,Train R2 Score,Test Mean Absolute Error,Test Mean Squared Error,Test Root Mean Square Error,Test R2 Score
0,Linear Regressor,7234.53,91384448.24,9559.52,0.95,4817.81,45447376.92,6741.47,0.96
1,Decision Tree,0.0,0.0,0.0,1.0,5888.91,66950128.51,8182.31,0.94
2,Random Forest,3266.67,17447873.43,4177.07,0.99,4676.21,33694376.33,5804.69,0.97
3,Ada Boost,3606.31,17749796.82,4213.05,0.99,6981.09,95389903.79,9766.78,0.91
4,SVM,33086.73,1760292083.01,41955.83,-0.01,26833.21,1113989940.45,33376.49,-0.07
5,K-Nearest Neighbors,11498.32,284833654.62,16877.02,0.84,8111.3,106055173.53,10298.31,0.9
6,XG Boost,0.02,0.0,0.02,1.0,8472.51,107569105.14,10371.55,0.9


In [53]:
model_performance.to_csv("Models_Performance.csv")