In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv("data/stud.csv")

In [3]:
data

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [4]:
X = data.drop("math_score", axis=1)
y = data["math_score"]

In [20]:
num_features = [column for column in X.columns if X[column].dtypes != "O"]
cat_features = [column for column in X.columns if X[column].dtypes == "O"]

onehot = OneHotEncoder()
standard = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoding", onehot, cat_features),
        ("Standardization", standard, num_features)
    ],
    remainder="passthrough"
)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [6]:
num_features, cat_features

(['reading_score', 'writing_score'],
 ['gender',
  'race_ethnicity',
  'parental_level_of_education',
  'lunch',
  'test_preparation_course'])

In [7]:
X = preprocessor.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=45)

In [9]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import numpy as np
def regression_model_evaluate(true, predict):
    rmse = np.sqrt(mean_squared_error(true, predict))
    mae = mean_absolute_error(true, predict)
    r_score = r2_score(true, predict)
    return mae, rmse, r_score

In [10]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor


models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge Regression": Ridge(),
    "Elastic Net": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(),
    "K-Nearest Neighbour": KNeighborsRegressor(),
    "Random Forest": RandomForestRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "GradientBoost": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor()
}

In [15]:
model_scores = {}

In [16]:
# Model Training
for model_name, model_obj in models.items():
    model_obj.fit(X_train, y_train)
    y_train_predict = model_obj.predict(X_train)
    y_test_predict = model_obj.predict(X_test)

    print(f"\"{model_name}\"", end="\n\n")
    print("{0}Training Performace{0}".format("*"*10))
    print(f"{'='*30}")
    
    mae, rmse, r_score = regression_model_evaluate(y_train, y_train_predict)

    print(f"R2 Score: {r_score}")
    print(f"Mean Absolute error: {mae}")
    print(f"Root Mean square error: {rmse}")

    print(f"{'-'*30}", end="\n\n")

    print("{0}Test Performace{0}".format("*"*10))
    print(f"{'='*30}")
    
    mae, rmse, r_score = regression_model_evaluate(y_test, y_test_predict)

    print(f"R2 Score: {r_score}")
    print(f"Mean Absolute error: {mae}")
    print(f"Root Mean square error: {rmse}")
    print(f"{'='*30}", end="\n\n")

    model_scores[model_name] = r_score

"Linear Regression"

**********Training Performace**********
R2 Score: 0.8797373740834784
Mean Absolute error: 4.190833333333333
Root Mean square error: 5.290248497629073
------------------------------

**********Test Performace**********
R2 Score: 0.8578171545837973
Mean Absolute error: 4.4885
Root Mean square error: 5.55448579978381

"Lasso"

**********Training Performace**********
R2 Score: 0.8192405088937027
Mean Absolute error: 5.0813735866672065
Root Mean square error: 6.485766158800797
------------------------------

**********Test Performace**********
R2 Score: 0.7822212526072458
Mean Absolute error: 5.588426294169948
Root Mean square error: 6.874292509651642

"Ridge Regression"

**********Training Performace**********
R2 Score: 0.8809232273702041
Mean Absolute error: 4.160737643963361
Root Mean square error: 5.264101554024734
------------------------------

**********Test Performace**********
R2 Score: 0.8587633290815598
Mean Absolute error: 4.489780644988463
Root Mean square 

In [19]:
dict(sorted(model_scores.items(), key=lambda item: item[1]))

{'Decision Tree': 0.6784005191013058,
 'Elastic Net': 0.7089477213228434,
 'K-Nearest Neighbour': 0.7712370685523415,
 'Lasso': 0.7822212526072458,
 'XGBoost': 0.7839688375592334,
 'AdaBoost': 0.7994232780802469,
 'Random Forest': 0.8150577366269558,
 'GradientBoost': 0.8407374053989094,
 'Linear Regression': 0.8578171545837973,
 'Ridge Regression': 0.8587633290815598}