## Regressors

      There are many machine learning algorithms for regression, but choosing the best number of features and the best model to get the best accuracy is tricky. Whenever we select a model for prediction, we run into problems of underfitting and overfitting. In real scenarios overfitting is the more common case, and we address it through regularization and hyperparameter tuning. Here, I implemented the following models to find the best one for this cars dataset:
      
              1. Linear Regression
              2. Lasso Regression
              3. Ridge Regression
              4. Elastic Net Regression
              5. Least Angle Regression
              6. Stochastic Gradient Descent Regression
              7. Support Vector Regression
              8. K Neighbors Regression
              9. Decision Tree Regression
              10. Random Forest Regression
              11. XG Boost Regression

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

## Importing Libraries...

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE: this blanket filter hides every warning raised from here on,
# not just cosmetic ones; remove it when debugging unexpected results.
warnings.filterwarnings("ignore")

## Loading Data...

In [3]:
data = pd.read_csv("/kaggle/input/cars-dataset/mtcars2.csv")
data.head()

### Renaming column

In [4]:
data = data.rename(columns = {"Unnamed: 0":"Model"})
data.head()

## EDA

In [5]:
data.shape

In [6]:
# Count missing values per column, then display only the columns
# that contain at least one missing value.
nulldata = data.isnull().sum()
nulldata[nulldata>0]

In [7]:
data.describe()

In [8]:
data.info()

In [9]:
data = data.drop('Model',axis=1)
data.head()

In [10]:
data = data.rename(columns={'cyl':'cylinder','disp' : 'displacement','hp':'horsepower','wt':'weight'})
data.head()

### Selecting main columns for analysis

In [11]:
data = data[['mpg','cylinder','displacement','horsepower','weight','qsec']]
data.head()

In [12]:
# One box plot per column (subplots=True), laid out on a 3x3 grid.
data.plot(kind = 'box',subplots = True, layout = (3,3),figsize = (20,12))
plt.show()

In [13]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
plt.subplots(figsize = (10,8))
sns.heatmap(data.corr(),annot = True)
plt.show()

In [14]:
# Pairwise regression plots for every pair of columns (kind='reg'),
# with a kernel density estimate on the diagonal (diag_kind='kde').
sns.pairplot(data,diag_kind='kde',kind='reg')
plt.show()

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

### Helper Functions 

In [16]:
def build_model(regression_fn,dataset,x_col,y_col,test_frac,preprocess_fn=None,show_plot = False,random_state=None):
    """Split the data, fit a regressor, report its scores and return them.

    Parameters
    ----------
    regression_fn : callable
        Called as ``regression_fn(x_train, y_train)``; must return a fitted
        model exposing ``predict`` and ``score``.
    dataset : pandas.DataFrame
        Table holding both the feature columns and the target column.
    x_col : list of str
        Names of the feature columns.
    y_col : str
        Name of the target column.
    test_frac : float
        Passed to ``train_test_split`` as ``test_size``.
    preprocess_fn : callable, optional
        Zero-argument factory (e.g. ``StandardScaler``) returning a
        transformer with ``fit_transform`` and ``transform``. ``None``
        disables preprocessing.
    show_plot : bool, optional
        When truthy, plot predictions against the actual test targets.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the split random on every call; pass an int for repeatable scores.

    Returns
    -------
    dict
        ``{'training score': ..., 'testing score': ...}`` where the training
        score is ``model.score`` on the training split and the testing score
        is ``r2_score`` on the test split.
    """
    X = dataset[x_col]
    Y = dataset[y_col]

    # Split BEFORE preprocessing: the transformer must only learn its
    # statistics from the training rows, otherwise information from the
    # test rows leaks into training and inflates the testing score.
    x_train,x_test,y_train,y_test = train_test_split(
        X,Y,test_size=test_frac,random_state=random_state)

    if preprocess_fn is not None:
        preprocess = preprocess_fn()
        x_train = preprocess.fit_transform(x_train)
        # Re-use the statistics fitted on the training split.
        x_test = preprocess.transform(x_test)

    model = regression_fn(x_train,y_train)
    y_pred = model.predict(x_test)

    # Compute each score once and reuse it for both printing and returning.
    training_score = model.score(x_train,y_train)
    testing_score = r2_score(y_test,y_pred)

    print("Training Score : ",training_score)
    print("Testing Score : ",testing_score)

    if show_plot:
        plt.plot(y_pred,label = 'Predictions')
        plt.plot(y_test.values,label = 'Actual')
        plt.legend()
        plt.show()

    return {'training score': training_score,
            'testing score': testing_score}
    

In [17]:
# Maps an experiment label (e.g. 'Single Linear') to the score dict returned
# by build_model: {'training score': ..., 'testing score': ...}.
results_dict = {}

In [18]:
def compare_results():
    """Print the training and testing score of every entry in ``results_dict``."""
    for label, scores in results_dict.items():
        print("Regression : ",label)
        print("Training Score : ",scores['training score'])
        print("Testing Score : ",scores['testing score'])
        print()

In [19]:
def overfitting():
    """Report the experiments whose training score beats the testing score by more than 0.02.

    Reads the global ``results_dict``. Prints the scores of each matching
    experiment, then the number of matches, then their labels one per line.
    """
    print("These are the overfitting models : \n")
    flagged = []
    for label, scores in results_dict.items():
        train_score = scores['training score']
        test_score = scores['testing score']
        # Skip anything that does not exceed the 0.02 margin.
        if not (train_score > test_score+0.02):
            continue
        print("Regression : ",label)
        print("Training Score : ",train_score)
        print("Testing Score : ",test_score)
        print()
        flagged.append(label)
    print("No. of Overfitting models : ",len(flagged))
    for label in flagged:
        print(label)

In [20]:
def underfitting():
    """Report the experiments whose training score is more than 0.02 below the testing score.

    Reads the global ``results_dict``. Prints the scores of each matching
    experiment, then the number of matches, then their labels one per line.
    """
    print("These are the underfitting models : \n")
    flagged = []
    for label, scores in results_dict.items():
        train_score = scores['training score']
        test_score = scores['testing score']
        # Skip anything that is not below the testing score by the 0.02 margin.
        if not (train_score < test_score-0.02):
            continue
        print("Regression : ",label)
        print("Training Score : ",train_score)
        print("Testing Score : ",test_score)
        print()
        flagged.append(label)
    print("No. of Underfitting models : ",len(flagged))
    for label in flagged:
        print(label)

In [21]:
def balanced():
    """Report the experiments whose training score lies strictly within 0.02 of the testing score.

    Reads the global ``results_dict``. Prints the scores of each matching
    experiment, then the number of matches, then their labels one per line.
    """
    print("Best models among all the models : \n")
    selected = []
    for label, scores in results_dict.items():
        train_score = scores['training score']
        test_score = scores['testing score']
        # Open interval: both bounds are exclusive.
        within_margin = test_score-0.02 < train_score < test_score+0.02
        if not within_margin:
            continue
        print("Regression : ",label)
        print("Training Score : ",train_score)
        print("Testing Score : ",test_score)
        print()
        selected.append(label)
    print("No. of Balanced models : ",len(selected))
    for label in selected:
        print(label)

### Importing all Regressor Functions 

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [23]:
def linear_reg(x_train,y_train):
    """Return a ``LinearRegression`` fitted on the given training data."""
    # fit() returns the estimator itself, so the call can be chained.
    return LinearRegression().fit(x_train,y_train)

In [24]:
def lasso_reg(x_train,y_train,alpha = 0.5):
    """Return a ``Lasso`` model with the given ``alpha``, fitted on the training data."""
    regressor = Lasso(alpha=alpha)
    return regressor.fit(x_train,y_train)

In [25]:
def ridge_reg(x_train,y_train,alpha = 0.5):
    """Return a ``Ridge`` model with the given ``alpha``, fitted on the training data."""
    regressor = Ridge(alpha=alpha)
    return regressor.fit(x_train,y_train)

In [26]:
def elastic_net_reg(x_train,y_train,alpha = 1, l1_ratio = 0.5, warm_start = True,max_iter = 10000):
    """Return an ``ElasticNet`` model fitted on the training data.

    ``alpha``, ``l1_ratio``, ``warm_start`` and ``max_iter`` are forwarded
    unchanged to the ``ElasticNet`` constructor.
    """
    settings = {
        'alpha': alpha,
        'l1_ratio': l1_ratio,
        'warm_start': warm_start,
        'max_iter': max_iter,
    }
    return ElasticNet(**settings).fit(x_train,y_train)

In [27]:
def lars_reg(x_train,y_train,n_nonzero_coefs=4):
    """Return a ``Lars`` model with the given ``n_nonzero_coefs``, fitted on the training data."""
    regressor = Lars(n_nonzero_coefs=n_nonzero_coefs)
    return regressor.fit(x_train,y_train)

In [28]:
def sgd_reg(x_train,y_train,max_iter = 10000,tol = 1e-3):
    """Return an ``SGDRegressor`` fitted on the training data.

    ``max_iter`` and ``tol`` are forwarded unchanged to the constructor.
    """
    regressor = SGDRegressor(max_iter=max_iter,tol=tol)
    return regressor.fit(x_train,y_train)

In [29]:
def svr_reg(x_train,y_train,kernel='linear',epsilon = 0.05, C = 0.3):
    """Return an ``SVR`` model fitted on the training data.

    ``kernel``, ``epsilon`` and ``C`` are forwarded unchanged to the constructor.
    """
    settings = {'kernel': kernel, 'epsilon': epsilon, 'C': C}
    return SVR(**settings).fit(x_train,y_train)


In [30]:
def kneighbor_reg(x_train,y_train,n_neighbors = 10):
    """Return a ``KNeighborsRegressor`` using ``n_neighbors`` neighbours, fitted on the training data."""
    regressor = KNeighborsRegressor(n_neighbors=n_neighbors)
    return regressor.fit(x_train,y_train)


In [31]:
def decisiontree_reg(x_train,y_train,max_depth = 2):
    """Return a ``DecisionTreeRegressor`` limited to ``max_depth``, fitted on the training data."""
    regressor = DecisionTreeRegressor(max_depth=max_depth)
    return regressor.fit(x_train,y_train)

In [32]:
def randomforest_reg(x_train,y_train,max_depth = 2):
    """Return a ``RandomForestRegressor`` limited to ``max_depth``, fitted on the training data."""
    regressor = RandomForestRegressor(max_depth=max_depth)
    return regressor.fit(x_train,y_train)

In [33]:
def xgb_reg(x_train,y_train):
    """Return an ``XGBRegressor`` with default settings, fitted on the training data."""
    regressor = XGBRegressor()
    regressor.fit(x_train,y_train)
    return regressor

In [34]:
results_dict['Single Linear'] = build_model(linear_reg,data,['weight'],'mpg',0.2,None,True)

In [35]:
results_dict['Multi Linear all'] = build_model(linear_reg,data,
                                                     ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [36]:
results_dict['Multi Linear 4 features'] = build_model(linear_reg,data,
                                                            ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [37]:
results_dict['Single Lasso'] = build_model(lasso_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [38]:
results_dict['Multi Lasso all'] = build_model(lasso_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [39]:
results_dict['Multi Lasso 4 features'] = build_model(lasso_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [40]:
results_dict['Single Ridge'] = build_model(ridge_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [41]:
results_dict['Multi Ridge all'] = build_model(ridge_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [42]:
# Use the same StandardScaler preprocessing as the other Ridge experiments:
# the Ridge penalty depends on feature scale, so fitting this variant on raw
# features would not be comparable with 'Single Ridge' / 'Multi Ridge all'.
results_dict['Multi Ridge 4 features'] = build_model(ridge_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [43]:
results_dict['Single Elastic net'] = build_model(elastic_net_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [44]:
results_dict['Multi Elastic net all'] = build_model(elastic_net_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [45]:
results_dict['Multi Elastic net 4 features'] = build_model(elastic_net_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [46]:
results_dict['Single Lars'] = build_model(lars_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [47]:
results_dict['Multi Lars all'] = build_model(lars_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [48]:
# Use the same StandardScaler preprocessing as 'Single Lars' and
# 'Multi Lars all' so the three Lars experiments differ only in their features.
results_dict['Multi Lars 4 features'] = build_model(lars_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [49]:
results_dict['Single SVR'] = build_model(svr_reg,data,['weight'],'mpg',0.2,StandardScaler,True)

In [50]:
results_dict['Multi SVR all'] = build_model(svr_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [51]:
results_dict['Multi SVR 4 features'] = build_model(svr_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [52]:
results_dict['Single SGD'] = build_model(sgd_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [53]:
results_dict['Multi SGD all'] = build_model(sgd_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [54]:
results_dict['Multi SGD 4 features'] = build_model(sgd_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [55]:
results_dict['Single Kneighbors'] = build_model(kneighbor_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [56]:
results_dict['Multi Kneighbors all'] = build_model(kneighbor_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [57]:
results_dict['Multi Kneighbors 4 features'] = build_model(kneighbor_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [58]:
results_dict['Single Decision tree'] = build_model(decisiontree_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [59]:
results_dict['Multi Decision tree all'] = build_model(decisiontree_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [60]:
results_dict['Multi Decision tree 4 features'] = build_model(decisiontree_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [61]:
results_dict['Single Random Forest'] = build_model(randomforest_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [62]:
results_dict['Multi Random Forest all'] = build_model(randomforest_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [63]:
results_dict['Multi Random Forest 4 features'] = build_model(randomforest_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

In [64]:
results_dict['Single xgb'] = build_model(xgb_reg,data,
                                                    ['weight'],'mpg',0.2,StandardScaler,True)

In [65]:
results_dict['Multi xgb all'] = build_model(xgb_reg,data,
                                                    ['cylinder','displacement','horsepower','qsec','weight'],'mpg',0.2,StandardScaler,True)

In [66]:
results_dict['Multi xgb 4 features'] = build_model(xgb_reg,data,
                                                    ['cylinder','displacement','horsepower','weight'],'mpg',0.2,StandardScaler,True)

### Comparing all the results of applied Regressor functions with number of features

In [67]:
compare_results()

### Overfitting models

In [68]:
overfitting()

### Underfitting models

In [69]:
underfitting()

### Balanced Models

In [70]:
balanced()

### Finding best model with number of features with best training and testing score

In [71]:
# One row per experiment, with columns 'training score' and 'testing score'.
results = pd.DataFrame(results_dict).T

# Keep the experiments whose training score lies strictly within 0.02 of the
# testing score (the same window used by balanced()).
best = results.loc[
    (results['training score']< results['testing score'] + 0.02)
    & (results['training score']>results['testing score'] - 0.02)
]

# Rank the remaining experiments by testing score and keep the top one.
best_model = (
    best.groupby(best.index)[['training score','testing score']]
    .max()
    .sort_values(by = ['testing score'],ascending=False)
    .head(1)
)
print(f"Best Model with best training and best testing score for this dataset :\n\n{best_model}")

### Cons of this idea:
##### Every time you run the notebook, the train/test split is drawn at random again, so the sets of overfitted, underfitted, and balanced models (and therefore the best model) change from run to run.
