# Lab | Comparing regression models


For this lab, we will be using the same dataset we used in the previous labs. Load the cleaned categorical and numerical dataframes that you saved at the end of Monday's labs.

## Data Analysis Process
#### Remember the process:

- Case Study
- Get data
- Cleaning/Wrangling/EDA
- **Processing Data**
- **Modeling**
- **Validation**
- Reporting

### Instructions

Concatenate Numerical and Categorical dataframes into one dataframe called data.



In [1]:
import pandas as pd
import numpy as np

In [2]:
#Importing numerical file

numerical = pd.read_csv('../lab-cleaning-numerical-data/files_for_lab/numerical.csv')
display("Numerical shape: ", numerical.shape)

#Importing categorical file

categorical = pd.read_csv('../lab-cleaning-categorical-data/files_for_lab/categorical.csv')
display("Categorical shape: ", numerical.shape)

'Numerical shape: '

(9134, 8)

'Categorical shape: '

(9134, 8)

In [3]:
# Concatenating both dataframes

data = pd.concat([numerical, categorical], axis=1)

display(data.head(),data.shape)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,state,response,...,effective_to_date,employmentstatus,gender,location_code,marital_status,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,2763.519279,56274.0,69,32,5,0,1,384.811147,Washington,No,...,2011-02,Employed,F,Suburban,Married,Corporate L3,Offer1,Agent,Two-Door Car,Medsize
1,6979.535903,0.0,94,13,42,0,8,1131.464935,Arizona,No,...,2011-01,Unemployed,F,Suburban,Single,Personal L3,Offer3,Agent,Four-Door Car,Medsize
2,12887.43165,48767.0,108,18,38,0,2,566.472247,Nevada,No,...,2011-02,Employed,F,Suburban,Married,Personal L3,Offer1,Agent,Two-Door Car,Medsize
3,7645.861827,0.0,106,18,65,0,7,529.881344,California,No,...,2011-01,Unemployed,M,Suburban,Married,Corporate L2,Offer1,Call Center,SUV,Medsize
4,2813.692575,43836.0,73,12,44,0,1,138.130879,Washington,No,...,2011-02,Employed,M,Rural,Single,Personal L1,Offer1,Agent,Four-Door Car,Medsize


(9134, 22)

#### 1. In this final lab, we will model our data. Import sklearn `train_test_split` and separate the data.



In [4]:
# Importing library

from sklearn.model_selection import train_test_split

In [5]:
# y-X split

y = data['total_claim_amount']
X = data.drop(['total_claim_amount'], axis=1)

#### 2. Separate X_train and X_test into numerical and categorical (X_train_cat , X_train_num , X_test_cat , X_test_num)


In [6]:
# Train-Test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("y_train: ",y_train.shape)
print("y_test: ",y_test.shape)

X_train:  (7307, 21)
X_test:  (1827, 21)
y_train:  (7307,)
y_test:  (1827,)


In [7]:
# Getting numerical data for X_train and X_test

numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

numericals_train.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
3933,4670.058282,69921.0,118,28,22,2,1
588,4458.113369,17622.0,65,1,36,1,3
8693,2648.350463,29587.0,69,8,68,0,1
3423,2722.408776,58620.0,68,8,60,0,1
4433,5207.197499,21576.0,65,27,16,0,3


In [8]:
# Getting categorical data for X_train and X_test

categoricals_train= X_train.select_dtypes(object)
categoricals_test= X_test.select_dtypes(object)

categoricals_train.head()

Unnamed: 0,state,response,coverage,education,effective_to_date,employmentstatus,gender,location_code,marital_status,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
3933,Arizona,No,Premium,Bachelor,2011-02,Employed,F,Urban,Divorced,Personal L3,Offer1,Agent,Two-Door Car,Medsize
588,California,Yes,Basic,Bachelor,2011-01,Retired,M,Suburban,Divorced,Personal L3,Offer1,Agent,Four-Door Car,Medsize
8693,California,No,Basic,High School or Below,2011-01,Medical Leave,F,Suburban,Married,Personal L2,Offer4,Web,Two-Door Car,Small
3423,California,No,Basic,High School or Below,2011-01,Employed,M,Urban,Divorced,Personal L3,Offer2,Branch,Two-Door Car,Large
4433,California,No,Basic,Doctor,2011-01,Employed,F,Suburban,Married,Personal L3,Offer4,Web,Four-Door Car,Medsize


#### 3. Use X_train_num to fit scalers.  Transform BOTH X_train_num and X_test_num.


In [9]:
# Importing library to use StandardScaler

from sklearn.preprocessing import StandardScaler

# Fitting transformer with numerical X_train

transformer = StandardScaler().fit(numericals_train)

# Applying transformer to numerical X_train

numericals_train_standardized = transformer.transform(numericals_train)

# Applying transformer to numerical X_test

numericals_test_standardized = transformer.transform(numericals_test)

In [10]:
#array

In [11]:
pd.DataFrame(numericals_train_standardized).head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.484172,1.062465,0.713631,1.286392,-0.938702,1.767958,-0.824104
1,-0.514795,-0.655509,-0.815185,-1.388838,-0.435063,0.673451,0.010971
2,-0.776275,-0.26247,-0.699802,-0.69526,0.716112,-0.421056,-0.824104
3,-0.765575,0.691237,-0.728648,-0.69526,0.428318,-0.421056,-0.824104
4,-0.406564,-0.525624,-0.815185,1.18731,-1.154547,-0.421056,0.010971



#### 4. Encode the categorical variables X_train_cat and X_test_cat (See the hint below for encoding categorical data!!!)


In [12]:
# Importing library to use OneHotEncoder

from sklearn.preprocessing import OneHotEncoder

# Fitting encoder with categorical X_train

encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)

# Applying encoder to categorical X_train

categoricals_train_encoded = encoder.transform(categoricals_train).toarray()

# Applying encoder to categorical X_test

categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

In [13]:
#array


#### 5. Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.

##### ***********************************************
##### Hint for Categorical Variables

You should deal with the categorical variables as shown below (for ordinal encoding, dummy code has been provided as well):
Encoder Type | Column 
-----------------|-----------------
One hot | state

Ordinal | coverage

Ordinal | employmentstatus

Ordinal | location code

Ordinal | vehicle size


One hot | marital status

One hot | policy type

One hot | policy

One hot | renew offercustomer_df

One hot | sales channel

One hot | vehicle class



###### Dummy code

data["coverage"] = data["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})

given that column "coverage" in the dataframe "data" has three categories:

"basic", "extended", and "premium" and values are to be represented in the same order.
# ******************************************************



**Comment:**

I'm not applying the Ordinal Encoding as I consider that the rank or hierarchy that it may preserve is not relevant for the later linear regression or KNeighbours results.

OneHotEncoder may generate more columns, but in this case is not an issue that these predicting models cannot handle.

After checking score results with teammates that actually did the ordinal encoding, their models obtained worse results for their predictions.


#### 6. Try a simple linear regression with all the data to see whether we are getting good results.



In [14]:
# Concatenating numericals and categoricals from both train and test subsets

X_train_processed = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test_processed = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

In [15]:
pd.DataFrame(X_train_processed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,-0.484172,1.062465,0.713631,1.286392,-0.938702,1.767958,-0.824104,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.514795,-0.655509,-0.815185,-1.388838,-0.435063,0.673451,0.010971,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.776275,-0.262470,-0.699802,-0.695260,0.716112,-0.421056,-0.824104,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,-0.765575,0.691237,-0.728648,-0.695260,0.428318,-0.421056,-0.824104,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.406564,-0.525624,-0.815185,1.187310,-1.154547,-0.421056,0.010971,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,-0.163959,-0.097305,-0.180582,-0.893425,0.895983,-0.421056,1.681121,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7303,0.153083,-0.120891,0.598249,-0.001681,0.680138,-0.421056,2.098658,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7304,-0.414768,0.921477,-0.844030,0.592814,-0.722857,-0.421056,0.010971,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7305,2.357425,-1.234376,-0.238273,-0.992507,-1.190521,0.673451,-0.406566,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# Importing libraries 

from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [17]:
# Linear Regression

lm = linear_model.LinearRegression()
lm.fit(X_train_processed,y_train)

LinearRegression()

In [18]:
# Predictions on TRAIN data and R2 score 

predictions = lm.predict(X_train_processed)
r2_score(y_train, predictions)

0.7723136419160986

In [19]:
# Preedictions on TEST data and R2 score

predictions_test = lm.predict(X_test_processed)
r2_score(y_test, predictions_test)

0.7719599560601261

#### 7. Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.

  

In [20]:
# We got X_train_processed, X_test_processed, y_train, y_test

In [21]:
from sklearn.neighbors import KNeighborsRegressor

In [22]:
# Function to be applied on Linear Regression and KNeighbours

def models_train_test(k):
    
    # Linear Regression
    
    # Fitting and training the model
    lm = linear_model.LinearRegression()
    lm.fit(X_train_processed,y_train)
    # Predictions on TRAIN data and R2 score 
    #lm_train_predictions = lm.predict(X_train_processed)
    #lm_train_r2 = r2_score(y_train, lm_train_predictions)
    # Predictions on TEST data and R2 score
    lm_test_predictions = lm.predict(X_test_processed)
    lm_test_r2 = r2_score(y_test, lm_test_predictions)

    
    #KNeigbhours
   
    # Fitting and traing the model
    knm = KNeighborsRegressor(n_neighbors=k)
    knm.fit(X_train_processed, y_train)
    # Predictions on TRAIN data and score 
    #kn_train_score = knm.score(X_train_processed, y_train)
    # Predictions on TEST data and score
    kn_test_score = knm.score(X_test_processed, y_test)
    kn_test_predictions = knm.predict(X_test_processed)
    
    
    
    return lm_test_r2, kn_test_score, k
        

#### 8. Use the function to check `LinearRegressor` and `KNeighborsRegressor`.



Function arguments (k):
- k = Number of neighbours for KNeighbors Regressor

In [23]:
# Applying function

lm_test_r2, kn_test_score, k = models_train_test(25)

In [24]:
print("KNeighbour TEST score: {} (k set to {}).\nLinear Regression R2 TEST score: {}".format(kn_test_score, k, lm_test_r2))

KNeighbour TEST score: 0.6610546553225504 (k set to 25).
Linear Regression R2 TEST score: 0.7719599560601261


#### 9. You can check also the `MLPRegressor` for this task!



Multi-layer perceptrons (MLP) is an artificial neural network that has 3 or more layers of perceptrons. These layers are- a single input layer, 1 or more hidden layers, and a single output layer of perceptrons. The data flows in a single direction, that is forward, from the input layers-> hidden layer(s) -> output layer. Backpropagation is a technique where the multi-layer perceptron receives feedback on the error in its results and the MLP adjusts its weights accordingly to make more accurate predictions in the future. MLP is used in many machine learning techniques like classification and regression. They have been shown to give highly accurate results for classification problems in particular.

This model optimizes the squared error using LBFGS or stochastic gradient descent.

MLP Regressor Model

MLPRegressor model with 2 hidden layers of 50 neurons each

model = MLPRegressor(hidden_layer_sizes=(50, 50))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [25]:
from sklearn import datasets
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

In [26]:
# Adding MLP Regressor to the function

def models_train_test(k,lay_1,lay_2):
    
    # Linear Regression
    
    # Fitting and training the model
    lm = linear_model.LinearRegression()
    lm.fit(X_train_processed,y_train)
    # Predictions on TRAIN data and R2 score 
    #lm_train_predictions = lm.predict(X_train_processed)
    #lm_train_r2 = r2_score(y_train, lm_train_predictions)
    # Predictions on TEST data and R2 score
    lm_test_predictions = lm.predict(X_test_processed)
    lm_test_r2 = r2_score(y_test, lm_test_predictions)

    
    #KNeigbhours
   
    # Fitting and training the model
    knm = KNeighborsRegressor(n_neighbors=k)
    knm.fit(X_train_processed, y_train)
    # Predictions on TRAIN data and score 
    #kn_train_score = knm.score(X_train_processed, y_train)
    # Predictions on TEST data and score
    kn_test_predictions = knm.predict(X_test_processed)
    kn_test_score = knm.score(X_test_processed, y_test) #error with kn test_predictions, what's been tested it's neighbours from X_test?
    
    
    #MLP Regressor
    
    # Fitting and training the model
    mlp = MLPRegressor(hidden_layer_sizes=(lay_1,lay_2))
    mlp.fit(X_train_processed, y_train)

    # Predictions on TEST data and score
    mlp_test_predictions = mlp.predict(X_test_processed)
    mlp_test_score = r2_score(y_test,mlp_test_predictions)

    
    return lm_test_r2, kn_test_score, k, mlp_test_score

Function arguments (k, lay1, lay2):
- k = Number of neighbours for KNeighbors Regressor
- lay1,lay2 = up to 2 hidden layers of n1 and n2 neurons for MLP Regressor


In [27]:
# Applying the function

lm_test_r2, kn_test_score, k, mlp_test_score = models_train_test(10,50,50)



In [28]:
print("KNeighbour TEST score: {} (k set to {}).\nLinear Regression R2 TEST score: {}.\nMLP Regression R2 TEST score {})".format(kn_test_score, k, lm_test_r2, mlp_test_score))

KNeighbour TEST score: 0.676521431562684 (k set to 10).
Linear Regression R2 TEST score: 0.7719599560601261.
MLP Regression R2 TEST score 0.835306202543953)


In [None]:
# Working in new function

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [38]:
models = ["Linear Regression", "K-Neighbors Regressor", "MLP Regressor"]

In [39]:
def models_train_test(models):
    
    # Regression models
    
    models = ["Linear Regression", "K-Neighbors Regressor", "MLP Regressor"]
    
    # Calling models from models list
    
    for name in models:
        if name == "Linear Regression":
            model = LinearRegression() #linear_model.LinearRegression()
        elif name == "K-Neighbors Regressor":
            model = KNeighborsRegressor()
        elif name == "MLP Regressor":
            model = MLPRegressor()
    
    # Training the model
    
        model.fit(X_train_processed, y_train)
    
    # Generating predictions
    
        test_predictions = model.predict(X_test_processed)
    
    # Testing model's predictons
    
        KN_score = []
        LR_score = []
        MLP_score = []
    
        for name in models:
            if name == "K-Neighbors Regressor":
                KN_test_score = model.score(X_test_processed, y_test)
                KN_score.append(KN_test_score)
                print(KN_score)
            elif name == "Linear Regression":
                LR_R2_score = r2_score(y_test, test_predictions)
                LR_score.append(LR_R2_score)
                print(LR_score)
            elif name == "MLP Regressor":
                MLP_R2_score = r2_score(y_test, test_predictions)
                MLP_score.append(MLP_R2_score)
                print(MLP_score)
    
    #print("KN test score: {}\nLR R2 score: {}\nMLP R2 score: {}".format(KN_test_score, LR_R2_score, MLP_R2_score))
    print("KN test score: {}\nLR R2 score: {}\nMLP R2 score: {}".format(KN_score, LR_score, MLP_score))



In [40]:
models_train_test(models)

[0.7719599560601261]
[0.7719599560601261]
[0.7719599560601261]
[0.6624372853514999]
[0.6624372853514999]
[0.6624372853514999]
[0.8219983485000456]
[0.8219983485000456]
[0.8219983485000456]
KN test score: [0.8219983485000456]
LR R2 score: [0.8219983485000456]
MLP R2 score: [0.8219983485000456]




#### 10. Check and discuss the results.

Models used to predict our target ("Total Claim Amount") are: **Linear Regression (LR), Kneighbor Regressor (KNR) and MLP Regression (MLP)**.

**MLP** shows the highest score (0.82) in its predictions with test data as this method receives feedback on the errors in its results and adjusts its weights accordingly to make more accurate predictions.

**KNR** has been tested with 10 and 25 neighbours, getting scores 0.64 and 0.62, respectively.

**LR** R2 score test score is 0.75.

The overall scores obtained are quite good but still can be improved.

For now, the **function** is simple but it allows to try different alternatives on neighbours (for KNR) and on hidden layers (for MLP). 

I've tried to include some other optional arguments to make it even more customized, but I haven't succeeded using ** kwargs as new parameters.

About **ordinal encoding**, in the previous exercise I've already explained why I haven't applied it on my dataframe, and after compare score results with some teammates that proceeded as suggested in the instructions, we have seen that we actually got better results without encoding them as ordinals.





In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

def train_and_test_models(X, y, models, test_size=0.2, k_neighbors=None, layers=None):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    results = {}
    for name, model in models:
        if name == "K-Neighbors Regressor" and k_neighbors is not None:
            model.set_params(n_neighbors=k_neighbors)
        elif name == "MLP Regressor" and layers is not None:
            model.set_params(hidden_layer_sizes=layers)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        results[name] = score
        print(f"{name}: {score}")
    return results

In [None]:
models = [("Linear Regression", LinearRegression()),
          ("K-Neighbors Regressor", KNeighborsRegressor()),
          ("MLP Regressor", MLPRegressor())]

#results = train_and_test_models(X, y, models, test_size=0.3, k_neighbors=5, layers=(10,))


In [None]:
# We got X_train_processed, X_test_processed, y_train, y_test

In [None]:
models = [("Linear Regression", LinearRegression()),
          ("K-Neighbors Regressor", KNeighborsRegressor()),
          ("MLP Regressor", MLPRegressor())]

In [None]:
def models_train_test(models):
    
    if name, models in models:
        if name == "Linear Regression":
            model = linear_model.LinearRegression()
        elif name == "K-Neighbors Regressor":
            model = KNeighborsRegressor()
        elif name == "MLP Regressor":
            model = MLPRegressor()
            
    model.fit(X_train_processed, y_train)
    
    test_predictions = model.predict(X_test_processed)
    
    predict_test_r2_score = r2
            
    
    # Models
    lm = linear_model.LinearRegression()
    knm = KNeighborsRegressor(n_neighbors=k)
    mlp = MLPRegressor(hidden_layer_sizes=(lay_1,lay_2))
    
    # Fitting and training the models
    lm.fit(X_train_processed,y_train)
    knm.fit(X_train_processed, y_train)
    mlp.fit(X_train_processed, y_train)
    
    # Predictions on TEST data
    lm_test_predictions = lm.predict(X_test_processed)
    kn_test_predictions = knm.predict(X_test_processed)
    mlp_test_predictions = mlp.predict(X_test_processed)
    
    # Score metrics
    lm_test_r2 = r2_score(y_test, lm_test_predictions)
    kn_test_score = knm.score(X_test_processed, y_test)
    mlp_test_score = r2_score(y_test,mlp_test_predictions)
    
    return lm_test_r2, kn_test_score, k, mlp_test_score