# Lab | comparing regression models

#### 1. Load the dataset and explore the variables.

In [1]:
import os
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import pandas as pd
import earthpy as et
import numpy as np

In [2]:
# Load the red-wine quality data set; the file uses ';' as its field separator.
df = pd.read_csv(r'winequality-red.csv', sep=';')

#### In this final lab of the week, you will model your data using two different models: Linear Regression and KNN.
#### The purpose is to get used to running at least two candidate analysis models 
#### in one notebook and to observe how the choice of model ultimately impacts the outcome / results.

#### Following EDA, basic cleaning steps, feature engineering and
#### preprocessing, separate the data into features and label, then split the rows into train and test sets.

#### -- explore dataframe --

In [3]:
# Display the frame as the cell output (pandas abbreviates it to the first and last rows).
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


#### -- basic cleaning steps --

In [4]:
# Inspect column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
# Boolean mask over all columns: True for every repeat of a row that already
# appeared earlier (the first occurrence itself stays False).
df.duplicated()

0       False
1       False
2       False
3       False
4        True
        ...  
1594    False
1595    False
1596     True
1597    False
1598    False
Length: 1599, dtype: bool

In [6]:
# Number of rows that are exact repeats of an earlier row.
df.duplicated().sum()

240

In [7]:
# keep=False flags every member of a duplicated group, so each repeated row
# is shown together with its first occurrence.
is_repeated = df.duplicated(keep=False)
df.loc[is_repeated]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
9,7.5,0.500,0.36,6.1,0.071,17.0,102.0,0.99780,3.35,0.80,10.5,5
11,7.5,0.500,0.36,6.1,0.071,17.0,102.0,0.99780,3.35,0.80,10.5,5
22,7.9,0.430,0.21,1.6,0.106,10.0,37.0,0.99660,3.17,0.91,9.5,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1567,7.2,0.695,0.13,2.0,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,5
1579,6.2,0.560,0.09,1.7,0.053,24.0,32.0,0.99402,3.54,0.60,11.3,5
1581,6.2,0.560,0.09,1.7,0.053,24.0,32.0,0.99402,3.54,0.60,11.3,5
1592,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6


In [8]:
# Drop exact duplicate rows and rebind df to the de-duplicated frame;
# the surviving rows keep their original index labels at this point.
df = df.drop_duplicates()

In [9]:
# Renumber the rows 0..n-1 after de-duplication. Rebinding the returned frame
# (instead of inplace=True) avoids mutating an object in place across cells.
df = df.reset_index(drop=True)

In [10]:
# Re-inspect the frame after de-duplication to confirm the new row count and index range.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359 entries, 0 to 1358
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1359 non-null   float64
 1   volatile acidity      1359 non-null   float64
 2   citric acid           1359 non-null   float64
 3   residual sugar        1359 non-null   float64
 4   chlorides             1359 non-null   float64
 5   free sulfur dioxide   1359 non-null   float64
 6   total sulfur dioxide  1359 non-null   float64
 7   density               1359 non-null   float64
 8   pH                    1359 non-null   float64
 9   sulphates             1359 non-null   float64
 10  alcohol               1359 non-null   float64
 11  quality               1359 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 127.5 KB


In [11]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


#### We will try to predict the variable `quality` using a linear regression on all other variables.

In [12]:
# define X and y

In [13]:
# Target: the quality score. Features: every remaining column.
y = df['quality']
X = df.drop(columns=['quality'])

#### Try a simple linear regression model with all the data 
#### to see whether we are getting good results on the Test data set. 

### import and apply model

In [14]:
from sklearn.linear_model import LinearRegression as Lin

In [15]:
from sklearn.model_selection import train_test_split as tts

In [16]:
# Instantiate the estimator with its default settings
# (Lin is the alias under which LinearRegression was imported).
linreg=Lin()

In [17]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.3,
    random_state=40,
)

In [18]:
# Fit the linear model on the training split only; the test split is not used here.
linreg.fit(X_train,y_train)

LinearRegression()

In [19]:
# Predict quality for the held-out test rows.
y_pred=linreg.predict(X_test)

### compare and evaluate model

In [20]:
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [21]:
# R^2 on the test split: the closer to 1, the more of the variance in
# quality the model explains.
r2(y_true=y_test, y_pred=y_pred)

0.3323269588811536

In [22]:
# Mean squared error on the test split (in squared quality points; lower is better).
mse(y_true=y_test, y_pred=y_pred)

0.4643517115797475

In [23]:
# Mean absolute error on the test split: on average the predicted quality
# is off by roughly half a point.
mae(y_true=y_test, y_pred=y_pred)

0.5339552554735167

### Now define a function that takes a list of models and trains (and tests) them
### so we can try a lot of them without repeating code.

### Use the function to run LinearRegression and KNeighborsRegressor.
### You can optionally include other regression models for this task,
### such as the Decision Tree Regressor, Bayesian Linear Regressor, or SVR.

### Create a pandas data frame which contains your actual labels and your predicted values
### for each model — this can be on Test or your full X.

### Create a second pandas data frame containing the name of the model 
### and the metrics for each model for train and test accuracy

# EITHER

### Export both data frames as csvs, then connect to the csvs in Tableau and using this data, compile one or 
### two charts in Tableau which summarise your results from the different models and submit the tableau public url as your lab 

# OR 

### Visualise these data frames to summarise your results using seaborn/matplotlib/plotly/other in Python 
### and submit the url of the jupyter notebook as your lab