## Agenda:

- Discuss case studies of linear regression on different datasets

# Load the standard libraries

In [1]:
# Libraries imported for data handling and plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation warnings from pandas/scikit-learn; consider narrowing
# the filter to specific categories.
warnings.filterwarnings('ignore')

# Load the dataset

In [2]:
# Read the cereal table from the working directory and preview the first five rows.
# NOTE(review): the preview shows potass = -1 for one row, which looks like a
# placeholder for a missing value rather than a measurement -- confirm against
# the data source.
data = pd.read_csv("cereal.csv")
data.head(5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [3]:
# (rows, columns) of the loaded table.
data.shape

(77, 16)

# Data Preprocessing 

In [4]:
# Column names, non-null counts and dtypes of the raw table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      77 non-null     object 
 1   mfr       77 non-null     object 
 2   type      77 non-null     object 
 3   calories  77 non-null     int64  
 4   protein   77 non-null     int64  
 5   fat       77 non-null     int64  
 6   sodium    77 non-null     int64  
 7   fiber     77 non-null     float64
 8   carbo     77 non-null     float64
 9   sugars    77 non-null     int64  
 10  potass    77 non-null     int64  
 11  vitamins  77 non-null     int64  
 12  shelf     77 non-null     int64  
 13  weight    77 non-null     float64
 14  cups      77 non-null     float64
 15  rating    77 non-null     float64
dtypes: float64(5), int64(8), object(3)
memory usage: 9.8+ KB


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the minimum of carbo, sugars and potass is -1, which is not a
# plausible nutritional value; it is likely a missing-value placeholder -- confirm.
data.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.597403,6.922078,96.077922,28.246753,2.207792,1.02961,0.821039,42.665705
std,19.484119,1.09479,1.006473,83.832295,2.383364,4.278956,4.444885,71.286813,22.342523,0.832524,0.150477,0.232716,14.047289
min,50.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,1.0,0.5,0.25,18.042851
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,40.0,25.0,1.0,1.0,0.67,33.174094
50%,110.0,3.0,1.0,180.0,2.0,14.0,7.0,90.0,25.0,2.0,1.0,0.75,40.400208
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0,25.0,3.0,1.0,1.0,50.828392
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0,3.0,1.5,1.5,93.704912


In [6]:
# Count NaN entries per column. This only detects real NaN values; placeholder
# values stored as numbers (e.g. -1) are not counted here.
data.isna().sum()

name        0
mfr         0
type        0
calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       0
sugars      0
potass      0
vitamins    0
shelf       0
weight      0
cups        0
rating      0
dtype: int64

In [7]:
# Distinct values of the cereal name column.
data['name'].unique()

array(['100% Bran', '100% Natural Bran', 'All-Bran',
       'All-Bran with Extra Fiber', 'Almond Delight',
       'Apple Cinnamon Cheerios', 'Apple Jacks', 'Basic 4', 'Bran Chex',
       'Bran Flakes', "Cap'n'Crunch", 'Cheerios', 'Cinnamon Toast Crunch',
       'Clusters', 'Cocoa Puffs', 'Corn Chex', 'Corn Flakes', 'Corn Pops',
       'Count Chocula', "Cracklin' Oat Bran", 'Cream of Wheat (Quick)',
       'Crispix', 'Crispy Wheat & Raisins', 'Double Chex', 'Froot Loops',
       'Frosted Flakes', 'Frosted Mini-Wheats',
       'Fruit & Fibre Dates; Walnuts; and Oats', 'Fruitful Bran',
       'Fruity Pebbles', 'Golden Crisp', 'Golden Grahams',
       'Grape Nuts Flakes', 'Grape-Nuts', 'Great Grains Pecan',
       'Honey Graham Ohs', 'Honey Nut Cheerios', 'Honey-comb',
       'Just Right Crunchy  Nuggets', 'Just Right Fruit & Nut', 'Kix',
       'Life', 'Lucky Charms', 'Maypo',
       'Muesli Raisins; Dates; & Almonds',
       'Muesli Raisins; Peaches; & Pecans', 'Mueslix Crispy Blend',
  

## Since the `name` column is not useful for predicting cereal ratings, it can safely be dropped from the data

In [8]:
# Remove the cereal name column; it is not used as a predictor.
data = data.drop(columns=['name'])
data.head()

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [9]:
# Distinct manufacturer codes.
data['mfr'].unique()

array(['N', 'Q', 'K', 'R', 'G', 'P', 'A'], dtype=object)

In [10]:
# Distinct values of the type column.
data['type'].unique()

array(['C', 'H'], dtype=object)

## The `mfr` and `type` columns are categorical with no natural ordering, so we apply one-hot encoding

In [11]:
# One-hot encode the two categorical columns into indicator columns.
# NOTE(review): every category gets its own column, so the indicators of each
# variable sum to 1 across a row and are collinear with the intercept;
# consider drop_first=True if coefficients are to be interpreted.
data_ohe = pd.get_dummies(data=data[['mfr', 'type']])
data_ohe

Unnamed: 0,mfr_A,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
0,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,1,0,1,0
2,0,0,1,0,0,0,0,1,0
3,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
72,0,1,0,0,0,0,0,1,0
73,0,1,0,0,0,0,0,1,0
74,0,0,0,0,0,0,1,1,0
75,0,1,0,0,0,0,0,1,0


In [12]:
# Append the indicator columns to the right of the existing columns,
# aligning rows on the shared index.
data = pd.concat(objs=[data, data_ohe], axis='columns')
data.head()

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,...,rating,mfr_A,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
0,N,C,70,4,1,130,10.0,5.0,6,280,...,68.402973,0,0,0,1,0,0,0,1,0
1,Q,C,120,3,5,15,2.0,8.0,8,135,...,33.983679,0,0,0,0,0,1,0,1,0
2,K,C,70,4,1,260,9.0,7.0,5,320,...,59.425505,0,0,1,0,0,0,0,1,0
3,K,C,50,4,0,140,14.0,8.0,0,330,...,93.704912,0,0,1,0,0,0,0,1,0
4,R,C,110,2,2,200,1.0,14.0,8,-1,...,34.384843,0,0,0,0,0,0,1,1,0


In [13]:
# The original categorical columns are now redundant with their indicator columns.
data = data.drop(columns=['mfr', 'type'])
data.head()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,...,rating,mfr_A,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
0,70,4,1,130,10.0,5.0,6,280,25,3,...,68.402973,0,0,0,1,0,0,0,1,0
1,120,3,5,15,2.0,8.0,8,135,0,3,...,33.983679,0,0,0,0,0,1,0,1,0
2,70,4,1,260,9.0,7.0,5,320,25,3,...,59.425505,0,0,1,0,0,0,0,1,0
3,50,4,0,140,14.0,8.0,0,330,25,3,...,93.704912,0,0,1,0,0,0,0,1,0
4,110,2,2,200,1.0,14.0,8,-1,25,3,...,34.384843,0,0,0,0,0,0,1,1,0


## Feature Scaling on the data

In [14]:
# Scaler from scikit-learn, created with its default settings; it is fitted later.
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()
rs

In [23]:
# Re-check dtypes after encoding: the indicator columns have a different dtype
# from the original numeric columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 22 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calories  77 non-null     int64  
 1   protein   77 non-null     int64  
 2   fat       77 non-null     int64  
 3   sodium    77 non-null     int64  
 4   fiber     77 non-null     float64
 5   carbo     77 non-null     float64
 6   sugars    77 non-null     int64  
 7   potass    77 non-null     int64  
 8   vitamins  77 non-null     int64  
 9   shelf     77 non-null     int64  
 10  weight    77 non-null     float64
 11  cups      77 non-null     float64
 12  rating    77 non-null     float64
 13  mfr_A     77 non-null     uint8  
 14  mfr_G     77 non-null     uint8  
 15  mfr_K     77 non-null     uint8  
 16  mfr_N     77 non-null     uint8  
 17  mfr_P     77 non-null     uint8  
 18  mfr_Q     77 non-null     uint8  
 19  mfr_R     77 non-null     uint8  
 20  type_C    77 non-null     uint8  


In [30]:
# Hand-written list of the non-indicator columns (the target 'rating' is included).
# This value is overwritten further down by the list derived from select_dtypes.
num_cols = [
    'calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars',
    'potass', 'vitamins', 'shelf', 'weight', 'cups', 'rating',
]
num_cols

['calories',
 'protein',
 'fat',
 'sodium',
 'fiber',
 'carbo',
 'sugars',
 'potass',
 'vitamins',
 'shelf',
 'weight',
 'cups',
 'rating']

In [27]:
# Keep only the int/float columns; the one-hot indicator columns have a
# different dtype and are therefore excluded.
numerical_cols = data.select_dtypes(include=['int', 'float'])
numerical_cols.columns

Index(['calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars',
       'potass', 'vitamins', 'shelf', 'weight', 'cups', 'rating'],
      dtype='object')

In [29]:
# Plain Python list of the selected column names, used to index `data` for scaling.
num_cols = list(numerical_cols.columns)
num_cols

['calories',
 'protein',
 'fat',
 'sodium',
 'fiber',
 'carbo',
 'sugars',
 'potass',
 'vitamins',
 'shelf',
 'weight',
 'cups',
 'rating']

In [33]:
# Fit the scaler on the selected columns, then replace them with their scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the scaling; num_cols also contains the target
# 'rating', so predictions and y are in scaled units. Consider fitting on the
# training rows only and leaving the target unscaled.
rs.fit(data[num_cols])
data[num_cols] = rs.transform(data[num_cols])
data.head()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,...,rating,mfr_A,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
0,-4.0,1.0,0.0,-0.625,4.0,-1.8,-0.125,2.375,0.0,0.5,...,1.586173,0,0,0,1,0,0,0,1,0
1,1.0,0.0,2.0,-2.0625,0.0,-1.2,0.125,0.5625,-25.0,0.5,...,-0.363454,0,0,0,0,0,1,0,1,0
2,-4.0,1.0,0.0,1.0,3.5,-1.4,-0.25,2.875,0.0,0.5,...,1.077658,0,0,1,0,0,0,0,1,0
3,-6.0,1.0,-0.5,-0.5,6.0,-1.2,-0.875,3.0,0.0,0.5,...,3.019361,0,0,1,0,0,0,0,1,0
4,0.0,-1.0,0.5,0.25,-0.5,0.0,0.125,-1.1375,0.0,0.5,...,-0.340731,0,0,0,0,0,0,1,1,0


## Separate the data into X and y

In [35]:
# Target is the (scaled) rating column; every other column is a feature.
y = data['rating']
X = data.drop(columns=['rating'])

## Split the data into train and test sets

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Apply Linear Regression on X_train and y_train

In [38]:
# Linear regression estimator from scikit-learn, created with default settings.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

In [39]:
# Fit the model on the training split only.
lr.fit(X_train, y_train)

## Perform predictions

In [41]:
# Predict the held-out rows. Values are in scaled rating units because 'rating'
# was among the columns passed through the scaler.
y_pred = lr.predict(X_test)
y_pred

array([-0.5933922 ,  0.53172531, -0.03946962,  1.15302821,  0.30652639,
        1.01646259,  1.07417387,  0.73471058, -0.354613  , -0.08838996,
        0.02927554,  0.8458526 ,  3.01936132, -0.77407171,  0.81853773,
       -0.19041518, -0.23926252,  0.27913124,  1.0899685 , -0.21957966])

## Perform Evaluation

In [42]:
from sklearn.metrics import r2_score

# Coefficient of determination on the held-out rows.
# NOTE(review): a score this close to 1.0 is unusual for real data; it may mean
# the rating is an (almost) exact linear function of the features -- worth
# confirming before drawing conclusions about model quality.
r2_score(y_true=y_test, y_pred=y_pred)

0.9999999999999993