# Ridge and Lasso Regression

## Import the required Libraries

In [1]:
import numpy as np
import pandas as pd

## Import the dataset

In [2]:
df=pd.read_csv('50_Startups.csv')

In [3]:
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.20,136897.80,471784.10,New York,192261.83
1,162597.70,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
...,...,...,...,...,...
103,119943.24,156547.42,256512.92,Florida,132602.65
104,114523.61,122616.84,261776.23,New York,129917.04
105,78013.11,121597.55,264346.06,California,126992.93
106,94657.16,145077.58,282574.31,New York,125370.37


In [4]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
df.shape

(108, 5)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        108 non-null    float64
 1   Administration   108 non-null    float64
 2   Marketing Spend  108 non-null    float64
 3   State            108 non-null    object 
 4   Profit           108 non-null    float64
dtypes: float64(4), object(1)
memory usage: 4.3+ KB


## Label Encoding

In [7]:
#Label Encoding
from sklearn.preprocessing import LabelEncoder


In [8]:
le=LabelEncoder()

In [9]:
df['State']=le.fit_transform(df['State'])

In [10]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


## Split data into dependent and independent data

In [11]:
x=df.drop(columns=['Profit'])

In [12]:
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0
2,153441.51,101145.55,407934.54,1
3,144372.41,118671.85,383199.62,2
4,142107.34,91391.77,366168.42,1


In [13]:
y=df.Profit

In [14]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

## Normalization using MinMaxScaler

In [15]:
from sklearn.preprocessing import MinMaxScaler

In [16]:
scale=MinMaxScaler()

In [17]:
scaled_x=pd.DataFrame(scale.fit_transform(x))

In [18]:
scaled_x

Unnamed: 0,0,1,2,3
0,1.000000,0.651744,1.000000,1.0
1,0.983359,0.761972,0.940893,0.0
2,0.927985,0.379579,0.864664,0.5
3,0.873136,0.512998,0.812235,1.0
4,0.859438,0.305328,0.776136,0.5
...,...,...,...,...
103,0.725394,0.801327,0.543708,0.5
104,0.692617,0.543030,0.554864,1.0
105,0.471808,0.535270,0.560312,0.0
106,0.572468,0.714013,0.598948,1.0


## Split data into training and test data

In [19]:
#Train and Test Split
from sklearn.model_selection import train_test_split

In [20]:
x_train,x_test,y_train,y_test=train_test_split(scaled_x,y,test_size=0.2,random_state=0)

In [21]:
x_train.shape

(86, 4)

In [22]:
x_test.shape

(22, 4)

## Import Ridge and Lasso

In [23]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [24]:
r=Ridge()
l=Lasso()

In [25]:
r.fit(x_train,y_train)

In [26]:
l.fit(x_train,y_train)

In [27]:
pred1=r.predict(x_test)
pred2=l.predict(x_test)

In [28]:
pred1

array([ 54556.32416702, 130017.92166782,  84687.15947095, 173295.2223158 ,
       108917.94957822, 128735.89224253, 128736.35934265, 155951.19177423,
       117814.48562718,  52712.59507338, 102790.3781561 , 119096.2726001 ,
        54556.32416702, 124206.72612243,  88379.01243395, 126261.35613731,
       126261.35613731,  98802.1865801 ,  74278.88209886, 141546.67661999,
       145564.21281487, 150251.73759042])

In [29]:
pred2

array([ 48384.86814735, 134845.52354938,  76486.64641608, 181551.13594979,
       112961.07382208, 134236.64101991, 129218.98004997, 160017.16104325,
       116754.23112994,  46273.04713164, 102272.49339834, 115567.13437352,
        48384.86814735, 119116.48630482,  88593.22703248, 127104.80005829,
       127104.80005829,  90948.41312188,  58678.78647171, 146299.80323437,
       149413.8490298 , 152502.10158276])

## Metrics to find model accuracy

In [30]:
from sklearn import metrics

### MSE (Mean Square Error)

In [31]:
print(metrics.mean_squared_error(y_test,pred1))
print(metrics.mean_squared_error(y_test,pred2))

117186385.76630396
96005734.13154325


### RMSE(Root Mean Square Error)

In [32]:
print(np.sqrt(metrics.mean_squared_error(y_test,pred1)))
print(np.sqrt(metrics.mean_squared_error(y_test,pred2)))

10825.266082933202
9798.251585438253


### R Squared

In [33]:
print(metrics.r2_score(y_test,pred1))
print(metrics.r2_score(y_test,pred2))

0.9095565216441845
0.9259035724996549


As r2 Score is higher for lasso regression we will use Lasso Regression ML Model for Deployment.

# Logistic regression

## Import the required Libraries

In [34]:
import numpy as np
import pandas as pd

## Import the dataset

In [35]:
df=pd.read_csv('Social_Network_Ads.csv')

In [36]:
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [37]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [38]:
df.shape

(400, 5)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


## Label Encoding

In [40]:
from sklearn.preprocessing import LabelEncoder

In [41]:
le=LabelEncoder()

In [42]:
df['Gender']=le.fit_transform(df['Gender'])

In [43]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


## Split data into dependent and independent data

In [44]:
x=df.drop(columns=['Purchased'])

In [45]:
x.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15624510,1,19,19000
1,15810944,1,35,20000
2,15668575,0,26,43000
3,15603246,0,27,57000
4,15804002,1,19,76000


In [46]:
y=df.Purchased

In [47]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

## Normalization using MinMaxScaler

In [48]:
from sklearn.preprocessing import MinMaxScaler

In [49]:
scale=MinMaxScaler()

In [50]:
scaled_x=pd.DataFrame(scale.fit_transform(x))

In [51]:
scaled_x

Unnamed: 0,0,1,2,3
0,0.232636,1.0,0.023810,0.029630
1,0.982732,1.0,0.404762,0.037037
2,0.409926,0.0,0.190476,0.207407
3,0.147083,0.0,0.214286,0.311111
4,0.954801,1.0,0.023810,0.451852
...,...,...,...,...
395,0.503623,0.0,0.666667,0.192593
396,0.560787,1.0,0.785714,0.059259
397,0.352477,0.0,0.761905,0.037037
398,0.757720,1.0,0.428571,0.133333


## Split data into training and test data

In [52]:
#Train and Test Split
from sklearn.model_selection import train_test_split

In [53]:
x_train,x_test,y_train,y_test=train_test_split(scaled_x,y,test_size=0.2,random_state=0)

In [54]:
x_train.shape

(320, 4)

In [55]:
x_test.shape

(80, 4)

## Import Ridge and Lasso

In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
logreg = LogisticRegression()

In [58]:
logreg.fit(x_train, y_train)

In [59]:
pred=logreg.predict(x_test)

In [60]:
pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1])

## Metrics to find model accuracy

In [65]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Acccuracy score

In [66]:
print(accuracy_score(y_test,pred))

0.9375


### Confusion matrix

In [68]:
print(confusion_matrix(y_test,pred))

[[58  0]
 [ 5 17]]


### Classification report

In [70]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        58
           1       1.00      0.77      0.87        22

    accuracy                           0.94        80
   macro avg       0.96      0.89      0.92        80
weighted avg       0.94      0.94      0.93        80

