In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("covid_toy.csv")

In [4]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [5]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [6]:
from sklearn.impute import SimpleImputer

In [7]:
si = SimpleImputer()

In [8]:
df['fever'] = si.fit_transform(df[['fever']])

In [9]:
df.isnull().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [11]:
df['gender'] = lb.fit_transform(df['gender'])
df['city'] = lb.fit_transform(df['city'])
df['cough'] = lb.fit_transform(df['cough'])
df['has_covid'] = lb.fit_transform(df['has_covid'])

In [12]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,2,0
1,27,1,100.0,0,1,1
2,42,1,101.0,0,1,0
3,31,0,98.0,0,2,0
4,65,0,101.0,0,3,0


In [13]:
# drop the has covid column
x= df.drop(columns = ['has_covid'])
y = df['has_covid']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2 , random_state =42)

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
lr = LogisticRegression()

In [18]:

lr.fit(x_train, y_train)

In [19]:
y_pred = lr.predict(x_test)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
accuracy_score(y_test , y_pred)

0.75

## Hyperparameter_tuning

In [22]:
param_grid = [
    {'penalty' : ['l1','l2','elasticnet','none'],
     'C':np.logspace(-4,4,20),
     'solver' : ['lbfgs' , 'newton-cg' ,'liblinear' , 'sag', 'saga'],
     'max_iter' : [100,1000,2500,5000]}
]

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
clf = GridSearchCV(lr , param_grid= param_grid , cv=3 , verbose=True , n_jobs=-1)

In [25]:
best_clf = clf.fit(x_train, y_train)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


3120 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  F

In [26]:
best_clf.best_estimator_

In [27]:
print("accuracy " , f"{best_clf.score(x_train, y_train):.3f}")

accuracy  0.550


In [28]:
#  titanic 

In [29]:
df = pd.read_csv("titanic.csv")

In [30]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [31]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [32]:
df = df.drop(columns = ['Name' ,'PassengerId' , 'Ticket' , 'Cabin'])

In [33]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [34]:
df.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [35]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
df['Sex'] = lb.fit_transform(df['Sex'])
df['Embarked'] = lb.fit_transform(df['Embarked'])

In [36]:
from sklearn.impute import SimpleImputer
si=SimpleImputer()
df['Age']=si.fit_transform(df[['Age']])

In [37]:
df['Fare']=si.fit_transform(df[['Fare']])

In [38]:
df.head(3)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,34.5,0,0,7.8292,1
1,1,3,0,47.0,1,0,7.0,2
2,0,2,1,62.0,0,0,9.6875,1


In [39]:
from sklearn.model_selection import train_test_split
x=df.drop(columns=['Survived'])
y=df['Survived']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [40]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(x_train,y_train)

In [41]:
y_preidict=lr.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_preidict)
# accuracy 

1.0

In [42]:
# fit accuracy 
param_grid=[
    {'penalty':['l1','l2','elasticnet','none'],
     'C': np.logspace(-4,4,20),
     'solver':['lbfs','newton-cg','liblinear','sag','saga'],
     'max_iter':[100,1000,2500,5000]
     }
]

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
clf=GridSearchCV(lr,param_grid=param_grid, cv=3, verbose=True, n_jobs=-1)


In [45]:
best_clf =clf.fit(x_train,y_train)


Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


3360 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils

In [46]:
best_clf.best_estimator_

In [47]:
print(f'Accuracy -: {best_clf.score(x_train,y_train):.3f}')

Accuracy -: 1.000


In [48]:
# perform this on ldata set on class room
# tarfet  = purchase
# logistic 
# 

In [49]:
df = pd.read_csv('Social_Network_Ads.csv')

In [50]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [51]:
df.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [52]:
from sklearn.preprocessing import LabelEncoder

In [53]:
lb = LabelEncoder()

In [54]:
df['Gender']  = lb.fit_transform(df[['Gender']])
df['Age'] = lb.fit_transform(df[['Age']])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [55]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,1,19000,0
1,15810944,1,17,20000,0
2,15668575,0,8,43000,0
3,15603246,0,9,57000,0
4,15804002,1,1,76000,0


In [56]:

# drop the has covid column
x= df.drop(columns = ['Purchased'])
y = df['Purchased']

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2 , random_state =42)

In [59]:
from sklearn.linear_model import LogisticRegression

In [60]:
lr = LogisticRegression()

In [61]:
lr.fit(x_train, y_train)

In [62]:
y_pred = lr.predict(x_test)

In [63]:
from sklearn.metrics import accuracy_score

In [64]:
accuracy_score(y_test , y_pred)

0.8875

In [65]:
param_grid = [
    {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
     'C': np.logspace(-4, 4, 20),
     'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
     'max_iter': [100, 1000, 2500, 5000]}
]

In [66]:
from sklearn.model_selection import GridSearchCV

In [67]:
clf = GridSearchCV(lr , param_grid= param_grid , cv=3 , verbose=True , n_jobs=-1)

In [68]:
best_clf = clf.fit(x_train, y_train)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


3120 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\AnacondaNavfile\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  F

In [69]:
best_clf.best_estimator_

In [70]:
print(f'Accuracy -: {best_clf.score(x_train,y_train):.3f}')

Accuracy -: 0.831
