In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fills missing values in numerical columns
from sklearn.impute import SimpleImputer

# Encodes nominal (unordered) categorical columns as one-hot indicator columns
from sklearn.preprocessing import OneHotEncoder

# Encodes ordinal (ordered) categorical columns as integer codes
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load the toy COVID dataset (path is relative to the notebook's working directory)
df = pd.read_csv("covid_toy.csv")

In [4]:
# Inspect 5 randomly chosen rows rather than only the first rows of the file
df.sample(5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
37,55,Male,100.0,Mild,Kolkata,No
69,73,Female,103.0,Mild,Delhi,No
53,83,Male,98.0,Mild,Delhi,Yes
35,82,Female,102.0,Strong,Bangalore,No
38,49,Female,101.0,Mild,Delhi,Yes


In [5]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [6]:
# Count missing values per column
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [7]:
# Frequency of each distinct value in the "cough" column
df.cough.value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [8]:
# Frequency of each distinct value in the "gender" column
df["gender"].value_counts()

gender
Female    59
Male      41
Name: count, dtype: int64

In [9]:
# Frequency of each distinct value in the "city" column
df["city"].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; "has_covid" is the target, every other column is a feature.
# A fixed random_state makes the split (and every downstream output) reproducible on
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]),
    df["has_covid"],
    test_size=0.2,
    random_state=42,
)

In [11]:
# Preview the training features
X_train

Unnamed: 0,age,gender,fever,cough,city
36,38,Female,101.0,Mild,Bangalore
41,82,Male,,Mild,Kolkata
34,74,Male,102.0,Mild,Mumbai
27,33,Female,102.0,Strong,Delhi
83,17,Female,104.0,Mild,Kolkata
...,...,...,...,...,...
37,55,Male,100.0,Mild,Kolkata
72,83,Female,101.0,Mild,Kolkata
2,42,Male,101.0,Mild,Delhi
33,26,Female,98.0,Mild,Kolkata


In [12]:
# Preview the held-out test features
X_test

Unnamed: 0,age,gender,fever,cough,city
32,34,Female,101.0,Strong,Delhi
12,25,Female,99.0,Strong,Kolkata
69,73,Female,103.0,Mild,Delhi
38,49,Female,101.0,Mild,Delhi
99,10,Female,98.0,Strong,Kolkata
30,15,Male,101.0,Mild,Delhi
52,47,Female,100.0,Strong,Bangalore
43,22,Female,99.0,Mild,Bangalore
74,34,Female,104.0,Strong,Delhi
4,65,Female,101.0,Mild,Mumbai


# Aam Zindagi
`SimpleImputer`
class sklearn.impute.SimpleImputer(*, missing_values=nan, strategy='mean', fill_value=None, copy=True, add_indicator=False, keep_empty_features=False)[source]
```
Univariate imputer for completing missing values with simple strategies.
```
Replace missing values using a descriptive statistic (e.g. mean, median, or most frequent) along each column, or using a constant value.

In [13]:
# Simple imputer "fever"
# SimpleImputer() uses its default strategy ("mean") to fill the missing fever values.

si = SimpleImputer()
# Learn the fill value from the training split only, then apply it.
X_train_fever = si.fit_transform(X_train[["fever"]])

# Reuse the statistic learned on the training split. Refitting on the test split
# (fit_transform) would leak test information and fill with a different value.
X_test_fever = si.transform(X_test[["fever"]])

X_train_fever.shape

(80, 1)

In [17]:
# Ordinal Encoding "cough" - Ordinal Categorical

# The explicit category list fixes the order: "Mild" -> 0, "Strong" -> 1.
oe = OrdinalEncoder(categories=[["Mild", "Strong"]])
X_train_cough = oe.fit_transform(X_train[["cough"]])

# Apply the already-fitted encoder to the test data; the test split must never be
# used for fitting.
X_test_cough = oe.transform(X_test[["cough"]])

X_train_cough.shape

(80, 1)

In [18]:
# One Hot Encoding "gender, city" - Nominal Categorical

# drop="first" removes one indicator column per feature to avoid redundant columns;
# sparse_output=False returns a dense NumPy array.
ohe = OneHotEncoder(drop="first", sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[["gender", "city"]])

# Encode the test split with the categories learned from the training split.
# Refitting on the test split could change which columns exist (and what they mean)
# if a category happens to be absent from it.
X_test_gender_city = ohe.transform(X_test[["gender", "city"]])

X_train_gender_city.shape

(80, 4)

In [16]:
# Keep only "age" by dropping every column that is preprocessed separately

separately_processed = ["gender", "fever", "cough", "city"]

X_train_age = X_train.drop(columns=separately_processed)
X_test_age = X_test.drop(columns=separately_processed)

X_train_age.shape

(80, 1)

In [18]:
# Stitch the separately preprocessed pieces back together column-wise.
# Column order: age, fever, gender/city one-hot columns, cough.
train_pieces = (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
test_pieces = (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)

X_train_transformed = np.concatenate(train_pieces, axis=1)
X_test_transformed = np.concatenate(test_pieces, axis=1)

X_train_transformed.shape


(80, 7)

In [19]:
# Confirm the test matrix has the same number of columns as the training matrix
X_test_transformed.shape

(20, 7)

# Mentos Zindagi

In [19]:
from sklearn.compose import ColumnTransformer

In [20]:
# Each step is (name, transformer, columns). The names ("tnf1", ...) are what a
# parameter grid would use to address an individual step.
column_steps = [
    ("tnf1", SimpleImputer(), ["fever"]),
    ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
    ("tnf3", OneHotEncoder(sparse_output=False, drop="first"), ["gender", "city"]),
]

# remainder="passthrough" keeps the columns not listed above (here: "age") unchanged
# instead of dropping them.
transformer = ColumnTransformer(
    transformers=column_steps,
    remainder="passthrough",
)

In [21]:
# Fit every step on the training split and transform it in one call
transformer.fit_transform(X_train).shape

(80, 7)

In [22]:
# Only transform the test split: the transformer was already fitted on X_train above,
# and refitting it on test data would leak test statistics into preprocessing.
transformer.transform(X_test).shape

(20, 7)

# Aamras Zindagi

In [23]:
from sklearn.compose import make_column_transformer

In [24]:
# Same preprocessing as the ColumnTransformer above, but each step is just
# (transformer, columns) -- no explicit step names are supplied.
make_col_trans = make_column_transformer(
    (SimpleImputer(), ["fever"]),
    (OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
    (OneHotEncoder(drop="first", sparse_output=False), ["gender", "city"]),
    remainder="passthrough",  # keep unlisted columns (here: "age") unchanged
)

In [25]:
# Fit every step on the training split and transform it in one call
make_col_trans.fit_transform(X_train).shape

(80, 7)

In [26]:
# Only transform the test split: the transformer was already fitted on X_train above,
# and refitting it on test data would leak test statistics into preprocessing.
make_col_trans.transform(X_test).shape

(20, 7)

`Difference b/w ColumnTransformer vs make_column_transformer`

There is no major difference between the two; they both give the same result. As you can see in the docs, `ColumnTransformer` takes a list of tuples that each include a name, while `make_column_transformer` takes tuples without a name (the names are generated automatically). The name given to each tuple is helpful when we use `GridSearchCV` or `RandomizedSearchCV`: the estimator passed to them can be a nested pipeline of transformers plus a classifier or a regressor, and if we want to give a `param_grid` to it, we refer to each step by the name of its tuple. You can see in the StackOverflow question on nested pipelines and `ColumnTransformer` in `GridSearchCV` how naming is helpful. Generally, I use `make_column_transformer` if I don't have to use `GridSearchCV`.

https://scikit-learn.org/1.5/modules/generated/sklearn.compose.make_column_transformer.html
https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

`GridSearchCV` and `RandomizedSearchCV` are both techniques used for hyperparameter tuning in machine learning models, helping to find the best combination of parameters for optimal performance.
```
GridSearchCV
```
- Exhaustively searches through a predefined set of hyperparameter values.
- Evaluates all possible combinations, making it thorough but computationally expensive.
- Best suited when the number of hyperparameters is small and the search space is manageable.

```
RandomizedSearchCV
```
- Randomly selects a subset of hyperparameter combinations to evaluate.
- More efficient than GridSearchCV when the search space is large.
- Can discover good hyperparameters faster, though it may miss the absolute best combination.

Both methods use cross-validation to ensure robust model performance. If you're working with a large dataset and many hyperparameters, `RandomizedSearchCV` is often preferred for efficiency. If you want a more exhaustive search, `GridSearchCV` is the way to go.


GridSearchCV Example

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset and hold out 20% of it for testing.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the covid_toy
# split created earlier in the notebook.
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# Candidate values: 3 settings of C x 2 kernels = 6 combinations
param_grid = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}

# Evaluate every combination with 5-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 1, 'kernel': 'linear'}
Best Accuracy: 0.9583333333333334


RandomizedSearchCV Example

In [28]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Search space: 10 evenly spaced values of C between 0.1 and 10, and 2 kernels
param_dist = {
    "C": np.linspace(0.1, 10, 10),
    "kernel": ["linear", "rbf"],
}

# Sample only n_iter=5 combinations from the search space and evaluate each with
# 5-fold cross-validation; random_state fixes which combinations are drawn.
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=5,
    scoring="accuracy",
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the best sampled combination and its mean cross-validated accuracy
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)

Best Parameters: {'kernel': 'linear', 'C': 4.5}
Best Accuracy: 0.9666666666666668
