In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
df = pd.read_csv('covid_toy.csv')
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [9]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [10]:
df['city'].value_counts()


city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [11]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [14]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [18]:
df.iloc[:,-1]

0      No
1     Yes
2      No
3      No
4      No
     ... 
95     No
96    Yes
97     No
98     No
99    Yes
Name: has_covid, Length: 100, dtype: object

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.drop(['has_covid'],axis=1),df.iloc[:,-1],random_state=42,test_size=0.2)

In [22]:
mask = df['fever'].isnull()==True
df['fever'][mask]

5    NaN
7    NaN
10   NaN
19   NaN
25   NaN
29   NaN
41   NaN
56   NaN
91   NaN
94   NaN
Name: fever, dtype: float64

In [25]:
X_train

Unnamed: 0,age,gender,fever,cough,city
55,81,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi
69,73,Female,103.0,Mild,Delhi
...,...,...,...,...,...
60,24,Female,102.0,Strong,Bangalore
71,75,Female,104.0,Strong,Delhi
14,51,Male,104.0,Mild,Bangalore
92,82,Female,102.0,Strong,Kolkata


In [29]:
X_train['fever'].ndim

1

In [44]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
dabbu = imp_mean.fit_transform(X_train[['fever']])
pd.DataFrame(dabbu,columns=['fever'])


Unnamed: 0,fever
0,101.0
1,100.0
2,100.0
3,100.0
4,103.0
...,...
75,102.0
76,104.0
77,104.0
78,102.0


In [38]:
type(dabbu)

numpy.ndarray

In [39]:
np.isnan(dabbu).any()

np.False_

In [4]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [34]:
df['fever'].value_counts()

fever
101.0    17
98.0     17
104.0    14
100.0    13
99.0     10
102.0    10
103.0     9
Name: count, dtype: int64

arr1 = np.array([1,2,3,4,5,6])


The line of code df.isnull().sum() is a common operation in data preprocessing, particularly when dealing with missing data in a DataFrame. This line performs two main actions: detecting missing values and then summing them up.

First, the isnull() method is called on the DataFrame df. This method returns a DataFrame of the same shape as df, but with boolean values indicating the presence of missing values (NA values). Each element in this boolean DataFrame is True if the corresponding element in df is missing, and False otherwise.

Next, the sum() method is called on this boolean DataFrame. The sum() method calculates the sum of True values for each column, effectively counting the number of missing values in each column of the original DataFrame df.

In [35]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

The line of code [`df.drop(columns=['has_covid'])`](command:_github.copilot.openSymbolFromReferences?%5B%22df.drop(columns%3D%5B'has_covid'%5D)%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A0%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition") is a method call on a DataFrame object named [`df`](command:_github.copilot.openSymbolFromReferences?%5B%22df%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A0%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition"). This method is used to remove the column named 'has_covid' from the DataFrame.

The [`drop`](command:_github.copilot.openSymbolFromReferences?%5B%22drop%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fpandas%2Fcore%2Fframe.pyi%22%2C%22external%22%3A%22file%3A%2F%2F%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fpandas%2Fcore%2Fframe.pyi%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fpandas%2Fcore%2Fframe.pyi%22%2C%22scheme%22%3A%22file%22%7D%2C%22pos%22%3A%7B%22line%22%3A0%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition") method in pandas is versatile and can be used to drop rows or columns from a DataFrame. In this case, the [`columns`](command:_github.copilot.openSymbolFromReferences?%5B%22columns%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23W5sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22W5sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A57%7D%7D%5D%5D "Go to definition") parameter is specified with a list containing the single column name 'has_covid'. This tells pandas to remove this column from the DataFrame.

By default, the [`drop`](command:_github.copilot.openSymbolFromReferences?%5B%22drop%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fpandas%2Fcore%2Fframe.pyi%22%2C%22external%22%3A%22file%3A%2F%2F%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fpandas%2Fcore%2Fframe.pyi%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fpandas%2Fcore%2Fframe.pyi%22%2C%22scheme%22%3A%22file%22%7D%2C%22pos%22%3A%7B%22line%22%3A0%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition") method returns a new DataFrame with the specified column removed, leaving the original DataFrame unchanged. If you want to modify the original DataFrame in place, you would need to set the `inplace` parameter to `True`.

In summary, this line of code is used to remove the 'has_covid' column from the DataFrame [`df`](command:_github.copilot.openSymbolFromReferences?%5B%22df%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A0%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition"). This is useful for data preprocessing, especially when certain columns are no longer needed for analysis or modeling.

In [36]:
import numpy as np
from sklearn.model_selection import train_test_split

# Maan lo ye humara data hai
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y = np.array([0, 1, 0, 1, 1])

# Ab hum data ko split karenge
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train:", X_train)
print("X_test:", X_test)
print("y_train:", y_train)
print("y_test:", y_test)

X_train: [[ 9 10]
 [ 5  6]
 [ 1  2]
 [ 7  8]]
X_test: [[3 4]]
y_train: [1 0 0 1]
y_test: [1]


In [37]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['has_covid']),df['has_covid'],
                                                test_size=0.2)

In [38]:
X_train

Unnamed: 0,age,gender,fever,cough,city
61,81,Female,98.0,Strong,Mumbai
39,50,Female,103.0,Mild,Kolkata
37,55,Male,100.0,Mild,Kolkata
84,69,Female,98.0,Strong,Mumbai
9,64,Female,101.0,Mild,Delhi
...,...,...,...,...,...
10,75,Female,,Mild,Delhi
57,49,Female,99.0,Strong,Bangalore
80,14,Female,99.0,Mild,Mumbai
0,60,Male,103.0,Mild,Kolkata


## 1. Aam Zindagi

The line of code [`si = SimpleImputer()`](command:_github.copilot.openSymbolFromReferences?%5B%22si%20%3D%20SimpleImputer()%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition") is creating an instance of the [`SimpleImputer`](command:_github.copilot.openSymbolFromReferences?%5B%22SimpleImputer%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22external%22%3A%22file%3A%2F%2F%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22scheme%22%3A%22file%22%7D%2C%22pos%22%3A%7B%22line%22%3A37%2C%22character%22%3A6%7D%7D%2C%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A5%7D%7D%5D%5D "Go to definition") class. The [`SimpleImputer`](command:_github.copilot.openSymbolFromReferences?%5B%22SimpleImputer%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22external%22%3A%22file%3A%2F%2F%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22scheme%22%3A%22file%22%7D%2C%22pos%22%3A%7B%22line%22%3A37%2C%22character%22%3A6%7D%7D%2C%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A5%7D%7D%5D%5D "Go to definition") class is designed to handle missing data in a dataset by replacing missing values with a specified strategy. This is a common preprocessing step in machine learning workflows to ensure that the dataset is complete and can be used effectively by machine learning algorithms.

By default, the [`SimpleImputer`](command:_github.copilot.openSymbolFromReferences?%5B%22SimpleImputer%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22external%22%3A%22file%3A%2F%2F%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22scheme%22%3A%22file%22%7D%2C%22pos%22%3A%7B%22line%22%3A37%2C%22character%22%3A6%7D%7D%2C%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A5%7D%7D%5D%5D "Go to definition") class replaces missing values with the mean of the column in which the missing value is located. However, it can be configured to use other strategies such as replacing missing values with the median, the most frequent value, or a constant value specified by the user.

The [`SimpleImputer`](command:_github.copilot.openSymbolFromReferences?%5B%22SimpleImputer%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22external%22%3A%22file%3A%2F%2F%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22scheme%22%3A%22file%22%7D%2C%22pos%22%3A%7B%22line%22%3A37%2C%22character%22%3A6%7D%7D%2C%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A5%7D%7D%5D%5D "Go to definition") class inherits from `_BaseImputer` and includes several attributes and methods. Key attributes include `feature_names_in_`, which stores the names of the features seen during the fit, `n_features_in_`, which indicates the number of features seen during the fit, and `statistics_`, which holds the statistics used to impute the missing values. The class also has methods such as `fit`, which learns the imputation strategy from the data, `transform`, which applies the learned imputation strategy to the data, and `inverse_transform`, which can reverse the imputation process if needed.

In the provided code, [`si`](command:_github.copilot.openSymbolFromReferences?%5B%22si%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition") is now an instance of [`SimpleImputer`](command:_github.copilot.openSymbolFromReferences?%5B%22SimpleImputer%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22external%22%3A%22file%3A%2F%2F%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2F.vscode%2Fextensions%2Fms-python.vscode-pylance-2024.8.2%2Fdist%2Fbundled%2Fstubs%2Fsklearn%2Fimpute%2F_base.pyi%22%2C%22scheme%22%3A%22file%22%7D%2C%22pos%22%3A%7B%22line%22%3A37%2C%22character%22%3A6%7D%7D%2C%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A5%7D%7D%5D%5D "Go to definition") with default settings. This instance can be used to fit and transform a dataset to handle missing values according to the specified or default strategy. For example, you can call [`si.fit(X_train)`](command:_github.copilot.openSymbolFromReferences?%5B%22si.fit(X_train)%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition") to learn the imputation strategy from the training data and then [`si.transform(X_train)`](command:_github.copilot.openSymbolFromReferences?%5B%22si.transform(X_train)%22%2C%5B%7B%22uri%22%3A%7B%22%24mid%22%3A1%2C%22fsPath%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22external%22%3A%22vscode-notebook-cell%3A%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%23X11sZmlsZQ%253D%253D%22%2C%22path%22%3A%22%2FUsers%2Fp0a0221%2FDownloads%2Fcampusx-notes%2F100-days-of-machine-learning%2Fday28-column-transformer%2Fday28.ipynb%22%2C%22scheme%22%3A%22vscode-notebook-cell%22%2C%22fragment%22%3A%22X11sZmlsZQ%3D%3D%22%7D%2C%22pos%22%3A%7B%22line%22%3A1%2C%22character%22%3A0%7D%7D%5D%5D "Go to definition") to apply the imputation to the training data.

In [39]:
# adding simple imputer to fever col
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[['fever']])

# also the test data
X_test_fever = si.fit_transform(X_test[['fever']])
                                 
X_train_fever.shape

(80, 1)

In [40]:
# Ordinalencoding -> cough
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# also the test data
X_test_cough = oe.fit_transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [53]:
# OneHotEncoding -> gender,city
ohe = OneHotEncoder(drop='first')
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# also the test data
X_test_gender_city = ohe.fit_transform(X_test[['gender','city']])

X_train_gender_city.shape



(80, 4)

In [56]:
X_train_gender_city_dense = X_train_gender_city.toarray()
X_test_gender_city_dense = X_test_gender_city.toarray()



Yah code X_train data se gender, fever, cough, aur city columns ko hata raha hai. Fir jo baaki data bacha, usko .values ke through ek Numpy array mein convert kar raha hai.

Matlab:

drop(columns=['gender','fever','cough','city']) – In columns ko data se remove kar diya.
.values – Baaki ka data sirf values ke form mein ek Numpy array mein store ho gaya.
Yah step useful hota hai jab hum model ko train karte waqt sirf kuch specific features use karna chahte hain aur baaki columns ko ignore karna chahte hain.

In [42]:
# Extracting Age
X_train_age = X_train.drop(columns=['gender','fever','cough','city']).values

# also the test data
X_test_age = X_test.drop(columns=['gender','fever','cough','city']).values

X_train_age.shape

(80, 1)

In [43]:
X_train_cough.shape

(80, 1)

In [59]:
X_train_fever = X_train_fever.reshape(-1, 1)
X_test_fever = X_test_fever.reshape(-1, 1)

X_train_transformed = np.concatenate((X_train_age, X_train_fever, X_train_gender_city_dense, X_train_cough), axis=1)
X_test_transformed = np.concatenate((X_test_age, X_test_fever, X_test_gender_city_dense, X_test_cough), axis=1)

X_train_transformed.shape

(80, 7)

## Mentos Zindagi

In [60]:
from sklearn.compose import ColumnTransformer

In [62]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(drop='first'),['gender','city'])
],remainder='passthrough')

In [63]:
transformer.fit_transform(X_train).shape

(80, 7)

In [64]:
transformer.transform(X_test).shape

(20, 7)