In [24]:
from io import StringIO
import sys
import pandas as pd
import numpy as np

In [10]:
# Sample CSV text used to demonstrate missing values: row 2 has an empty C field
# and row 3 stops after three fields. The leading and trailing empty entries
# reproduce the blank first line and the final newline of the raw text.
scv_data = "\n".join(
    [
        "",
        "A,B,C,D",
        "1,2,3,4",
        "5,6,,8",
        "10,11,12",
        "",
    ]
)

# Parse the text through an in-memory buffer and show the resulting frame.
df = pd.read_csv(StringIO(scv_data))
df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [11]:
# Boolean mask with the same shape as df: True marks a missing value
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [12]:
# Count the missing values in each column
df.isna().sum(axis="index")

A    0
B    0
C    1
D    1
dtype: int64

In [13]:
# Drop every row that contains at least one missing value
df.dropna(axis="index")  # "index" is the same as axis=0 (rows)

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0


In [14]:
# Drop every column that contains at least one missing value
df.dropna(axis="columns")  # "columns" is the same as axis=1

Unnamed: 0,A,B
0,1,2
1,5,6
2,10,11


In [16]:
# Drop the rows that have a missing value in column C:
# keep only the rows where C is present.
df[df["C"].notna()]

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
2,10,11,12.0,


In [17]:
# Preview df without column C; this returns a new frame and leaves df unchanged
df.drop(labels=["C"], axis="columns")

Unnamed: 0,A,B,D
0,1,2,4.0
1,5,6,8.0
2,10,11,


In [18]:
# Remove column C from df for the cells that follow.
# Rebinding `df` to the result has the same effect on `df` as `inplace=True`
# (which modifies the original data directly), and errors='ignore' keeps the
# cell re-runnable: running it again after C is gone no longer raises KeyError.
df = df.drop(columns=['C'], errors='ignore')

In [19]:
# Display df again: column C was removed by the drop in the previous cell
df

Unnamed: 0,A,B,D
0,1,2,4.0
1,5,6,8.0
2,10,11,


## Imputing missing values

In [20]:
import pandas as pd
from io import StringIO

# Sample data with two gaps: the C field of the second data row is empty and
# the last row ends with a trailing comma, leaving D empty.
csv_data = "\n".join(
    [
        "A,B,C,D",
        "1.0,2.0,3.0,4.0",
        "5.0,6.0,,8.0",
        "10.0,11.0,12.0,",
    ]
)

# Python 2.7 only: the string must first be converted to unicode, i.e.
# csv_data = unicode(csv_data)

# Read the text through an in-memory buffer and display the frame.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [22]:
from sklearn.impute import SimpleImputer


## SimpleImputer
`SimpleImputer` is a class in scikit-learn that provides basic strategies for imputing missing values in a dataset. Imputation is the process of replacing missing values with estimated values. These are the common strategies you can use with `SimpleImputer`:
### Mean Imputation:

**Strategy:** Replace missing values with the mean of the non-missing values in the column.

**Usage:**

```python
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

X_imputed = imputer.fit_transform(X)
```

### Median Imputation:

**Strategy:** Replace missing values with the median of the non-missing values in the column.

**Usage:**

```python
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

X_imputed = imputer.fit_transform(X)
```

### Most Frequent Imputation:

**Strategy:** Replace missing values with the most frequent (mode) value in the column.

**Usage:**

```python
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')

X_imputed = imputer.fit_transform(X)
```

### Constant Imputation:

**Strategy:** Replace missing values with a specified constant value.

**Usage:**

```python
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='constant', fill_value=0)

X_imputed = imputer.fit_transform(X)
```

______________________________________
### Choosing a Strategy:

- Use mean or median imputation when dealing with numerical features.
- Use most frequent imputation when dealing with categorical features.
- Constant imputation can be useful when you have domain knowledge that suggests a specific constant should be used.

In [25]:
# Create a mean-strategy imputer that treats NaN as the missing marker and fit it
# on the raw values of df (fit returns the fitted imputer itself).
simple_imputer = SimpleImputer(missing_values=np.nan, strategy="mean").fit(df.values)

# Apply the fitted imputer to the same values and display the completed array.
X_imp = simple_imputer.transform(df.values)
X_imp

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [26]:
# Mean of each column, computed down the rows
df_means = df.mean(axis="index")
df_means

A    5.333333
B    6.333333
C    7.500000
D    6.000000
dtype: float64

In [27]:
# Fill the missing entries of each column with that column's mean (returns a new frame)
df.fillna(value=df_means)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [28]:
# Median of each column, then use it to fill that column's missing entries
df_median = df.median(axis="index")
df.fillna(value=df_median)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [30]:
# Maximum of each column, then use it to fill that column's missing entries
df_max = df.max(axis="index")
df.fillna(value=df_max)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,12.0,8.0
2,10.0,11.0,12.0,8.0


In [31]:
# Minimum of each column, then use it to fill that column's missing entries
df_min = df.min(axis="index")
df.fillna(value=df_min)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,3.0,8.0
2,10.0,11.0,12.0,4.0
