In [2]:
import pandas as pd
import numpy as np

## Removing NA values

In [3]:

# Sample frame: columns A and B contain NaN, column C is complete.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 4],
        'B': [5, np.nan, np.nan, 8],
        'C': [9, 10, 11, 12],
    }
)

# dropna() returns a new frame holding only the rows without any missing
# value (rows 0 and 3 here); bind it back to df.
df = df.dropna()

df


Unnamed: 0,A,B,C
0,1.0,5.0,9
3,4.0,8.0,12


## Filling missing values with mean

In [5]:
df = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, np.nan, 8], 'C': [9, 10, 11, 12]})

# Series.mean() skips NaN by default, so this is the mean of 1, 2 and 4.
mean_value = df['A'].mean()

# Assign the filled column back to the frame. Calling
# fillna(..., inplace=True) on df['A'] is chained assignment: it raises a
# FutureWarning in pandas 2.2 and leaves df unchanged under Copy-on-Write.
# Only column A is filled; the NaNs in column B are intentionally left alone.
df['A'] = df['A'].fillna(mean_value)

df

Unnamed: 0,A,B,C
0,1.0,5.0,9
1,2.0,,10
2,2.333333,,11
3,4.0,8.0,12


## Replacing values in a column

In [6]:
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8], 'C': [9, 10, 11, 12]})

# Map 5 -> 50 and 6 -> 60 in column B; every other value is kept as is.
# The result is assigned back to the column: replace(..., inplace=True) on
# df['B'] is chained assignment, which warns in pandas 2.2 and does not
# update df under Copy-on-Write.
df['B'] = df['B'].replace({5: 50, 6: 60})

df

Unnamed: 0,A,B,C
0,1,50,9
1,2,60,10
2,3,7,11
3,4,8,12


## Filling missing values with the forward-fill method

In [9]:
df = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, np.nan, 8], 'C': [9, 10, 11, 12]})

# Propagate the last valid observation downwards within each column.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated since pandas 2.1.
df = df.ffill()

df

Unnamed: 0,A,B,C
0,1.0,5.0,9
1,2.0,5.0,10
2,2.0,5.0,11
3,4.0,8.0,12


## Removing duplicate rows

In [10]:
df = pd.DataFrame(
    {
        'A': [1, 2, 2, 3, 4],
        'B': [5, 6, 6, 7, 8],
        'C': [9, 10, 11, 12, 12],
    }
)

# drop_duplicates() compares entire rows. Rows 1 and 2 agree on A and B but
# differ in C (10 vs 11), so no row in this sample is a full duplicate and
# all five rows are kept.
df = df.drop_duplicates()

df

Unnamed: 0,A,B,C
0,1,5,9
1,2,6,10
2,2,6,11
3,3,7,12
4,4,8,12


## Removing duplicate columns

In [11]:
df = pd.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
        'C': [1, 2, 3, 4],
    }
)

# Transposing turns columns into rows, so duplicated() marks every column
# whose values repeat an earlier column (C repeats A here).
is_repeated_column = df.T.duplicated()

# Keep only the first occurrence of each distinct column.
df = df.loc[:, ~is_repeated_column]

df

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


## Detecting outliers with z-score

In [20]:
data = np.array([1, 2, 3, 4, 5, 10])

# A point is flagged when it lies more than two standard deviations from the
# mean (std() uses the population formula, ddof=0, by default).
center = data.mean()
spread = data.std()
distance_from_center = np.abs(data - center)

# np.where with a single condition yields a tuple of index arrays.
outliers = np.where(distance_from_center > 2 * spread)

print(data[outliers])

[10]


## Removing outliers using z-score

In [21]:
data = np.array([1, 2, 3, 4, 5, 10])

# Absolute z-score of every point: distance from the mean expressed in
# standard deviations (population std, ddof=0).
absolute_deviation = np.abs(data - data.mean())
z_scores = absolute_deviation / data.std()

# Retain only the points whose z-score is below 2.
within_threshold = z_scores < 2
data = data[within_threshold]

print(data)

[1 2 3 4 5]
