In [1]:
import pandas as pd
import numpy as np

## Identifying and Handling Missing Data

In [2]:
# Toy frame: A and B each contain two gaps (np.nan and None both end up as
# NaN), while C is fully populated.
data = dict(
    A=[1, 2, np.nan, 4, None],
    B=[None, 6, 7, 8, np.nan],
    C=[10, 20, 30, 40, 50],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,1.0,,10
1,2.0,6.0,20
2,,7.0,30
3,4.0,8.0,40
4,,,50


In [3]:
# Element-wise mask: True wherever df holds a missing value.
missing_data = df.isna()
missing_data

Unnamed: 0,A,B,C
0,False,True,False
1,False,False,False
2,True,False,False
3,False,False,False
4,True,True,False


In [4]:
# isnull() yields the same mask as isna(); the output matches the previous cell.
missing_data = df.isnull()
missing_data

Unnamed: 0,A,B,C
0,False,True,False
1,False,False,False
2,True,False,False
3,False,False,False
4,True,True,False


In [5]:
# Inverse mask: True wherever a value is present.
not_missing_data = df.notnull()
not_missing_data

Unnamed: 0,A,B,C
0,True,False,True
1,True,True,True
2,False,True,True
3,True,True,True
4,False,False,True


In [6]:
# Drop every row that has at least one missing value. The result is a new
# frame, so df itself is left as it was.
df_dropped = df.dropna()
df_dropped

Unnamed: 0,A,B,C
1,2.0,6.0,20
3,4.0,8.0,40


In [7]:
# Re-display df to confirm that dropna() did not modify it.
df

Unnamed: 0,A,B,C
0,1.0,,10
1,2.0,6.0,20
2,,7.0,30
3,4.0,8.0,40
4,,,50


In [8]:
# Same row-wise drop, but applied to df itself: after this cell the rows with
# missing values are gone from df (only index 1 and 3 remain).
df.dropna(inplace=True)
df

Unnamed: 0,A,B,C
1,2.0,6.0,20
3,4.0,8.0,40


In [11]:
# Rebuild the example frame, this time with a gap in C as well (row 3), so the
# effect of restricting dropna to a subset of columns is visible.
data = dict(
    A=[1, 2, np.nan, 4, None],
    B=[None, 6, 7, 8, np.nan],
    C=[10, 20, 30, np.nan, 50],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,1.0,,10.0
1,2.0,6.0,20.0
2,,7.0,30.0
3,4.0,8.0,
4,,,50.0


In [12]:
# Only columns A and B are considered when deciding which rows to drop, so the
# gap in C (row 3) does not cause that row to be removed.
df_dropped_subset = df.dropna(subset=['A', 'B'])
df_dropped_subset

Unnamed: 0,A,B,C
1,2.0,6.0,20.0
3,4.0,8.0,


In [13]:
# Two numeric columns with gaps, used for the fill-value examples.
data = dict(
    A=[1, 2, None, 4, 5],
    B=[None, 10, 20, None, 50],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1.0,
1,2.0,10.0
2,,20.0
3,4.0,
4,5.0,50.0


In [14]:
# Replace every missing value in the frame with the constant 0.
constant_filled_df = df.fillna(0)
constant_filled_df

Unnamed: 0,A,B
0,1.0,0.0
1,2.0,10.0
2,0.0,20.0
3,4.0,0.0
4,5.0,50.0


In [16]:
# Fill the gaps in column A with that column's mean, working on a copy so df
# keeps its missing values; column B is left untouched.
mean_of_A = df['A'].mean()
mean_filled_df_A = df.copy()
mean_filled_df_A['A'] = df['A'].fillna(mean_of_A)
mean_filled_df_A

Unnamed: 0,A,B
0,1.0,
1,2.0,10.0
2,3.0,20.0
3,4.0,
4,5.0,50.0


In [17]:
# Fill the gaps in column B with that column's median, working on a copy so df
# keeps its missing values; column A is left untouched.
median_of_B = df['B'].median()
median_filled_df_B = df.copy()
median_filled_df_B['B'] = df['B'].fillna(median_of_B)
median_filled_df_B

Unnamed: 0,A,B
0,1.0,20.0
1,2.0,10.0
2,,20.0
3,4.0,20.0
4,5.0,50.0


In [18]:
# Single categorical column with two missing entries.
data = dict(
    Country=['USA', 'Canada', None, 'Germany', None, 'India'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Country
0,USA
1,Canada
2,
3,Germany
4,
5,India


In [19]:
# Fill gaps in a single column with a placeholder label. Selecting
# df['Country'] first means the result is a Series, not a DataFrame.
specific_filled_df = df['Country'].fillna('Unknown')
specific_filled_df

0        USA
1     Canada
2    Unknown
3    Germany
4    Unknown
5      India
Name: Country, dtype: object

In [20]:
# Confirm that the column-level fillna produced a Series.
type(specific_filled_df)

pandas.core.series.Series

In [21]:
# Calling fillna on the frame itself returns a DataFrame with the same layout.
specific_filled_df = df.fillna('Unknown')
specific_filled_df

Unnamed: 0,Country
0,USA
1,Canada
2,Unknown
3,Germany
4,Unknown
5,India


In [22]:
# mode() on the frame: each country appears exactly once, so all four are
# returned as modes (the missing entries are not counted).
mode_country = df.mode()
mode_country

Unnamed: 0,Country
0,Canada
1,Germany
2,India
3,USA


In [28]:
# A has one clear most-frequent value (1); every value in B is unique.
data = dict(
    A=[1, 2, 1, 3, 1, 4],
    B=[1, 2, 3, 4, 5, 6],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1,1
1,2,2
2,1,3
3,3,4
4,1,5
5,4,6


In [29]:
# A has a single mode (1); every value in B is unique, so all six are modes
# and column A is padded with NaN to match that length.
df.mode()

Unnamed: 0,A,B
0,1.0,1
1,,2
2,,3
3,,4
4,,5
5,,6


In [30]:
# Same as before, except B now repeats the value 1 so it has a single mode too.
data = dict(
    A=[1, 2, 1, 3, 1, 4],
    B=[1, 2, 3, 4, 5, 1],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1,1
1,2,2
2,1,3
3,3,4
4,1,5
5,4,1


In [31]:
# With 1 repeated in B as well, each column now has exactly one mode.
df.mode()

Unnamed: 0,A,B
0,1,1


In [32]:
# Recreate the country column (two missing entries) for the mode-fill example.
data = dict(
    Country=['USA', 'Canada', None, 'Germany', None, 'India'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Country
0,USA
1,Canada
2,
3,Germany
4,
5,India


In [34]:
# Mode of the single column, returned as a Series; all four countries tie
# because each appears once.
mode_country = df['Country'].mode()
mode_country

0     Canada
1    Germany
2      India
3        USA
Name: Country, dtype: object

In [35]:
# Every country occurs once, so mode() returned all of them; take the first
# entry of that result as the fill value for the missing rows.
most_common_country = mode_country.iloc[0]
mode_filled_df = df.fillna(most_common_country)
mode_filled_df

Unnamed: 0,Country
0,USA
1,Canada
2,Canada
3,Germany
4,Canada
5,India


In [36]:
# After filling, 'Canada' occurs three times and is the only mode.
mode_filled_df.mode()

Unnamed: 0,Country
0,Canada


In [37]:
# Two columns with alternating gaps, used to contrast forward and backward fill.
data = dict(
    A=[1, 2, None, 4, None, 6],
    B=[3, None, 7, None, 11, 13],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,
2,,7.0
3,4.0,
4,,11.0
5,6.0,13.0


In [38]:
# Forward fill: each gap takes the last valid value above it in the same column.
forward_filled_df = df.ffill()
forward_filled_df

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,3.0
2,2.0,7.0
3,4.0,7.0
4,4.0,11.0
5,6.0,13.0


In [39]:
# Backward fill: each gap takes the next valid value below it in the same column.
backward_filled_df = df.bfill()
backward_filled_df

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,7.0
2,4.0,7.0
3,4.0,11.0
4,6.0,11.0
5,6.0,13.0


In [40]:
# Frame with interior gaps in both columns plus a trailing gap in A, used to
# show how interpolation treats each case.
# `data` is a plain dict, as in the other cells; previously it was a DataFrame
# that was then wrapped in pd.DataFrame() a second time.
data = {
    'A': [1, 2, np.nan, 4, np.nan],
    'B': [5, np.nan, 7, 8, 9]
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,
2,,7.0
3,4.0,8.0
4,,9.0


In [41]:
# Linear interpolation, column by column: each interior gap becomes the
# midpoint of its neighbours; the trailing gap in A repeats the last valid value.
data_linear_interpolated = df.interpolate(method='linear')
data_linear_interpolated

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,6.0
2,3.0,7.0
3,4.0,8.0
4,4.0,9.0


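For the missing value in column B at index 1, pandas interpolates linearly between the
neighbouring values in the same column, 5.0 (index 0) and 7.0 (index 2), giving 6.0.
Each column is interpolated independently, and `method='linear'` treats the rows as
equally spaced.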

For the missing value in column A at index 2, pandas interpolates linearly between the
neighbouring values in column A, 2.0 (index 1) and 4.0 (index 3), giving 3.0.

The other missing value in column A at index 4 is filled with 4.0, the last valid value
in that column, because there is no later data point to interpolate towards.

### Time Based Interpolation

In [42]:
# Daily series over five consecutive days, with gaps on the second and fourth day.
time_index = pd.date_range('2023-01-01', periods=5, freq='D')
observations = [10, np.nan, 30, np.nan, 50]
time_series_data = pd.Series(observations, index=time_index)
time_series_data

2023-01-01    10.0
2023-01-02     NaN
2023-01-03    30.0
2023-01-04     NaN
2023-01-05    50.0
Freq: D, dtype: float64

In [43]:
# Interpolate using the spacing of the datetime index; with evenly spaced daily
# timestamps the two gaps are filled with 20.0 and 40.0.
time_series_interpolated = time_series_data.interpolate(method='time')
time_series_interpolated

2023-01-01    10.0
2023-01-02    20.0
2023-01-03    30.0
2023-01-04    40.0
2023-01-05    50.0
Freq: D, dtype: float64

## Handling Duplicates

In [44]:
# Records where rows 4 and 6 repeat rows 0 and 1 exactly.
data = dict(
    ID=[1, 2, 3, 4, 1, 5, 2],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Alice', 'Eve', 'Bob'],
    Age=[25, 30, 22, 28, 25, 29, 30],
)

df = pd.DataFrame(data)
df

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,22
3,4,David,28
4,1,Alice,25
5,5,Eve,29
6,2,Bob,30


In [45]:
# Flag rows that repeat an earlier row across all columns; the first
# occurrence of each row stays False.
duplicates = df.duplicated()
duplicates

0    False
1    False
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [47]:
# Same check, but comparing only the ID and Name columns.
duplicates_subset = df.duplicated(subset=['ID', 'Name'])
duplicates_subset

0    False
1    False
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [48]:
# Keep the first occurrence of each row and drop the repeats (rows 4 and 6).
df_no_duplicates = df.drop_duplicates()
df_no_duplicates

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,22
3,4,David,28
5,5,Eve,29
