#### Tasks Day 24

##### Sample data

In [2]:
import pandas as pd

# Small sample table with gaps in every column: None marks a missing value.
data = dict(
    name=['John', None, 'Alice', 'Bob', 'Charlie'],
    age=[25, None, 30, 22, None],
    salary=[50000, 60000, None, None, 70000],
)

# Build the frame column-by-column from the dict and show it as plain text.
df = pd.DataFrame.from_dict(data)
print(df)


      name   age   salary
0     John  25.0  50000.0
1     None   NaN  60000.0
2    Alice  30.0      NaN
3      Bob  22.0      NaN
4  Charlie   NaN  70000.0


##### 1. df.isnull()

In [3]:
# Element-wise mask with the same rows and columns as df:
# True where a value is missing (None/NaN), False where it is present.
df.isnull()


Unnamed: 0,name,age,salary
0,False,False,False
1,True,True,False
2,False,False,True
3,False,False,True
4,False,True,False


##### 2. df.isnull().sum()

In [4]:
# Number of missing values per column: summing the boolean mask column-wise
# counts every True as 1.

df.isnull().sum()


name      1
age       2
salary    2
dtype: int64

##### 3. df.isnull().sum().sum()

In [5]:
# Total number of missing values in the whole dataframe: the first sum()
# gives the per-column counts, the second sum() adds those counts together.
df.isnull().sum().sum()


np.int64(5)

##### 4. df.dropna()

In [6]:
# Returns a new dataframe without the rows that contain at least one missing
# value; df itself is left unchanged. Only row 0 has every field filled in,
# so it is the only row kept.

df.dropna()


Unnamed: 0,name,age,salary
0,John,25.0,50000.0


##### 5. df.dropna(how='all')

In [7]:
# how='all' drops a row only when every value in it is missing.
# No row here is completely empty, so the result has the same rows as df.

df.dropna(how='all')

Unnamed: 0,name,age,salary
0,John,25.0,50000.0
1,,,60000.0
2,Alice,30.0,
3,Bob,22.0,
4,Charlie,,70000.0


##### 6. df.dropna(axis=1)

In [8]:
# axis=1 makes dropna() work on columns instead of rows: a column is dropped
# if it contains any missing value. Every column here (name, age, salary) has
# at least one, so the result keeps the index but no columns.

df.dropna(axis=1)


0
1
2
3
4


##### 7. df.fillna(0)

In [9]:
# Returns a new dataframe with every missing value replaced by 0, in every
# column. Note that this also puts 0 into the text column 'name' (row 1).

df.fillna(0)


Unnamed: 0,name,age,salary
0,John,25.0,50000.0
1,0,0.0,60000.0
2,Alice,30.0,0.0
3,Bob,22.0,0.0
4,Charlie,0.0,70000.0


##### 8. df['column'] = df['column'].fillna(df['column'].mean())

In [10]:
# Mean imputation for 'age': compute the mean of the known ages (mean() skips
# NaN), round it to 3 decimals, and use it for every missing age.
age_mean = round(df['age'].mean(), 3)
df['age'] = df['age'].fillna(age_mean)
print(df)



      name     age   salary
0     John  25.000  50000.0
1     None  25.667  60000.0
2    Alice  30.000      NaN
3      Bob  22.000      NaN
4  Charlie  25.667  70000.0


##### 9. df['column'] = df['column'].fillna(df['column'].mode()[0])

In [11]:
# Mode imputation for 'name': fill missing names with the most frequent one.
# Every known name occurs exactly once here, so mode() returns all of them as
# a tie, in sorted order; [0] therefore picks "Alice", which replaces the
# missing name in row 1.

most_common_name = df['name'].mode()[0]
df['name'] = df['name'].fillna(most_common_name)
print(df)


      name     age   salary
0     John  25.000  50000.0
1    Alice  25.667  60000.0
2    Alice  30.000      NaN
3      Bob  22.000      NaN
4  Charlie  25.667  70000.0


##### 10. df.interpolate()

In [12]:
# interpolate() estimates each missing value from its neighbouring values
# instead of using one global statistic such as the mean or median.
# The default method='linear' places the gaps on a straight line between the
# nearest known values above and below, e.g. salary 60000, NaN, NaN, 70000
# becomes 60000, 63333.33, 66666.67, 70000.

# Interpolation is only defined for numeric data, so restrict it to the
# numeric columns. Calling it on the whole frame would include the text
# column 'name', which pandas flags with a FutureWarning and will reject in a
# future version. infer_objects() does not help here: it only re-infers the
# dtype of object columns and cannot turn a column of real strings into
# numbers.
numeric_cols = df.select_dtypes(include='number').columns

# Assign the result back instead of using inplace=True so that the update is
# explicit and only the numeric columns are touched.
df[numeric_cols] = df[numeric_cols].interpolate()
print(df)


      name     age        salary
0     John  25.000  50000.000000
1    Alice  25.667  60000.000000
2    Alice  30.000  63333.333333
3      Bob  22.000  66666.666667
4  Charlie  25.667  70000.000000


  df.interpolate(inplace=True)


##### 11. from sklearn.impute import SimpleImputer 
##### 12. imputer = SimpleImputer(strategy='median') 

##### 13. imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns) 

In [13]:
from sklearn.impute import SimpleImputer

# Median imputation with scikit-learn, done in two explicit steps:
#   fit()       - learns the median of the 'age' column
#   transform() - replaces every NaN in that column with the learned median
# 'age' has no missing values left at this point, so the printed frame is
# the same as before this cell.
imputer = SimpleImputer(strategy='median')
age_frame = df[['age']]  # double brackets keep it 2-D, as the imputer expects
imputer.fit(age_frame)
df[['age']] = imputer.transform(age_frame)
print(df)


      name     age        salary
0     John  25.000  50000.000000
1    Alice  25.667  60000.000000
2    Alice  30.000  63333.333333
3      Bob  22.000  66666.666667
4  Charlie  25.667  70000.000000
