# Pandas3

# Handling Missing Data (NaN) in Pandas

In [1]:
'''
Missing data in pandas is typically represented as:

NaN (Not a Number) for numeric data

None or NaT for other types

🔹 Why is it Important?
Missing data can:

Break calculations (mean, sum, etc.)

Mislead machine learning models

Mess up plots and summaries
'''

'\nMissing data in pandas is typically represented as:\n\nNaN (Not a Number) for numeric data\n\nNone or NaT for other types\n\n🔹 Why is it Important?\nMissing data can:\n\nBreak calculations (mean, sum, etc.)\n\nMislead machine learning models\n\nMess up plots and summaries\n'

In [2]:
import pandas as pd
import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', None],
    'Age': [24, np.nan, 22, 29],
    'City': ['New York', 'Paris', np.nan, 'Berlin'],
    'Score': [85, 90, np.nan, 88]
}

df = pd.DataFrame(data)
print(df)

      Name   Age      City  Score
0    Alice  24.0  New York   85.0
1      Bob   NaN     Paris   90.0
2  Charlie  22.0       NaN    NaN
3     None  29.0    Berlin   88.0


In [6]:
#Detect Missing D
print(df.isna())  #df.isna() or df.isnull()   --> Returns True wherever the value is missing.
 
print("\n", df.isna().sum()) #Count missing values per column

    Name    Age   City  Score
0  False  False  False  False
1  False   True  False  False
2  False  False   True   True
3   True  False  False  False

 Name     1
Age      1
City     1
Score    1
dtype: int64


In [8]:
import pandas as pd
import numpy as np

# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', None, None],
    'Age': [24, np.nan, 22, 29, None],
    'City': ['New York', 'Paris', np.nan, 'Berlin', None],
    'Score': [85, 90, np.nan, 88, None]
}

df = pd.DataFrame(data)

print("🔸 Original DataFrame:\n")
print(df)

# A. Drop rows with any NaN
print("\n🔹 Drop rows with any NaN (df.dropna()): Keeps only fully complete rows\n")
print(df.dropna())

# B. Drop rows where all values are NaN
print("\n🔹 Drop rows where all values are NaN (df.dropna(how='all')): Removes only fully empty rows\n")
print(df.dropna(how='all'))

# C. Drop columns with any NaN
print("\n🔹 Drop columns with any NaN (df.dropna(axis=1)):  Removes columns that have at least one NaN.\n")
print(df.dropna(axis=1))

# D. Keep rows with at least 3 non-NaN values
print("\n🔹 Keep rows with at least 3 non-NaN values (df.dropna(thresh=3)): Useful when you want to preserve “mostly complete” rows.\n")
print(df.dropna(thresh=3))

🔸 Original DataFrame:

      Name   Age      City  Score
0    Alice  24.0  New York   85.0
1      Bob   NaN     Paris   90.0
2  Charlie  22.0       NaN    NaN
3     None  29.0    Berlin   88.0
4     None   NaN      None    NaN

🔹 Drop rows with any NaN (df.dropna()): Keeps only fully complete rows

    Name   Age      City  Score
0  Alice  24.0  New York   85.0

🔹 Drop rows where all values are NaN (df.dropna(how='all')): Removes only fully empty rows

      Name   Age      City  Score
0    Alice  24.0  New York   85.0
1      Bob   NaN     Paris   90.0
2  Charlie  22.0       NaN    NaN
3     None  29.0    Berlin   88.0

🔹 Drop columns with any NaN (df.dropna(axis=1)):  Removes columns that have at least one NaN.

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]

🔹 Keep rows with at least 3 non-NaN values (df.dropna(thresh=3)): Useful when you want to preserve “mostly complete” rows.

    Name   Age      City  Score
0  Alice  24.0  New York   85.0
1    Bob   NaN     Paris   90.0
3   N

In [10]:
# Fill Missing Data
import pandas as pd
import numpy as np

# Sample DataFrame with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', None],
    'Age': [24, np.nan, 22, 29],
    'City': ['New York', 'Paris', np.nan, 'Berlin'],
    'Score': [85, 90, np.nan, 88]
}

df = pd.DataFrame(data)
print("🔸 Original DataFrame:\n")
print(df)

# A. Fill all NaNs with a constant value (e.g., 0)
print("\n🅐 Fill all NaNs with 0: Replaces all NaNs with 0 \n")
print(df.fillna(0))

# B. Fill missing Age with 25 (specific column)
df_b = df.copy()
df_b['Age'].fillna(25, inplace=True)
print("\n🅑 Fill missing Age with 25: Fills missing Age with 25\n")
print(df_b)

# C. Fill Score with mean of Score column
df_c = df.copy()
df_c['Score'] = df_c['Score'].fillna(df_c['Score'].mean())
print("\n🅒 Fill Score with mean: Fills Score with the average of existing values.\n")
print(df_c)

# D. Forward fill (propagate previous row’s value)
print("\n🅓 Forward fill (method='ffill'): Fills NaN with value from above row\n")
print(df.fillna(method='ffill'))

# E. Backward fill (use next row’s value)
print("\n🅔 Backward fill (method='bfill'): Fills NaN with value from below row\n")
print(df.fillna(method='bfill'))

# F. Fill different values for each column
print("\n🅕 Fill with different values per column: Custom fill for each column.explain this more clearlly in one cell\n")
print(df.fillna({'Name': 'Unknown', 'Age': 0}))

🔸 Original DataFrame:

      Name   Age      City  Score
0    Alice  24.0  New York   85.0
1      Bob   NaN     Paris   90.0
2  Charlie  22.0       NaN    NaN
3     None  29.0    Berlin   88.0

🅐 Fill all NaNs with 0: Replaces all NaNs with 0 

      Name   Age      City  Score
0    Alice  24.0  New York   85.0
1      Bob   0.0     Paris   90.0
2  Charlie  22.0         0    0.0
3        0  29.0    Berlin   88.0

🅑 Fill missing Age with 25: Fills missing Age with 25

      Name   Age      City  Score
0    Alice  24.0  New York   85.0
1      Bob  25.0     Paris   90.0
2  Charlie  22.0       NaN    NaN
3     None  29.0    Berlin   88.0

🅒 Fill Score with mean: Fills Score with the average of existing values.

      Name   Age      City      Score
0    Alice  24.0  New York  85.000000
1      Bob   NaN     Paris  90.000000
2  Charlie  22.0       NaN  87.666667
3     None  29.0    Berlin  88.000000

🅓 Forward fill (method='ffill'): Fills NaN with value from above row

      Name   Age      C

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_b['Age'].fillna(25, inplace=True)
  print(df.fillna(method='ffill'))
  print(df.fillna(method='bfill'))


In [12]:
#Replace Specific Values with NaN
import pandas as pd
import numpy as np

# Sample DataFrame with placeholder values
data = {
    'Name': ['Alice', 'N/A', 'Charlie', 'NA'],
    'Age': [24, -999, 0, 29],
    'Score': [85, 90, -999, 0]
}

df = pd.DataFrame(data)
print("🔸 Original DataFrame with placeholders:\n")
print(df)

# A. Replace a single specific value (-999) with NaN
df_a = df.copy()
df_a.replace(-999, np.nan, inplace=True)
print("\n🅐 Replace -999 with NaN: Useful for filling numeric data in time series.\n")
print(df_a)

# B. Replace multiple placeholder values with NaN
df_b = df.copy()
df_b.replace(['N/A', 'NA', 0], np.nan, inplace=True)
print("\n🅑 Replace ['N/A', 'NA', 0] with NaN: Removes only rows where Name is missing.\n")
print(df_b)


🔸 Original DataFrame with placeholders:

      Name  Age  Score
0    Alice   24     85
1      N/A -999     90
2  Charlie    0   -999
3       NA   29      0

🅐 Replace -999 with NaN: Useful for filling numeric data in time series.

      Name   Age  Score
0    Alice  24.0   85.0
1      N/A   NaN   90.0
2  Charlie   0.0    NaN
3       NA  29.0    0.0

🅑 Replace ['N/A', 'NA', 0] with NaN: Removes only rows where Name is missing.

      Name    Age  Score
0    Alice   24.0   85.0
1      NaN -999.0   90.0
2  Charlie    NaN -999.0
3      NaN   29.0    NaN


In [14]:
#Interpolation + Drop by Column 
import pandas as pd
import numpy as np

# Sample DataFrame with missing values
data = {
    'Name': ['Alice', 'Bob', None, 'David'],
    'Age': [24, 26, 28, 30],
    'Score': [85, np.nan, np.nan, 90]
}

df = pd.DataFrame(data)
print("🔸 Original DataFrame:\n")
print(df)

# A. Interpolate missing numeric values in 'Score' column
print("\n🅐 Interpolated 'Score' using linear method: Useful for filling numeric data in time series.\n")
print(df['Score'].interpolate(method='linear'))

# (Optional) Store the interpolated version in a copy of df
df_interp = df.copy()
df_interp['Score'] = df_interp['Score'].interpolate(method='linear')

# B. Drop rows where 'Name' is missing
print("\n🅑 Drop rows where 'Name' is NaN: Useful for filling numeric data in time series.\n")
print(df.dropna(subset=['Name']))

🔸 Original DataFrame:

    Name  Age  Score
0  Alice   24   85.0
1    Bob   26    NaN
2   None   28    NaN
3  David   30   90.0

🅐 Interpolated 'Score' using linear method: Useful for filling numeric data in time series.

0    85.000000
1    86.666667
2    88.333333
3    90.000000
Name: Score, dtype: float64

🅑 Drop rows where 'Name' is NaN: Useful for filling numeric data in time series.

    Name  Age  Score
0  Alice   24   85.0
1    Bob   26    NaN
3  David   30   90.0


# Data Type Conversion (astype() in Pandas)

In [15]:
'''
In Pandas, each column has a data type:

int64 → integers

float64 → decimals

object → text (strings)

bool → Boolean (True/False)

datetime64 → date/time
'''

'\nIn Pandas, each column has a data type:\n\nint64 → integers\n\nfloat64 → decimals\n\nobject → text (strings)\n\nbool → Boolean (True/False)\n\ndatetime64 → date/time\n'

In [18]:
import pandas as pd
import numpy as np

# Sample DataFrame with mixed types and NaNs
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [24.0, np.nan, 22.0, 29.0],
    'Score': [85, 90, 78, 88],
    'Graduated': [True, False, True, False],
    'Join_Date': ['2023-01-15', '2022-12-10', '2023-05-01', '2021-07-20'],
    'City': ['New York', 'Paris', 'New York', 'Berlin']
}

df = pd.DataFrame(data)

print("🔸 1. Check current data types: Shows data type of each column.\n")
print(df.dtypes)

# Fill NaN in Age to convert safely to int
df['Age'] = df['Age'].fillna(0)

print("\n🔸 2. Convert data types Use .astype() — must handle NaNs first (here filled Age NaNs with 0 before int conversion). Can convert to int, float, string, bool, or multiple columns.\n")
df['Age'] = df['Age'].astype(int)                   # float64 → int
df['Score'] = df['Score'].astype(float)             # int → float
df['Age'] = df['Age'].astype(str)                    # int → string
df['Graduated'] = df['Graduated'].astype(bool)       # bool remains bool
df = df.astype({'Score': 'float'})                    # multiple columns conversion (here only Score)

print(df.dtypes)

print("\n🔸 3. Convert to Date/Time using pd.to_datetime(): \n")
df['Join_Date'] = pd.to_datetime(df['Join_Date'])    # Convert string to datetime
print(df['Join_Date'])
# Also converting Year format example (create a Year column)
df['Year'] = pd.to_datetime(df['Join_Date']).dt.year
print("\nYear column (int):")
print(df['Year'])

print("\n🔸 4. Convert to Categorical Type:\n")
df['City'] = df['City'].astype('category')
print(df.dtypes)
print("\nMemory usage with deep=True:")
print(df.info(memory_usage='deep'))
'''
Explanation for each part:
Check data types: .dtypes shows type per column (object, float64, bool etc.).

Convert types: Use .astype() — must handle NaNs first (here filled Age NaNs with 0 before int conversion). Can convert to int, float, string, bool, or multiple columns.

Date conversion: Use pd.to_datetime() (not .astype()) to convert date strings into datetime objects; extract year easily.

Categorical: .astype('category') optimizes columns with few unique values (like cities) — reduces memory and speeds up some operations.
'''

🔸 1. Check current data types: Shows data type of each column.

Name          object
Age          float64
Score          int64
Graduated       bool
Join_Date     object
City          object
dtype: object

🔸 2. Convert data types Use .astype() — must handle NaNs first (here filled Age NaNs with 0 before int conversion). Can convert to int, float, string, bool, or multiple columns.

Name          object
Age           object
Score        float64
Graduated       bool
Join_Date     object
City          object
dtype: object

🔸 3. Convert to Date/Time using pd.to_datetime(): 

0   2023-01-15
1   2022-12-10
2   2023-05-01
3   2021-07-20
Name: Join_Date, dtype: datetime64[ns]

Year column (int):
0    2023
1    2022
2    2023
3    2021
Name: Year, dtype: int32

🔸 4. Convert to Categorical Type:

Name                 object
Age                  object
Score               float64
Graduated              bool
Join_Date    datetime64[ns]
City               category
Year                  int32
dtype: 

"\nExplanation for each part:\nCheck data types: .dtypes shows type per column (object, float64, bool etc.).\n\nConvert types: Use .astype() — must handle NaNs first (here filled Age NaNs with 0 before int conversion). Can convert to int, float, string, bool, or multiple columns.\n\nDate conversion: Use pd.to_datetime() (not .astype()) to convert date strings into datetime objects; extract year easily.\n\nCategorical: .astype('category') optimizes columns with few unique values (like cities) — reduces memory and speeds up some operations.\n"