# Goal : 2) Data Type & Format Handling

1) Requirements of type conversion
2) Changing or correcting data types
3) Correcting Inconsistent Formats
4) Unifying units of Measurement using in built functions like map,lambda,where,apply etc.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

# Sample Data
data = {
    'col': ['10', '', '25.5', '100', None],  # For blank handling & type conversion
    'name': ['Alice Smith', 'Bob-Jones', 'Charlie', 'David Brown', ''],  # For splitting
    'date_str': ['01/05/2022', '2022-06-15', '', '15-07-2023', '2021/12/31'],  # For to_datetime
    'gender': ['M', 'F', 'M', 'F', ''],  # For mapping
    'unit': ['cm', 'mm', 'g', 'm', 'g'],  # For unit mapping
    'value': [150, 3000, 500, 1.5, 250],  # For math transformations
    'height_cm': [170, 160, 150, 180, ''],  # For cm to meters
    'price_inr': [8300, 16600, 12450, 0, None],  # For INR to USD
    'flag_str': ['True', 'false', 'TRUE', '', 'False'],  # For bool conversion
    'custom_text': [' hello ', 'WORLD', '  test  ', '', 'Example'],  # For .apply(str.strip().title())
}

df = pd.DataFrame(data)
df

Unnamed: 0,col,name,date_str,gender,unit,value,height_cm,price_inr,flag_str,custom_text
0,10.0,Alice Smith,01/05/2022,M,cm,150.0,170.0,8300.0,True,hello
1,,Bob-Jones,2022-06-15,F,mm,3000.0,160.0,16600.0,False,WORLD
2,25.5,Charlie,,M,g,500.0,150.0,12450.0,True,test
3,100.0,David Brown,15-07-2023,F,m,1.5,180.0,0.0,,
4,,,2021/12/31,,g,250.0,,,False,Example


#### 2-0) requirements of type conversion

In [2]:
# check basic info
df.info()

# it shows most of the columns are of object data type
# we should conisder type conversion

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   col          4 non-null      object 
 1   name         5 non-null      object 
 2   date_str     5 non-null      object 
 3   gender       5 non-null      object 
 4   unit         5 non-null      object 
 5   value        5 non-null      float64
 6   height_cm    5 non-null      object 
 7   price_inr    4 non-null      float64
 8   flag_str     5 non-null      object 
 9   custom_text  5 non-null      object 
dtypes: float64(2), object(8)
memory usage: 532.0+ bytes


##### Handle blank values in dataset before going ahead

- replace blanks ('',' ','  ' etc) with NaN
- also replace values like ['N/A', 'n/a', 'null', 'None', 'na'] with np.nan
- Why blanks needs to be handled?
   - Because blanks (like '' or ' ') are not automatically treated as missing values by Pandas.
     So unless you convert them, they will:
        - Escape missing value detection
        - Break numerical conversion
        - Corrupt statistics (mean, median, etc.)
        - Lead to incorrect row counts or faulty imputation
- If blanks not handled then below problems happens :
     - pd.to_numeric() on '' fails ('' becomes NaN silently)
     - df.isna().sum() won’t count '' as missing
     - Mean/Median computation skewed if '' treated as 0 or string
     - ML models may crash or misinterpret values

In [3]:
# handle blanks before type conversion also

df = df.replace('',np.nan)
df.info()

# earlier data type of 'height_cm' column was object, now it changes to float64
# also that column earlier showing 5 non_null values cz it was considering blank as value
# now its 4, same for others too

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   col          3 non-null      object 
 1   name         4 non-null      object 
 2   date_str     4 non-null      object 
 3   gender       4 non-null      object 
 4   unit         5 non-null      object 
 5   value        5 non-null      float64
 6   height_cm    4 non-null      float64
 7   price_inr    4 non-null      float64
 8   flag_str     4 non-null      object 
 9   custom_text  4 non-null      object 
dtypes: float64(3), object(7)
memory usage: 532.0+ bytes


In [4]:
df  # all the blank values replaced with np.nan

Unnamed: 0,col,name,date_str,gender,unit,value,height_cm,price_inr,flag_str,custom_text
0,10.0,Alice Smith,01/05/2022,M,cm,150.0,170.0,8300.0,True,hello
1,,Bob-Jones,2022-06-15,F,mm,3000.0,160.0,16600.0,False,WORLD
2,25.5,Charlie,,M,g,500.0,150.0,12450.0,True,test
3,100.0,David Brown,15-07-2023,F,m,1.5,180.0,0.0,,
4,,,2021/12/31,,g,250.0,,,False,Example


In [5]:
# handle inconsistency in values of flag_str column
# it would be our boolean column but values in it need consistency

# Custom Format Mapping, also can be done using .apply()
df['flag_str'] = df['flag_str'].str.lower().map({'true':True, 'false':False}) 
df

Unnamed: 0,col,name,date_str,gender,unit,value,height_cm,price_inr,flag_str,custom_text
0,10.0,Alice Smith,01/05/2022,M,cm,150.0,170.0,8300.0,True,hello
1,,Bob-Jones,2022-06-15,F,mm,3000.0,160.0,16600.0,False,WORLD
2,25.5,Charlie,,M,g,500.0,150.0,12450.0,True,test
3,100.0,David Brown,15-07-2023,F,m,1.5,180.0,0.0,,
4,,,2021/12/31,,g,250.0,,,False,Example


#### 2-1) Changing or correcting data types = 
astype(int), astype(float), astype(str), astype('category'), astype(bool), 
pd.to_datetime(), pd.to_numeric(), pd.to_numeric(errors='coerce'), 
pd.to_timedelta(), df['col'].replace('', np.nan).astype(float) (handle blanks before 
type conversion), infer_objects(), df['col'].apply(custom_function) (for complex 
or custom conversions) 

##### changing data types using astype()
- we will change data types of each column according to type of data available in them.
    - df['col'] contains decimal values so it will be float data type
    - df['name'] --> string values --> string data type
    - df['date_str'] --> dates --> datetime data type
    - df['gender'] --> categorical values --> category data type
    - df['unit'] --> categorical values --> category data type
    - df['value'] --> decimal values --> float64
    - df['height_cm'] --> integer values but NaN can't convert to integer so float64
    - df['price_inr'] --> integer values but NaN can't convert to integer so float64
    - df['flag_str'] --> boolean values --> Use 'boolean' if only True/False/NaN (convert with pd.Series.astype('boolean')); otherwise, 'string'
    - df['custom_text'] --> text values --> string data type

In [6]:
# data types before type conversion
dtb = df.dtypes  # another way to check data types

In [7]:
df = df.astype({
    'col' : 'float64',
    'name' : 'string',
    'date_str' : 'datetime64[ns]',
    'gender' : 'category',
    'unit' : 'category',
    'value' : 'float64',
    'height_cm' : 'float64',
    'price_inr' : 'float64',
    'flag_str' : 'boolean',
    'custom_text' : 'string'
})

df.dtypes

col                   float64
name           string[python]
date_str       datetime64[ns]
gender               category
unit                 category
value                 float64
height_cm             float64
price_inr             float64
flag_str              boolean
custom_text    string[python]
dtype: object

In [8]:
# data types after type conversion
dta = df.dtypes

In [9]:
before_after_type_conversion = pd.DataFrame({
    'Before Type Conversion':dtb,
    'After Type Conversion' : dta
})

before_after_type_conversion

Unnamed: 0,Before Type Conversion,After Type Conversion
col,object,float64
name,object,string[python]
date_str,object,datetime64[ns]
gender,object,category
unit,object,category
value,float64,float64
height_cm,float64,float64
price_inr,float64,float64
flag_str,object,boolean
custom_text,object,string[python]


In [10]:
df

Unnamed: 0,col,name,date_str,gender,unit,value,height_cm,price_inr,flag_str,custom_text
0,10.0,Alice Smith,2022-01-05,M,cm,150.0,170.0,8300.0,True,hello
1,,Bob-Jones,2022-06-15,F,mm,3000.0,160.0,16600.0,False,WORLD
2,25.5,Charlie,NaT,M,g,500.0,150.0,12450.0,True,test
3,100.0,David Brown,2023-07-15,F,m,1.5,180.0,0.0,,
4,,,2021-12-31,,g,250.0,,,False,Example


##### difference between 'NaN in name column' & 'NaN in height_cm column'

- 'np.nan' (used in numeric/float types, object, category):
    - It is a float-based missing value.
    - Does not support equality checks (np.nan == np.nan is False).
    - Comes from NumPy.
*
-  '<NA' (used in string, boolean, Int64, Float64):
    - Pandas’ own NA (pd.NA) designed for nullable types.
    - Better behavior for equality, comparisons, and aggregation.
    - Part of Pandas' newer nullable dtypes.

##### using specialized pandas functions
- pd.to_datetime(df['col'])	Converts to datetime
- pd.to_numeric(df['col'])	Converts to numeric (int/float), fails on strings
- pd.to_numeric(df['col'], errors='coerce')	Invalid entries become NaN instead of error
- pd.to_timedelta(df['col'])	Converts durations like '2 days', '5h' to timedelta objects

##### Other useful tools
- df.infer_objects()	Automatically converts columns with object dtype to best possible type
- df['col'].apply(custom_function)	Use for complex or row-wise custom conversion logic

#### 2-2) Correcting Inconsistent Formats =
str.replace(), str.extract(), str.split(), 
datetime.strftime() / strptime(), Regex-based pattern matching, 
Custom format mapping logic 

In [11]:
# 1. str.replace() = fixes typos, symbols, or unify formats
df['name'] = df['name'].str.replace('-', ' ')   

# 2. str.split() = Splits strings into parts based on a delimiter.
df[['first', 'last']] = df['name'].str.split(' ', n=1, expand=True) 
# ' ' means split on space
# n = no of splits
# expand = False gives a column of lists, keep expand = True always

In [12]:
df

Unnamed: 0,col,name,date_str,gender,unit,value,height_cm,price_inr,flag_str,custom_text,first,last
0,10.0,Alice Smith,2022-01-05,M,cm,150.0,170.0,8300.0,True,hello,Alice,Smith
1,,Bob Jones,2022-06-15,F,mm,3000.0,160.0,16600.0,False,WORLD,Bob,Jones
2,25.5,Charlie,NaT,M,g,500.0,150.0,12450.0,True,test,Charlie,
3,100.0,David Brown,2023-07-15,F,m,1.5,180.0,0.0,,,David,Brown
4,,,2021-12-31,,g,250.0,,,False,Example,,


In [13]:
# 3. datetime.strptime() / strftime() = Converts between strings and datetime objects.

# Convert datetime to string format
df['date_str'] = df['date_str'].dt.strftime('%Y-%m-%d') 

# Convert string to datetime (custom format). 
# This tells pandas to infer each row's format individually, useful when formats are inconsistent.
df['date_str'] = df['date_str'] = pd.to_datetime(df['date_str'], format='mixed', errors='coerce')  

# Another way to convert string to datetime when we have mixed datetime formats
# pandas infer automatically
#df['date_str'] = pd.to_datetime(df['date_str'], errors='coerce', dayfirst=False)

# This handles formats like:
# "2022-01-05" (%Y-%m-%d)
# "01/05/2022" (%d/%m/%Y)
# "15-07-2023" (%d-%m-%Y)
# "2021/12/31" (%Y/%m/%d)

df

Unnamed: 0,col,name,date_str,gender,unit,value,height_cm,price_inr,flag_str,custom_text,first,last
0,10.0,Alice Smith,2022-01-05,M,cm,150.0,170.0,8300.0,True,hello,Alice,Smith
1,,Bob Jones,2022-06-15,F,mm,3000.0,160.0,16600.0,False,WORLD,Bob,Jones
2,25.5,Charlie,NaT,M,g,500.0,150.0,12450.0,True,test,Charlie,
3,100.0,David Brown,2023-07-15,F,m,1.5,180.0,0.0,,,David,Brown
4,,,2021-12-31,,g,250.0,,,False,Example,,


#### 2-3) Unifying units of Measurement = 
- Manual mapping/dictionary replacement,
- Conditional logic (np.where, 
apply())
-  Standard mathematical transformations (scaling factors) 

In [14]:
# 1. Manual Mapping / Dictionary Replacement 
# Used when unit names are stored as strings or categories
unit_map = {'cm': 0.01, 'mm': 0.001, 'm': 1}
df['value_m'] = df['value'] * df['unit'].map(unit_map)

# 2. Conditional Logic (np.where, .apply())
# Use when transformation depends on condition per row.

# 2-1) With where()
import numpy as np
df['value_kg'] = np.where(df['unit'] == 'g', df['value'] / 1000, df['value'])

# 2-2) With apply()
df['value_standard'] = df.apply(
    lambda row: row['value'] / 100 if row['unit'] == 'cm' else row['value'],
    axis=1
)

# 3. Standard Mathematical Transformations
# Use multiplication/division to convert units (scaling factors).

# cm to meters
df['height_m'] = df['height_cm'] / 100

# INR to USD (example rate)
df['price_usd'] = df['price_inr'] / 83.0

In [15]:
# final dataframe
df

Unnamed: 0,col,name,date_str,gender,unit,value,height_cm,price_inr,flag_str,custom_text,first,last,value_m,value_kg,value_standard,height_m,price_usd
0,10.0,Alice Smith,2022-01-05,M,cm,150.0,170.0,8300.0,True,hello,Alice,Smith,1.5,150.0,1.5,1.7,100.0
1,,Bob Jones,2022-06-15,F,mm,3000.0,160.0,16600.0,False,WORLD,Bob,Jones,3.0,3000.0,3000.0,1.6,200.0
2,25.5,Charlie,NaT,M,g,500.0,150.0,12450.0,True,test,Charlie,,,0.5,500.0,1.5,150.0
3,100.0,David Brown,2023-07-15,F,m,1.5,180.0,0.0,,,David,Brown,1.5,1.5,1.5,1.8,0.0
4,,,2021-12-31,,g,250.0,,,False,Example,,,,0.25,250.0,,
