#### Goal - Data Type Handling (Type Conversion)

In [1]:
# import dataset

import numpy as np
import pandas as pd
df = pd.read_csv('Indian Startup Funding.csv')
df

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,
...,...,...,...,...,...,...,...,...,...,...
3039,3040,29/01/2015,Printvenue,,,,Asia Pacific Internet Group,Private Equity,4500000,
3040,3041,29/01/2015,Graphene,,,,KARSEMVEN Fund,Private Equity,825000,Govt backed VC Fund
3041,3042,30/01/2015,Mad Street Den,,,,"Exfinity Fund, GrowX Ventures.",Private Equity,1500000,
3042,3043,30/01/2015,Simplotel,,,,MakeMyTrip,Private Equity,,"Strategic Funding, Minority stake"


In [2]:
# initial checkup

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


In [3]:
# drop unnecssary columns
df.drop(columns=['Sr No'], axis=1, inplace=True)

# replace column names, strip them and change to title case
df.columns = df.columns.str.strip().str.replace(' dd/mm/yyyy','').str.title()

# rename column names
df.rename(columns = {
    'Subvertical':'Sub Vertical', 
    'Investmentntype':'Investment Type'}, inplace=True)
df.head()

Unnamed: 0,Date,Startup Name,Industry Vertical,Sub Vertical,City Location,Investors Name,Investment Type,Amount In Usd,Remarks
0,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [4]:
# check data types
df.dtypes

Date                 object
Startup Name         object
Industry Vertical    object
Sub Vertical         object
City  Location       object
Investors Name       object
Investment Type      object
Amount In Usd        object
Remarks              object
dtype: object

#### Change data types of columns to -
- Date to datetime
- Startup Name to string
- Industry Vertical to string
- Sub Vertical to string
- City  Location to string
- Investors Name to string
- Investment Type to Category
- Amount In Usd to float
- Remarks to String

In [5]:
# before going in dtype change make sure all numerical values are in consistent format
# 'Amount In Usd' column contains ',' in between digits, remove/replace them

df['Amount In Usd'] = df['Amount In Usd'].str.replace(',','') # correcting inconsistent formats

# change 'Amount In Usd' to numeric,'to_numeric' Converts to numeric (int/float), fails on strings
df['Amount In Usd'] = pd.to_numeric(df['Amount In Usd'],errors='coerce') # coerce will convert invalid entries to NaN, instead of raising error

# Blank Handling Before Type Conversion, essential for numeric columns, else throws error during type conversion
# Replaces blanks ('') with NaN
# optional for non_numeric columns
df.replace('', np.nan)

Unnamed: 0,Date,Startup Name,Industry Vertical,Sub Vertical,City Location,Investors Name,Investment Type,Amount In Usd,Remarks
0,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000.0,
1,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394.0,
2,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860.0,
3,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000.0,
4,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000.0,
...,...,...,...,...,...,...,...,...,...
3039,29/01/2015,Printvenue,,,,Asia Pacific Internet Group,Private Equity,4500000.0,
3040,29/01/2015,Graphene,,,,KARSEMVEN Fund,Private Equity,825000.0,Govt backed VC Fund
3041,30/01/2015,Mad Street Den,,,,"Exfinity Fund, GrowX Ventures.",Private Equity,1500000.0,
3042,30/01/2015,Simplotel,,,,MakeMyTrip,Private Equity,,"Strategic Funding, Minority stake"


In [6]:
# memory usage before type conversion in bytes
mem_before = df.memory_usage(deep=True)
mem_before.sum()

1656247

In [7]:
# convert data types
# handle blanks before using astype
df = df.astype({
    'Startup Name': 'string',
    'Industry Vertical': 'string',
    'Sub Vertical': 'string',
    'City  Location': 'string',
    'Investors Name': 'string',
    'Investment Type': 'category',
    'Amount In Usd': 'float',  # u cant convert this to 'Int64' cz it contains NaN(missing values) or non integer values
    'Remarks': 'string'
})

df['Date'] = pd.to_datetime(df['Date'], errors='coerce') # specialized pandas function

# now check dtypes
df.dtypes

Date                 datetime64[ns]
Startup Name         string[python]
Industry Vertical    string[python]
Sub Vertical         string[python]
City  Location       string[python]
Investors Name       string[python]
Investment Type            category
Amount In Usd               float64
Remarks              string[python]
dtype: object

In [8]:
# memory usage after type conversion in bytes
mem_after = df.memory_usage(deep=True)
mem_after.sum()

1398183

In [9]:
# comparison table of change in memory size due to type conversion
mem_compare = pd.DataFrame({
    'Before (Bytes)': mem_before,
    'After (Bytes)': mem_after,
    'Reduced By (Bytes)': mem_before - mem_after,
    'Reduced (%)': ((mem_before - mem_after) / mem_before * 100).round(2)
})

# Display columns & their memory usage
mem_compare = mem_compare.loc[df.columns]
mem_compare

Unnamed: 0,Before (Bytes),After (Bytes),Reduced By (Bytes),Reduced (%)
Date,203956,24352,179604,88.06
Startup Name,201976,201976,0,0.0
Industry Vertical,219170,224642,-5472,-2.5
Sub Vertical,213861,243813,-29952,-14.01
City Location,191208,196968,-5760,-3.01
Investors Name,275505,276273,-768,-0.28
Investment Type,213393,8981,204412,95.79
Amount In Usd,24352,24352,0,0.0
Remarks,112694,196694,-84000,-74.54


In [10]:
# final total impact on memory usage due to type conversion (1024 bytes = 1 kilobytes(kb))

print(f"Memory usage BEFORE type conversion is {mem_before.sum()} bytes")
print(f"Memory usage AFTER type conversion is {mem_after.sum()} bytes")

if mem_before.sum() > mem_after.sum():
    print(f"Memory usage reduced due to type conversion is {mem_before.sum() - mem_after.sum()} bytes")
elif mem_before.sum() < mem_after.sum():
    print(f"Memory usage increased due to type conversion is {mem_after.sum() - mem_before.sum()} bytes")
else:
    print('No change happened due to type conversion')

Memory usage BEFORE type conversion is 1656247 bytes
Memory usage AFTER type conversion is 1398183 bytes
Memory usage reduced due to type conversion is 258064 bytes


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1292 non-null   datetime64[ns]
 1   Startup Name       3044 non-null   string        
 2   Industry Vertical  2873 non-null   string        
 3   Sub Vertical       2108 non-null   string        
 4   City  Location     2864 non-null   string        
 5   Investors Name     3020 non-null   string        
 6   Investment Type    3040 non-null   category      
 7   Amount In Usd      2065 non-null   float64       
 8   Remarks            419 non-null    string        
dtypes: category(1), datetime64[ns](1), float64(1), string(6)
memory usage: 195.8 KB
