In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 🌾 **Stage 4: Expert Level

In [165]:
startup_funding_dataset = pd.read_csv('../data/startup_funding.csv', parse_dates=['Date dd/mm/yyyy'], index_col='Sr No')
startup_funding_dataset.head()

Unnamed: 0_level_0,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
Sr No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [166]:
startup_funding_dataset.info(), startup_funding_dataset.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 3044 entries, 1 to 3044
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Date dd/mm/yyyy    3044 non-null   object
 1   Startup Name       3044 non-null   object
 2   Industry Vertical  2873 non-null   object
 3   SubVertical        2108 non-null   object
 4   City  Location     2864 non-null   object
 5   Investors Name     3020 non-null   object
 6   InvestmentnType    3040 non-null   object
 7   Amount in USD      2084 non-null   object
 8   Remarks            419 non-null    object
dtypes: object(9)
memory usage: 237.8+ KB


(None,
 Date dd/mm/yyyy         0
 Startup Name            0
 Industry Vertical     171
 SubVertical           936
 City  Location        180
 Investors Name         24
 InvestmentnType         4
 Amount in USD         960
 Remarks              2625
 dtype: int64)

## Cleaning column Startup Name

### 1. Trim whitespace (leading/trailing).

In [167]:
# working on extract column company_canonical
startup_funding_dataset['company_canonical'] = startup_funding_dataset['Startup Name'].str.strip()

### 2. Remove unnecessary punctuation

extra spaces

trailing dots

duplicated characters

In [168]:
import re

In [169]:
text = "Hello!!!  World???  Pythonis amazing.com!!"
cleaned = re.sub(r'[^\w\s\.]', ' ', text)
cleaned_text = re.sub(r'\s+', ' ', cleaned)
print(cleaned_text) 

Hello World Pythonis amazing.com 


In [170]:
startup_funding_dataset['company_canonical'] = startup_funding_dataset.apply(
    lambda row: re.sub(
        r'\s+',' ',re.sub(r'[^\w\s\.]',' ', row['company_canonical'])
    ).strip(),
    axis=1
)

In [171]:
startup_funding_dataset.head()

Unnamed: 0_level_0,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks,company_canonical
Sr No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,,BYJU S
2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,,Shuttl
3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,,Mamaearth
4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,,https www.wealthbucket.in
5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,,Fashor


In [172]:
(startup_funding_dataset[startup_funding_dataset['Startup Name'] != startup_funding_dataset['company_canonical']]).head()

Unnamed: 0_level_0,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks,company_canonical
Sr No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,,BYJU S
4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,,https www.wealthbucket.in
54,23/08/2019,Lo! Foods,Consumer Goods,Low carb food for Diabetics,Bengaluru,"Rashmi Daga (founder, FreshMenu), Raveen Sastr...",Seed Round,500000,,Lo Foods
68,10/07/2019,"""BYJU\\'S""",EdTech,Education,Bengaluru,Qatar Investment Authority,Private Equity Round,150000000,,BYJU S
119,04/01/2019,Samunnati Financial Intermediation & Services ...,Finance,Non-banking financial company,Chennai,"MASSIF, a Dutch government fund",Debt-Funding,5000000,,Samunnati Financial Intermediation Services Pv...


### 3. Remove common domain endings and choose one case

* Flipkart.com → Flipkart
* PayTM vs Paytm (choose one canonical)

In [173]:
text = "Hello!!!  .World???  Pythonis amazing.com!!"
cleaned = re.sub(r'[^\w\s\.]', ' ', text)
cleaned_text = re.sub(r'\s+', ' ', cleaned).strip()
removing_domains = re.sub(r'(?<=\.)\w+$', '', cleaned_text).strip('.').strip()
print(removing_domains) 

Hello .World Pythonis amazing


In [174]:
startup_funding_dataset['company_canonical'] = startup_funding_dataset.apply(
    lambda row: re.sub(
    r'(?<=\.)\w+$', ' ', row['company_canonical']
    ).strip().strip('.').strip().lower().replace('https www.', '').replace('http www.', '').strip().replace('.', '').strip(),
    axis=1
)

### 4. Detect duplicates

Flipkart flipkart FlipKart
→ all become “Flipkart”

In [175]:
def delete_duplicated(text):
    words = text.split(' ')
    cleaned_text = []
    for i, word in enumerate(words):
        if i==0 or word != words[i-1]:
            cleaned_text.append(word)
    return " ".join(cleaned_text)
delete_duplicated("flipkart flipkart flipkart sjddj ksd 22 flipkart")

'flipkart sjddj ksd 22 flipkart'

In [176]:
startup_funding_dataset['company_canonical'] = startup_funding_dataset['company_canonical'].apply(delete_duplicated)

In [177]:
company_mapping = {k:(v['Startup Name'], v['company_canonical']) for k, v in startup_funding_dataset.iterrows()}
company_mapping

{1: ('BYJU’S', 'byju s'),
 2: ('Shuttl', 'shuttl'),
 3: ('Mamaearth', 'mamaearth'),
 4: ('https://www.wealthbucket.in/', 'wealthbucket'),
 5: ('Fashor', 'fashor'),
 6: ('Pando', 'pando'),
 7: ('Zomato', 'zomato'),
 8: ('Ecozen', 'ecozen'),
 9: ('CarDekho', 'cardekho'),
 10: ('Dhruva Space', 'dhruva space'),
 11: ('Rivigo', 'rivigo'),
 12: ('Healthians', 'healthians'),
 13: ('Licious', 'licious'),
 14: ('InCred', 'incred'),
 15: ('Trell', 'trell'),
 16: ('Rein Games', 'rein games'),
 17: ('Lenskart.com', 'lenskart'),
 18: ('Freshworks', 'freshworks'),
 19: ('Misters', 'misters'),
 20: ('Sunstone Eduversity Pvt. Ltd', 'sunstone eduversity pvt ltd'),
 21: ('Burger Singh', 'burger singh'),
 22: ('Healthians', 'healthians'),
 23: ('Ninjacart', 'ninjacart'),
 24: ('Aye Finance', 'aye finance'),
 25: ('SuperGaming', 'supergaming'),
 26: ('Clumio', 'clumio'),
 27: ('eBikeGo', 'ebikego'),
 28: ('Digital Mall Asia', 'digital mall asia'),
 29: ('Medikabazaar', 'medikabazaar'),
 30: ('Vogo Automot