# AmbitionBox Data Cleaning Pipeline

This notebook performs comprehensive data preprocessing on the AmbitionBox dataset to prepare it for Exploratory Data Analysis (EDA).

**Steps involved:**
1. Loading the raw dataset.
2. Implementing a robust numeric converter for the 'k' (thousands, ×1,000) and 'L' (lakhs, ×100,000) suffixes.
3. Cleaning metric columns (reviews, salaries, etc.).
4. Parsing `service_type` (format: `Industry | HQ Location +X more`) into `industry`, `hq`, and a `more_locations` count.
5. Normalizing data types and removing redundant columns.

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Read the raw AmbitionBox export from the working directory and take a
# first look at its size and leading rows.
raw_csv_path = 'ambitionbox.csv'
df = pd.read_csv(raw_csv_path)

print("Initial Dataset Shape:", df.shape)
df.head()

In [None]:
# First number in a lower-cased string, plus an optional scale suffix that
# directly follows it: 'k' (thousand) or 'l' / 'lakh(s)' / 'lac(s)' (lakh).
# The negative look-ahead keeps ordinary words that merely start with one of
# those letters (e.g. "locations") from being read as a suffix.
_NUMERIC_WITH_SUFFIX = re.compile(
    r'(\d+(?:\.\d*)?)(?:\s*(l(?:akhs?|acs?)?|k)(?![a-z]))?'
)

# Multiplier for each suffix, keyed by the suffix's first letter.
_SUFFIX_MULTIPLIERS = {'l': 100000, 'k': 1000}


def convert_to_numeric(val):
    """
    Convert a count string with an optional 'k' / 'L' suffix to a float.

    The suffix is only honoured when it is attached to the number itself
    (optionally separated by whitespace), so unrelated letters elsewhere in
    the text do not change the scale.

    E.g., '1.1L' -> 110000.0, '71.4k' -> 71400.0, '718' -> 718.0,
    '1,234' -> 1234.0, '12 locations' -> 12.0

    Args:
        val: Raw cell value; may be a string, a number, or a missing value.

    Returns:
        The parsed value as a float. Missing values and text that contains
        no digits yield 0.0.
    """
    if pd.isna(val):
        return 0.0

    # Normalise case and strip thousands separators before matching.
    text = str(val).lower().replace(',', '')

    match = _NUMERIC_WITH_SUFFIX.search(text)
    if match is None:
        return 0.0

    number = float(match.group(1))
    suffix = match.group(2)
    if suffix is None:
        return number

    # Scaling a decimal such as 1.1 by 100000 leaves binary floating-point
    # noise (110000.00000000001); rounding restores the intended value.
    return round(number * _SUFFIX_MULTIPLIERS[suffix[0]], 6)

In [None]:
# List of columns that represent numeric counts
metrics = ['reviews', 'salaries', 'interviews', 'jobs', 'benefits', 'photos']

# Work only with the metric columns that actually exist in this dataset, and
# reuse the same list for the preview below so that a missing metric column
# cannot raise a KeyError there.
present_metrics = [col for col in metrics if col in df.columns]

for col in present_metrics:
    df[col] = df[col].apply(convert_to_numeric)

# Clean ratings (convert to float; missing or unparseable values become 0.0)
df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce').fillna(0.0)

df[present_metrics + ['ratings']].head()

In [None]:
# Split service_type into industry and location components
# Format: 'Industry | HQ Location +X more'

# astype(str) keeps the .str accessor usable for any cell value, but it also
# turns a missing service_type into the literal text 'nan'. Remember which rows
# were really missing so that text does not end up as a fake industry.
service_missing = df['service_type'].isna()

# Split once and reuse the pieces for both derived columns.
service_parts = df['service_type'].astype(str).str.split('|')

df['industry'] = service_parts.str[0].str.strip().mask(service_missing)
df['location_info'] = service_parts.str[1].str.strip()

# Extract Headquarters (HQ) and count of additional locations
df['hq'] = df['location_info'].str.split('+').str[0].str.strip()

# expand=False returns a Series (not a one-column DataFrame) for the single
# capture group, which is what a column assignment expects.
df['more_locations'] = df['location_info'].str.extract(r'\+(\d+)', expand=False)

# Convert more_locations to integer, default 0 if missing
df['more_locations'] = pd.to_numeric(df['more_locations'], errors='coerce').fillna(0).astype(int)

df[['industry', 'hq', 'more_locations']].head()

In [None]:
# Remove helper columns and leftovers that are no longer needed; labels that
# are not present in the frame are skipped rather than raising.
cols_to_drop = ['Unnamed: 0', 'service_type', 'location_info', 'services_type', 'locations']
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

# Put the columns in a reading order suited to EDA, keeping only the ones
# that exist in the frame.
ordered_cols = [
    'name', 'industry', 'ratings',
    'reviews', 'salaries', 'interviews', 'jobs', 'benefits', 'photos',
    'hq', 'more_locations',
]
available_cols = [col for col in ordered_cols if col in df.columns]
df = df[available_cols]

print("Final Dataset Info:")
df.info()
df.head()

In [None]:
# Write the cleaned frame to disk without the pandas index column.
cleaned_csv_path = "ambitionbox_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

print("Success: Cleaned dataset saved as 'ambitionbox_cleaned.csv'")