In [13]:
import os
import pandas as pd
from pathlib import Path

# The CSVs are read from a "data" folder that sits next to the current
# working directory (i.e. <cwd>/../data).
base_path = Path.cwd().parent.joinpath("data")

df_1 = pd.read_csv(base_path.joinpath("leads_dataset_1.csv"))
df_2 = pd.read_csv(base_path.joinpath("leads_dataset_2.csv"))


  df_2 = pd.read_csv(base_path / "leads_dataset_2.csv")


# Wrangling

#### The two datasets have overlapping columns. 
1. Email IDs are present only in df_2 and are spread across 3 fields: Company, Mobile Number, and Website. 
2. We will combine them into a single column named 'Email' 

In [14]:
import re

# Basic email pattern: the whole (whitespace-stripped) value must look like
# local-part@domain.tld
email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w{2,}$'

def find_email_columns(df, sample_size=500):
    """Return the text columns of ``df`` holding at least one email-like value.

    A value counts as an email when, after stripping surrounding whitespace,
    it matches ``email_pattern`` in full.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are scanned.
    sample_size : int or None, default 500
        How many leading non-null values of each column are inspected. The
        default keeps the scan fast; pass ``None`` to inspect every non-null
        value (emails appearing only after the sample are otherwise missed).

    Returns
    -------
    list
        Column labels, in ``df.columns`` order, that contain an email-like
        value within the inspected sample.
    """
    email_cols = []
    for col in df.columns:
        column = df[col]
        # Text may be stored as plain ``object`` or as one of pandas'
        # dedicated string dtypes; comparing against 'object' alone would
        # silently skip the latter.
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue

        candidates = column.dropna()
        if sample_size is not None:
            candidates = candidates.head(sample_size)

        # Stringify only the sampled values (object columns can hold
        # non-string items), then test them all in one vectorized pass.
        looks_like_email = candidates.astype(str).str.strip().str.match(email_pattern)
        if bool(looks_like_email.any()):
            email_cols.append(col)
    return email_cols

# Detect email-bearing columns in each dataframe and report them.
email_columns_df1, email_columns_df2 = (
    find_email_columns(df_1),
    find_email_columns(df_2),
)

print("DF1 columns containing emails:", email_columns_df1)
print("DF2 columns containing emails:", email_columns_df2)
import re

def extract_email(val):
    """Return the first email-like substring found in ``val``, else ``None``.

    Anything that is not a ``str`` (numbers, NaN, None, ...) yields ``None``.
    Unlike a full-value match, the address may be embedded in longer text.
    """
    if not isinstance(val, str):
        return None

    found = re.search(r'[\w\.-]+@[\w\.-]+\.\w{2,}', val)
    if found is None:
        return None
    return found.group(0)

# For every row, scan Company, Mobile Number and Website (in that order) and
# keep the first email found; rows without any email get None.
df_2['Email'] = df_2[['Company', 'Mobile Number', 'Website']].apply(
    lambda row: next(filter(None, map(extract_email, row)), None),
    axis=1,
)
df_2['Email'].unique()

DF1 columns containing emails: []
DF2 columns containing emails: ['Company', 'Mobile Number', 'Website']


array([None, 'vinod.shah1@gmail.com', 'Yogeshsadarang@yahoo.in',
       'robbinpandita100@gmail.com', 'vkaradkar@in.imshealth.com',
       'rakshata.nikam@sharekhan.com', 'smart.ashwin@yahoo.com',
       'nehamenon26@rediffmail.com', 'sankpal1pooja@gmail.com',
       'vidula2211@gmail.com', 'manish.chand97@Yahoo.com',
       'rombo@mai.com', 'nidhisingh9029291638@gmail.com',
       'anjalimeena5991@gmail.com', 'snpnd45@gmail.com',
       'dhananjay_joshi2007@gmail.com', 'Vaibhavi10rhyme@yahoo.co.in',
       'lalitashetty210992@gmail.com', 'sagar.arya89@gmail.com',
       'paulhilda205@yahoo.com', 'mrprabhutendolkar@gmail.com',
       'rudrakshi1791@gmail.com', 'kkpranay@gmail.com',
       'sheeri230@hotmail.com', 'drvrundagb@gmail.com',
       'sudarshan.mishra@dizsupportad.com', 'shashankj1291@gmail.com',
       'hildabarretto2015@gmail.com', 'hepsi.cfy@gmail.com',
       'Sabhjit8182@rediffmail.com', 'gaurav.kataria@ericsson.com',
       'Tirthasathe@gmail.com', 'ansaribushra93@gmail

3. We can merge the two dataframes now 

In [3]:
# NOTE: restricting df_2 to rows that have an extracted email is currently
# disabled, so every df_2 row takes part in the merge:
# df2_with_emails = df_2[df_2['Email'].notnull()]

# Inner-join the two datasets on their shared 'Lead Number' key.
df_merged = pd.merge(df_1, df_2, on='Lead Number', how='inner')

# NOTE(review): the export target is a hard-coded absolute path on one
# machine — confirm before running elsewhere.
df_merged.to_csv(r'C:\Users\AakashAI\Desktop\Repositories\Sales Agent\merged_leads.csv', index=False)

# Summarise the merged frame: number of columns and number of rows.
all_columns = list(df_merged.columns)
print("Total columns:", len(all_columns))
print("New Dataset Size:", len(df_merged))



Total columns: 159
New Dataset Size: 9240


In [21]:
# Number of distinct values in the merged 'Email' column (the missing marker counts as one).
len(pd.unique(df_merged['Email']))

60

4. We are going to filter the leads that came from a landing page submission and have either a company or an email listed


In [25]:
# Keep leads whose origin is 'Landing Page Submission' (case- and
# whitespace-insensitive) AND that have a company or an email on record.
filtered_leads = df_merged.loc[
    lambda leads: (
        leads['Lead Origin_x'].str.strip().str.lower().eq('landing page submission')
        & (leads['Company'].notna() | leads['Email'].notna())
    )
]
# Optional step, currently disabled: drop columns that are more than 70% sparse
# filtered_leads = filtered_leads.loc[:, filtered_leads.isnull().mean() < 0.7]

# Export the filtered leads.
# NOTE(review): hard-coded absolute path on one machine — confirm before running elsewhere.
output_path = r'C:\Users\AakashAI\Desktop\Repositories\Sales Agent\landing_page_leads.csv'
filtered_leads.to_csv(output_path, index=False)

# Report how many leads survived the filter and where they were written.
print("Filtered leads count:", len(filtered_leads))
print("Output saved to:", output_path)


Filtered leads count: 84
Output saved to: C:\Users\AakashAI\Desktop\Repositories\Sales Agent\landing_page_leads.csv


In [26]:
# Inspect the extracted emails of the filtered leads.
filtered_leads.loc[:, 'Email']

18                                    None
115                Yogeshsadarang@yahoo.in
271                                   None
479             vkaradkar@in.imshealth.com
616           rakshata.nikam@sharekhan.com
                       ...                
8387                                  None
8432                                  None
8473                                  None
8748                                  None
8867    shubham.jain1@maxlifeinsurance.com
Name: Email, Length: 84, dtype: object

In [27]:
# Mapping from column names in the merged frame to the names used in the clean
# export: a trailing '_x' (presumably the suffix the merge gave to df_1's
# overlapping columns — confirm) is dropped, every other name is kept as-is.
# Currently left out of the selection: 'Lead Number_x', 'Country_x', 'Age'.
necessary_columns = {
    source: source[:-2] if source.endswith('_x') else source
    for source in (
        'Lead Number',
        'Lead Source_x',
        'Lead Origin_x',
        'Do Not Email_x',
        'Do Not Call_x',
        'Converted',
        'Mobile Number',
        'Lead Stage',
        'Lead Grade',
        'Lead Score',
        'Email',
        'City',
        'Company',
    )
}

In [28]:
# Select only the needed columns, give them their clean names, and export.
df_clean = (
    filtered_leads
    .loc[:, list(necessary_columns)]
    .rename(columns=necessary_columns)
)
# NOTE(review): hard-coded absolute path on one machine — confirm before running elsewhere.
df_clean.to_csv(r'C:\Users\AakashAI\Desktop\Repositories\Sales Agent\cleaned_leads.csv', index=False)

In [29]:
# Number of distinct values in the cleaned 'Company' column (the missing marker counts as one).
len(pd.unique(df_clean['Company']))

67

In [30]:
# Number of distinct values in the cleaned 'Email' column (the missing marker counts as one).
len(pd.unique(df_clean['Email']))

20

In [31]:
# Display the cleaned leads table.
df_clean

Unnamed: 0,Lead Number,Lead Source,Lead Origin,Do Not Email,Do Not Call,Converted,Mobile Number,Lead Stage,Lead Grade,Lead Score,Email,City,Company
18,660522,Google,Landing Page Submission,No,No,1,,Qualified,,165,,Mumbai,Dr. Ram Manohar Lohia Avadh University /ugc
115,659357,Google,Landing Page Submission,No,No,0,,Unreachable,,120,Yogeshsadarang@yahoo.in,Mumbai,Yogeshsadarang@yahoo.in
271,657572,Referral Sites,Landing Page Submission,No,No,1,,Qualified,,180,,Other Metro Cities,vinoba bhave university
479,655287,Direct Traffic,Landing Page Submission,No,No,0,,Not Interested,,90,vkaradkar@in.imshealth.com,Mumbai,vkaradkar@in.imshealth.com
616,654061,Direct Traffic,Landing Page Submission,No,No,1,,Qualified,B,80,rakshata.nikam@sharekhan.com,Mumbai,rakshata.nikam@sharekhan.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8387,585811,Organic Search,Landing Page Submission,No,No,0,,Unreachable,B,170,,Select,Social Kinnect
8432,585443,Google,Landing Page Submission,No,No,1,,Closed,,120,,Thane & Outskirts,mumbai university
8473,585174,Organic Search,Landing Page Submission,No,No,1,,Qualified,C,245,,Select,Kumar Metals
8748,583069,Google,Landing Page Submission,No,No,1,,Qualified,,90,,Thane & Outskirts,Graduate from mumbai university
