#### Objective
To automatically update missing or outdated contact details in a formatted directory by matching industry names with a reference industry dataset.

#### Brief Description
This script performs intelligent matching between two Excel files using a cleaned version of industry names.

* Industry names are standardized by removing prefixes (e.g., M/s), suffixes (e.g., Pvt Ltd), and special characters.
* Matching is performed using the cleaned industry names instead of exact text matches.
* When a match is found:
   * Contact Person Name
   * Phone Number
   * Email
   are copied from the industry reference file into the format file.
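* Existing values in the format file are kept when no match is found or when the matched reference entry has no value for that field; otherwise the reference value replaces them.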
* The final updated dataset is saved as a new Excel file.

This approach avoids issues caused by name spelling variations and ensures maximum contact data completeness in the recycler directory.

#### Adding Name, Phone Number, and Email

In [57]:
import pandas as pd
import re

def clean_industry_name(name):
    """Return a normalized industry name for use as a matching key.

    The "M/s" prefix and "Pvt Ltd"/"Private Limited"/"Ltd" style suffixes are
    removed, every character other than ASCII letters, digits and spaces is
    dropped, and runs of spaces are collapsed to a single space. Letter case
    is left unchanged.

    Args:
        name: Raw cell value. Missing values (None/NaN) yield an empty
            string; other non-string values are converted with ``str``.

    Returns:
        The cleaned name, without leading or trailing spaces.
    """
    if pd.isna(name):
        return ""
    # Excel cells may hold numbers; re.sub only accepts strings.
    name = str(name)
    # Remove all forms of M/S and Pvt Ltd variations
    name = re.sub(r'\b(M/?S[:.]?|M/s[:.]?)\b', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\b(PVT\.?\s*LTD|PRIVATE\s*LIMITED|Pvt\.?\s*Ltd\.?|Pvt\s*Ltd\.?|PVT\s*LTD|Ltd\.?|LTD|PRIVATE\.?\s*LTD|Pvt\.?\s*LTD|pvt\s*ltd)\b', '', name, flags=re.IGNORECASE)
    # Remove special characters except space
    name = re.sub(r'[^a-zA-Z0-9 ]', '', name)
    # Removing a token from the middle of a name leaves doubled spaces, which
    # would stop otherwise identical names from matching; collapse them.
    return " ".join(name.split())

def update_cpn(format_file_path, industry_file_path, output_file_path):
    """Fill contact details in the format workbook from the industry reference workbook.

    Rows are matched on the cleaned form of their industry name (see
    ``clean_industry_name``). For each matched row the reference ``Name``,
    ``TelNo`` and ``Email`` values are written to ``Contact name``,
    ``Mobile number`` and ``Email`` respectively. A reference value takes
    precedence; the existing value is kept only where the reference has none.

    Args:
        format_file_path: Excel file to update; must contain an
            ``Industry Name`` column.
        industry_file_path: Reference Excel file; must contain
            ``IndustryName``, ``Name``, ``TelNo`` and ``Email`` columns.
        output_file_path: Destination path for the updated Excel file.

    Raises:
        KeyError: If a required column is missing from either workbook.
    """
    # Load both Excel files
    format_df = pd.read_excel(format_file_path)
    industry_df = pd.read_excel(industry_file_path)

    # Clean the industry-name columns so both sides share a comparable key
    key = 'Cleaned Industry Name'
    format_df[key] = format_df['Industry Name'].apply(clean_industry_name)
    industry_df[key] = industry_df['IndustryName'].apply(clean_industry_name)

    # reference column -> column to update in the format file
    column_map = {'Name': 'Contact name', 'TelNo': 'Mobile number', 'Email': 'Email'}
    # Temporary names keep the reference columns from colliding with columns
    # of the same name in the format file (no reliance on merge suffixes).
    temp_names = {ref: f'__ref_{ref}' for ref in column_map}

    lookup = industry_df[[key, *column_map]].rename(columns=temp_names)
    # Blank names all clean to "", so they must never be treated as a match.
    lookup = lookup[lookup[key] != ""]
    # Several reference rows with the same cleaned name would multiply the
    # matching format rows in a left merge; keep the first occurrence only.
    lookup = lookup.drop_duplicates(subset=key, keep='first')

    # Merge dataframes on cleaned industry name; every format row appears once
    merged_df = format_df.merge(lookup, on=key, how='left', validate='many_to_one')

    # Update the required columns in place, preferring the reference value
    for ref, target in column_map.items():
        ref_values = merged_df[temp_names[ref]]
        if target in merged_df.columns:
            merged_df[target] = ref_values.combine_first(merged_df[target])
        else:
            merged_df[target] = ref_values

    # Drop helper columns
    merged_df.drop(columns=[key, *temp_names.values()], inplace=True)

    # Save updated file
    merged_df.to_excel(output_file_path, index=False)
    print(f"Updated file saved as: {output_file_path}")

# Example usage:
# update_cpn("format_file.xlsx", "industry_file.xlsx", "updated_format_file.xlsx")


In [59]:
# Run the update for the Mar21 recycler directory: the combined recycler sheet
# is filled from the MPCB portal industry list and saved as a new workbook.
update_cpn(
    format_file_path="C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar21_PostSorted_AddingName/Mar21_Task2/Combined_Sheet_of list_of_recyclers.xlsx",
    industry_file_path="C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar21_PostSorted_AddingName/Atiq_sorted/All_industry_basic_info-MPCB_portal.xlsx",
    output_file_path="C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar21_PostSorted_AddingName/Mar21_Task2/Updated_Combined_Sheet_Recyclers.xlsx",
)

Updated file saved as: C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar21_PostSorted_AddingName/Mar21_Task2/Updated_Combined_Sheet_Recyclers.xlsx
