#### Step 1: New Company Detection & Fetching Missing Details from Directory to EPR

In [3]:
import pandas as pd
import re

# -------------Part1: Detecting New Companies which are not part of Old Directory-------------
# Old directory: the listing that the updated directory is compared against.
directory1 = pd.read_excel(
    "C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/PWP_EPR_MPCB.xlsx"
)

# Updated directory: read from the 'O-60' sheet of the updated recycler workbook.
directory2 = pd.read_excel(
    "C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Data_MPCB_IT_UpdatedRecycler.xlsx",
    sheet_name="O-60",
)

# Function to clean company names (remove M/s variations, ignore case, strip spaces)
def clean_name(name):
    """Normalise a company name so that spelling variants compare equal.

    A leading "M/s" prefix (any letter case, optionally followed by "." or ":",
    with or without surrounding whitespace) is removed, then the result is
    stripped of outer whitespace and lower-cased.

    Args:
        name: The raw cell value. Missing values (None/NaN) and non-string
            values such as numbers are accepted.

    Returns:
        The normalised name as a str; "" when the value is missing.
    """
    if pd.isna(name):
        return ""
    # Spreadsheet cells can hold numbers; re.sub() raises TypeError on a
    # non-string, so coerce before matching.
    text = str(name)
    # A single pattern covers every prefix form, including leading whitespace
    # combined with trailing punctuation (e.g. " M/s. ABC").
    return re.sub(r"^\s*M/s\s*[.:]?\s*", "", text, flags=re.IGNORECASE).strip().lower()


# Add a normalised-name column to each directory (via clean_name) for matching.
directory1["Cleaned_Name"] = directory1["Company"].map(clean_name)  # old directory
directory2["Cleaned_IndustryName"] = directory2["IndustryName"].map(clean_name)  # updated directory

# Names already present in the old directory, in two forms:
# the normalised names, and the raw names merely trimmed and lower-cased.
existing_cleaned_names = set(directory1["Cleaned_Name"].dropna())
existing_raw_names = set(
    directory1["Company"]
    .dropna()
    .str.strip()
    .str.lower()
)

# Function to check if a company exists in old directory
def is_existing_company(raw_name, cleaned_name, existing_names):
    """Return True when the company is already listed in ``existing_names``.

    Args:
        raw_name: The company name as read from the sheet; may be missing
            (None/NaN) or a non-string value.
        cleaned_name: The normalised form of the same name.
        existing_names: Collection of known names supporting ``in`` lookups.

    Returns:
        True if the trimmed, lower-cased raw name or the cleaned name is a
        member of ``existing_names``; False otherwise.
    """
    # Missing cells arrive as NaN/None rather than str; calling .strip() on
    # them would raise AttributeError.
    raw_key = "" if pd.isna(raw_name) else str(raw_name).strip().lower()
    # A blank key identifies no company, so it never counts as a match even
    # when a blank entry happens to be present in existing_names.
    return any(key and key in existing_names for key in (raw_key, cleaned_name))

# Identifying new companies in the updated directory
# The combined lookup set is built once here; forming the union inside the
# per-row callback would rebuild it for every row of directory2.
known_names = existing_cleaned_names | existing_raw_names
already_listed = pd.Series(
    [
        is_existing_company(raw, cleaned, known_names)
        for raw, cleaned in zip(directory2["IndustryName"], directory2["Cleaned_IndustryName"])
    ],
    index=directory2.index,  # same index as directory2 so the mask lines up row for row
    dtype=bool,
)
new_data = directory2[~already_listed]

# Selecting required columns for new companies
data_arranged = new_data[["IndustryName", "Address", "District", "RO OFFICER", "SRO OFFICER", "Name", "Email", "TelNo",
                          "ProductName", "Total", "Product UOM"]]


data_arranged.to_excel("New_Companies_PWP.xlsx", index=False)

# Part2--- Updating Old Directory with Matched Companies' Details -------------------

# Where several rows of directory2 share a cleaned name, only the first is kept.
directory2_unique = directory2.drop_duplicates(subset=["Cleaned_IndustryName"], keep="first")

# Lookup keyed by cleaned name -> {"RO OFFICER": ..., "Name": ...}; these two
# columns are missing in the old directory and are filled in from here.
matched_details = (
    directory2_unique
    .set_index("Cleaned_IndustryName")
    .loc[:, ["RO OFFICER", "Name"]]
    .to_dict(orient="index")
)

# Function to fetch RO OFFICER and Name details for matched companies
def fetch_matched_details(cleaned_name):
    """Look up the 'RO OFFICER' / 'Name' record stored under ``cleaned_name``.

    Reads the module-level ``matched_details`` mapping. Returns the stored
    record, or a record with both fields blank when the name has no entry.
    """
    if cleaned_name in matched_details:
        return matched_details[cleaned_name]
    return {"RO OFFICER": "", "Name": ""}

# Apply function to update old directory with matched details
# One lookup per company, collected into a single frame. This avoids
# constructing a separate pd.Series for every row inside .apply().
matched_rows = [fetch_matched_details(name) for name in directory1["Cleaned_Name"]]
directory1[["RO OFFICER", "Name"]] = pd.DataFrame(
    matched_rows,
    index=directory1.index,  # keep row alignment with directory1
    columns=["RO OFFICER", "Name"],
)


directory1.to_excel("Updated_Old_Directory.xlsx", index=False)

print("New companies saved in 'New_Companies_PWP.xlsx'")
print("Updated old directory saved in 'Updated_Old_Directory.xlsx'")


New companies saved in 'New_Companies_PWP.xlsx'
Updated old directory saved in 'Updated_Old_Directory.xlsx'


#### Step 2: Separating Comma-Separated Values into Rows

In [7]:
import pandas as pd
import numpy as np


# Workbook of new companies to expand; the path is relative to the working directory.
file_path = "New_Companies_PWP.xlsx"
data = pd.read_excel(file_path)

# Function to split values while handling ProductName separately
def _split_cell(value):
    """Split a comma-separated cell into a list of strings; a missing cell gives []."""
    if pd.isna(value):
        return []
    return str(value).split(",")


def split_row(row):
    """Expand one row into several, one per comma-separated 'Total' value.

    'Total' decides how many rows are produced. 'ProductName' and
    'Product UOM' are adjusted to that count:

    * more product names than totals: the names are grouped into that many
      chunks (np.array_split) and each chunk is re-joined with ",";
    * fewer product names than totals: the remainder is filled with "";
    * a single 'Product UOM' value is reused for every produced row;
    * otherwise a short 'Product UOM' list is filled with "" and surplus
      entries are ignored.

    Args:
        row: A pandas Series with 'ProductName', 'Total' and 'Product UOM'.
            Missing (NaN/None) cells are tolerated.

    Returns:
        A list of Series copies of ``row`` with the three fields replaced by
        their per-entry values. If 'Total' is missing, ``[row]`` is returned
        unchanged.
    """
    product_names = _split_cell(row["ProductName"])
    totals = _split_cell(row["Total"])
    product_uom = _split_cell(row["Product UOM"])

    num_entries = len(totals)  # one output row per 'Total' value
    if num_entries == 0:
        # Nothing to split on; also keeps np.array_split from being asked for 0 sections.
        return [row]

    # Adjusting 'ProductName' to match 'Total' count
    if len(product_names) > num_entries:
        name_groups = np.array_split(product_names, num_entries)
        product_names = [",".join(group) for group in name_groups]
    # Fill missing names (multiplying a list by a negative count yields []).
    product_names = product_names + [""] * (num_entries - len(product_names))

    # Adjusting 'Product UOM' to match 'Total' count; indexing a shorter list
    # per entry would raise IndexError.
    if len(product_uom) == 1:
        product_uom = product_uom * num_entries
    else:
        product_uom = product_uom[:num_entries] + [""] * (num_entries - len(product_uom))

    new_rows = []  # Creating new rows equal to multiple values detected.
    for name, total, unit in zip(product_names, totals, product_uom):
        new_row = row.copy()
        new_row["ProductName"] = name
        new_row["Total"] = total
        new_row["Product UOM"] = unit
        new_rows.append(new_row)

    return new_rows


# Rows whose 'Total' contains a comma are replaced by the rows split_row()
# returns for them; every other row is carried over unchanged.
expanded_rows = []
for _, record in data.iterrows():
    has_multiple_totals = "," in str(record["Total"])
    expanded_rows.extend(split_row(record) if has_multiple_totals else [record])

expanded_data = pd.DataFrame(expanded_rows)

expanded_data.to_excel("New_Companies_PWP_Expanded.xlsx", index=False)

print("Expanded dataset saved in 'New_Companies_PWP_Expanded.xlsx'")


Expanded dataset saved in 'New_Companies_PWP_Expanded.xlsx'


#### Step 3: Converting to MPCB Format

In [14]:
import pandas as pd
import re  

# Part1: Extract text inside brackets if present
def extract_bracketed_name(value):
    """Return the text inside the first "(...)" pair found in ``value``.

    Non-string values, and strings without a bracketed part, are returned
    unchanged.
    """
    if not isinstance(value, str):
        return value
    found = re.search(r"\((.*?)\)", value)
    return found.group(1) if found else value


# Folder holding the input workbooks and receiving the formatted output.
# File names are appended as separate raw literals: inside a raw string a
# backslash followed by a line break is kept as literal characters (it is not
# a line continuation), which would embed a newline and spaces in the path.
PWP_DIR = r"C:\Users\Atique\Rutuja_Mam_Coding\Recycler_Directory\Mar19\PWP"

# Output column -> source column, listed in output column order.
NEW_COMPANY_COLUMNS = {
    "Name of Industry": "IndustryName",
    "Address": "Address",
    "District": "District",
    "RO": "RO OFFICER",
    "SRO": "SRO OFFICER",
    "Product": "ProductName",
    "Capacity (Quantity)": "Total",
    "Unit of Measurement": "Product UOM",
    "Contact Person Name": "Name",
    "Phone Number": "TelNo",
    "Email": "Email",
}
OLD_COMPANY_COLUMNS = {
    "Name of Industry": "Company",
    "Address": "Address",
    "District": "Dist",
    "RO": "RO OFFICER",
    "SRO": "SRO",
    "Product": "Product",
    "Capacity (Quantity)": "Quantity (TPA)",
    "Unit of Measurement": "UOM",
    "Contact Person Name": "Name",
    "Phone Number": "Mob.No",
    "Email": "E-mail id",
}


def _append_mpcb_records(records, frame, column_map, status):
    """Append one MPCB-format record per row of ``frame`` to ``records``.

    Args:
        records: List of dicts being built; extended in place. Serial numbers
            continue from its current length.
        frame: Source DataFrame.
        column_map: Dict mapping output column name -> source column name.
        status: Value stored in the "Status" field of every appended record.
    """
    for _, row in frame.iterrows():
        record = {"Sr. No.": len(records) + 1}
        for target, source in column_map.items():
            value = row[source]
            if target in ("RO", "SRO"):
                # Keep only the name inside "(...)" when one is present.
                value = extract_bracketed_name(value)
            record[target] = value
        record["Status"] = status
        records.append(record)


# Part2: Load the expanded new companies and the updated old directory.
# NOTE(review): Step 1 writes 'Updated_Old_Directory.xlsx' to the working
# directory, while this reads 'Updated_Old_Directory_PWP.xlsx' from PWP_DIR.
# Presumably the file is renamed/moved by hand in between; confirm.
new_companies = pd.read_excel(PWP_DIR + r"\New_Companies_PWP_Expanded.xlsx")
old_companies = pd.read_excel(PWP_DIR + r"\Updated_Old_Directory_PWP.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)
print("Old Companies Columns:", old_companies.columns)

# Create a list to store formatted data
formatted_data = []

# Part3: ------- Process New Companies (Non-EPR) -------------
_append_mpcb_records(formatted_data, new_companies, NEW_COMPANY_COLUMNS, "Non-EPR")

# Part4: ------- Process Old Companies (EPR); serial numbering continues -------
_append_mpcb_records(formatted_data, old_companies, OLD_COMPANY_COLUMNS, "EPR")


formatted_df = pd.DataFrame(formatted_data)


formatted_df.to_excel(PWP_DIR + r"\MPCB_Formatted_PWP_bracket.xlsx", index=False)

print("Formatted data saved as 'MPCB_Formatted_PWP_bracket.xlsx'.")


New Companies Columns: Index(['IndustryName', 'Address', 'District', 'RO OFFICER', 'SRO OFFICER',
       'Name', 'Email', 'TelNo', 'ProductName', 'Total', 'Product UOM'],
      dtype='object')
Old Companies Columns: Index(['Sr. No.', 'Company', 'Address', 'State', 'Dist', 'SRO', 'Product',
       'Quantity (TPA)', 'QTY.(TPA)', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'UOM', 'Mob.No', 'E-mail id', 'Consent No.',
       'Cleaned_Name', 'RO OFFICER', 'Name'],
      dtype='object')
Formatted data saved as 'MPCB_Formatted_PWP_bracket.xlsx'.
