#### Objective
To reformat recycler data received from MPCB into a clean, standardized tabular structure suitable for directory compilation.

#### Brief Description
This script extracts specific columns from the raw recycler dataset based on their positional index and rearranges them into a predefined output structure.

* Each recycler is assigned a serial number.
* Industry name, district, RO/SRO, product type, capacity, phone number, and email are extracted.
* The script ensures consistency in column order and naming.
* The formatted output is saved as a new Excel file for further review or publication.

##### Ewaste

In [11]:
import pandas as pd

# Load the raw e-waste recycler list received from MPCB.
new_companies = pd.read_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Ewaste_MPCB.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)

# NOTE(review): the printed headers are a sheet title followed by 'Unnamed: N',
# which suggests the real header row sits inside the data. If so, that row is
# carried into the output and counted in "Sr. No." -- confirm, and consider
# passing `header=` / `skiprows=` to read_excel.

# Output column -> 0-based position of the source column in the raw sheet.
# Address, Unit of Measurement and Contact Person Name are not extracted
# for this dataset.
column_positions = {
    "Name of Industry": 1,
    "District": 4,
    "RO": 2,
    "SRO": 3,
    "Product": 5,
    "Capacity (Quantity)": 6,
    "Phone Number": 10,
    "Email": 9,
}

# One output record per input row. "Sr. No." is the row label plus one, so it
# starts at 1 for the default 0-based labels produced by read_excel.
formatted_data = [
    {
        "Sr. No.": index + 1,
        **{name: row.iloc[position] for name, position in column_positions.items()},
    }
    for index, row in new_companies.iterrows()
]

# Convert formatted data into a DataFrame
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted data to Excel
formatted_df.to_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Ewaste_MPCB_Sorted.xlsx", index=False)

# Report the file that was actually written.
print("Formatted data saved as 'Ewaste_MPCB_Sorted.xlsx'.")

New Companies Columns: Index(['Authorized E-Waste Recyclers / Refurbishers / Dismantlers in Maharashtra',
       'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17'],
      dtype='object')
Formatted data saved as 'MPCB_Formatted_PWP_bracket.xlsx'.


##### Ferrous

In [14]:
import pandas as pd

# Load the raw ferrous recycler list received from MPCB.
new_companies = pd.read_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Ferrous_MPCB.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)

# Output column -> 0-based position of the source column in the raw sheet.
# The trailing comments give the source header at that position, as shown by
# the column listing printed above. RO is not extracted for this dataset.
column_positions = {
    "Name of Industry": 1,       # 'IndustryName'
    "Address": 7,                # 'Industry unit location'
    "District": 9,               # 'District'
    "SRO": 15,                   # 'Submitted to office'
    "Product": 17,               # 'ProductName'
    "Capacity (Quantity)": 18,   # 'Total'
    "Unit of Measurement": 19,   # 'Product UOM'
    "Contact Person Name": 4,    # 'Name'
    "Phone Number": 3,           # 'TelNo'
    "Email": 2,                  # 'Email'
}

# One output record per input row. "Sr. No." is the row label plus one, so it
# starts at 1 for the default 0-based labels produced by read_excel.
formatted_data = [
    {
        "Sr. No.": index + 1,
        **{name: row.iloc[position] for name, position in column_positions.items()},
    }
    for index, row in new_companies.iterrows()
]

# Convert formatted data into a DataFrame
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted data to Excel
formatted_df.to_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Ferrous_MPCB_Sorted.xlsx", index=False)

# Report the file that was actually written.
print("Formatted data saved as 'Ferrous_MPCB_Sorted.xlsx'.")

New Companies Columns: Index(['Unique id', 'IndustryName', 'Email', 'TelNo', 'Name', 'Taluka',
       'Industry category', 'Industry unit location', 'Industry unit taluka',
       'District', 'Application type', 'Category', 'Application scale',
       'Granting authority', 'Gross capital', 'Submitted to office',
       'Product category', 'ProductName', 'Total', 'Product UOM',
       'Raw material name', 'Raw Qty', 'Uom name'],
      dtype='object')
Formatted data saved as 'MPCB_Formatted.xlsx'.


##### SpentSolvent 

In [17]:
import pandas as pd

# Load the raw spent-solvent recycler list received from MPCB.
new_companies = pd.read_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/SpentSolvent_MPCB.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)

# NOTE(review): the printed headers are a sheet title followed by 'Unnamed: N',
# which suggests the real header row sits inside the data. If so, that row is
# carried into the output and counted in "Sr. No." -- confirm, and consider
# passing `header=` / `skiprows=` to read_excel.

# Output column -> 0-based position of the source column in the raw sheet.
# Unit of Measurement is not extracted for this dataset.
column_positions = {
    "Name of Industry": 1,
    "Address": 2,
    "District": 5,
    "RO": 3,
    "SRO": 4,
    "Product": 6,
    "Capacity (Quantity)": 7,
    "Contact Person Name": 11,
    "Phone Number": 14,
    "Email": 13,
}

# One output record per input row. "Sr. No." is the row label plus one, so it
# starts at 1 for the default 0-based labels produced by read_excel.
formatted_data = [
    {
        "Sr. No.": index + 1,
        **{name: row.iloc[position] for name, position in column_positions.items()},
    }
    for index, row in new_companies.iterrows()
]

# Convert formatted data into a DataFrame
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted data to Excel
formatted_df.to_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/SpentSolvent_MPCB_Sorted.xlsx", index=False)

# Report the file that was actually written.
print("Formatted data saved as 'SpentSolvent_MPCB_Sorted.xlsx'.")

New Companies Columns: Index(['LIST of RECYCLERS / REPROCESSORS under Rule 9 - Spent Solvent',
       'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14'],
      dtype='object')
Formatted data saved as 'MPCB_Formatted.xlsx'.


##### Tyre_EPR

In [20]:
import pandas as pd

# Load the raw tyre (EPR) recycler list received from MPCB.
new_companies = pd.read_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Tyre_EPR_MPCB.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)

# NOTE(review): almost every printed header is 'Unnamed: N', which suggests the
# real header row sits inside the data. If so, that row is carried into the
# output and counted in "Sr. No." -- confirm, and consider passing
# `header=` / `skiprows=` to read_excel.

# Output column -> 0-based position of the source column in the raw sheet.
# Address, District, SRO, Unit of Measurement and Contact Person Name are not
# extracted for this dataset.
column_positions = {
    "Name of Industry": 1,
    "RO": 2,
    "Product": 3,
    "Capacity (Quantity)": 4,
    "Phone Number": 8,
    "Email": 7,
}

# One output record per input row. "Sr. No." is the row label plus one, so it
# starts at 1 for the default 0-based labels produced by read_excel.
formatted_data = [
    {
        "Sr. No.": index + 1,
        **{name: row.iloc[position] for name, position in column_positions.items()},
    }
    for index, row in new_companies.iterrows()
]

# Convert formatted data into a DataFrame
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted data to Excel
formatted_df.to_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Tyre_MPCB_Sorted.xlsx", index=False)

# Report the file that was actually written.
print("Formatted data saved as 'Tyre_MPCB_Sorted.xlsx'.")

New Companies Columns: Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', ' '],
      dtype='object')
Formatted data saved as 'MPCB_Formatted.xlsx'.


##### Used Oil 

##### Modified to handle multiple values separated by line breaks within a single cell of the Product and Capacity columns.
##### Some values are separated by several consecutive line breaks, hence you will see blank rows in between.

In [27]:
import pandas as pd

# Load the raw used-oil reprocessor list received from MPCB.
new_companies = pd.read_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Used_Oil_MPCB.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)

# NOTE(review): the printed headers are a sheet title followed by 'Unnamed: N',
# which suggests the real header row sits inside the data. If so, that row is
# carried into the output and counted in "Sr. No." -- confirm, and consider
# passing `header=` / `skiprows=` to read_excel.


def _split_lines(value):
    """Return the newline-separated parts of a cell value.

    An empty (NaN) cell yields a single empty string, so the literal text
    "nan" is never written to the output. Empty parts produced by
    consecutive line breaks are kept as they are.
    """
    if pd.isna(value):
        return [""]
    return str(value).split('\n')


# Create a list to store formatted data
formatted_data = []

# A cell in the Product / Capacity columns may hold several values, one per
# line. Each value gets its own output row; the other fields (and the serial
# number) are repeated on every row belonging to the same input row.
for index, row in new_companies.iterrows():
    products = _split_lines(row.iloc[5])
    capacities = _split_lines(row.iloc[6])

    # Pad the shorter list with empty strings so both can be walked in lockstep.
    row_count = max(len(products), len(capacities))
    products += [""] * (row_count - len(products))
    capacities += [""] * (row_count - len(capacities))

    for product, capacity in zip(products, capacities):
        formatted_data.append({
            "Sr. No.": index + 1,  # row label plus one; repeated for split rows
            "Name of Industry": row.iloc[1],
            "District": row.iloc[4],
            "RO": row.iloc[2],
            "SRO": row.iloc[3],
            "Product": product,
            "Capacity (Quantity)": capacity,
            "Contact Person Name": row.iloc[11],
            "Phone Number": row.iloc[12],
            "Email": row.iloc[10],
        })

# Convert formatted data into a DataFrame
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted data to Excel
formatted_df.to_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/UsedOil_MPCB_Sorted.xlsx", index=False)

print("Formatted data saved as 'UsedOil_MPCB_Sorted.xlsx'.")

New Companies Columns: Index(['List of Authorised Used Oil  Reprocessors', 'Unnamed: 1', 'Unnamed: 2',
       'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7',
       'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11',
       'Unnamed: 12'],
      dtype='object')
Formatted data saved as 'UsedOil_MPCB_Sorted.xlsx'.


##### Battery

In [33]:
import pandas as pd

# Load the raw battery (EPR) recycler list received from MPCB.
new_companies = pd.read_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Battery/Battery_EPR_MPCB.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)

# NOTE(review): every printed header is 'Unnamed: N', which suggests the real
# header row sits inside the data. If so, that row is carried into the output
# and counted in "Sr. No." -- confirm, and consider passing
# `header=` / `skiprows=` to read_excel.

# Output column -> 0-based position of the source column in the raw sheet.
# Address, District, Unit of Measurement and Contact Person Name are not
# extracted for this dataset.
column_positions = {
    "Name of Industry": 1,
    "RO": 2,
    "SRO": 3,
    "Product": 4,
    "Capacity (Quantity)": 5,
    "Phone Number": 9,
    "Email": 8,
}

# One output record per input row. "Sr. No." is the row label plus one, so it
# starts at 1 for the default 0-based labels produced by read_excel.
formatted_data = [
    {
        "Sr. No.": index + 1,
        **{name: row.iloc[position] for name, position in column_positions.items()},
    }
    for index, row in new_companies.iterrows()
]

# Convert formatted data into a DataFrame
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted data to Excel
formatted_df.to_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/Battery_MPCB_Sorted.xlsx", index=False)

# Report the file that was actually written.
print("Formatted data saved as 'Battery_MPCB_Sorted.xlsx'.")

New Companies Columns: Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14'],
      dtype='object')
Formatted data saved as 'MPCB_Formatted.xlsx'.


##### ELV 

In [36]:
import pandas as pd

# Load the raw ELV recycler list received from MPCB.
new_companies = pd.read_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/ELV/ELV_MPCB.xlsx")

# Print column names to verify actual column headers
print("New Companies Columns:", new_companies.columns)

# NOTE(review): every printed header is 'Unnamed: N', which suggests the real
# header row sits inside the data. If so, that row is carried into the output
# and counted in "Sr. No." -- confirm, and consider passing
# `header=` / `skiprows=` to read_excel.

# Output column -> 0-based position of the source column in the raw sheet.
column_positions = {
    "Name of Industry": 1,
    "Address": 3,
    "District": 6,
    "RO": 4,
    "SRO": 5,
    "Product": 7,
    "Capacity (Quantity)": 8,
    "Unit of Measurement": 9,
    "Contact Person Name": 15,
    "Phone Number": 18,
    "Email": 17,
}

# One output record per input row. "Sr. No." is the row label plus one, so it
# starts at 1 for the default 0-based labels produced by read_excel.
formatted_data = [
    {
        "Sr. No.": index + 1,
        **{name: row.iloc[position] for name, position in column_positions.items()},
    }
    for index, row in new_companies.iterrows()
]

# Convert formatted data into a DataFrame
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted data to Excel
formatted_df.to_excel(r"C:/Users/Atique/Rutuja_Mam_Coding/Recycler_Directory/Mar19/ELV/ELV_MPCB_Sorted.xlsx", index=False)

# Report the file that was actually written.
print("Formatted data saved as 'ELV_MPCB_Sorted.xlsx'.")

New Companies Columns: Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
       'Unnamed: 18'],
      dtype='object')
Formatted data saved as 'MPCB_Formatted.xlsx'.
