In [1]:
import re
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('PolygonParser.csv')

In [3]:
#Create Columns
df['Polygon'] = pd.NA
df['Project Number'] = pd.NA
df['Work Order'] = pd.NA

In [4]:
def extract_project_number(filename):
    # Define the regular expression pattern to search for "RO####-###" in the 'FILENAME'.
    pattern = r'R0\d{4}-\d{3}'
    
    # Use the findall function to extract all occurrences of the pattern in the 'FILENAME'.
    matches = re.findall(pattern, filename)
    
    # If there are matches, return the first occurrence (assuming there's only one per filename).
    # If there are no matches, return None.
    return matches[0] if matches else None


In [5]:
def extract_work_order(filename):
    # Define the regular expression pattern to search for the work order in the 'FILENAME'.
    pattern = r'\(EO\d{4,5}\)'
    
    # Use the findall function to extract all occurrences of the pattern in the 'FILENAME'.
    matches = re.findall(pattern, filename)
    
    # If there are matches, return the first occurrence (assuming there's only one work order per filename).
    # If there are no matches, return None.
    return matches[0] if matches else None

In [6]:
def extract_polygon(filename):
    # Define the first regular expression pattern to search for the polygon in the 'FILENAME'.
    # The pattern captures everything up to 'CAB' followed by an optional hyphen and a number.
    pattern1 = r'^(.*?CAB(?:\s?-?\s?\d+))'
    
    # Use the findall function to extract all occurrences of the first pattern in the 'FILENAME'.
    matches1 = re.findall(pattern1, filename)
    
    if matches1:
        # If there are matches for the first pattern, return the first occurrence.
        return matches1[0].strip()
    else:
        # Define the second regular expression pattern to search for the polygon in the 'FILENAME'.
        # The second pattern captures everything up to "R0" without including "R0".
        pattern2 = r'^(.*?)(?:\sR0|$)'
        
        # Use the findall function to extract all occurrences of the second pattern in the 'FILENAME'.
        matches2 = re.findall(pattern2, filename)
        
        if matches2:
            # If there are matches for the second pattern, check if Polygon is the original string.
            # If Polygon is the original string, apply a third pattern to capture everything between two periods.
            if filename == matches2[0].strip():
                # Define the third regular expression pattern to capture everything between two periods.
                pattern3 = r'^(?:[^.]*\.)(.*?)(?:\.[^.]*|$)'
                
                # Use the findall function to extract all occurrences of the third pattern in the 'FILENAME'.
                matches3 = re.findall(pattern3, filename)
                
                # If there are matches for the third pattern, return the first occurrence.
                # If there are no matches for the third pattern, return None.
                return matches3[0].strip() if matches3 else None
            else:
                # If Polygon is not the original string, return whatever was captured by the second pattern.
                return matches2[0].strip()
        else:
            # If there are no matches for the second pattern, check for the fourth pattern.
            # The fourth pattern captures everything up to "UNVERIFIED ADDRESS LIST".
            pattern4 = r'^(.*?)(?:\s-?\s?UNVERIFIED ADDRESS LIST|$)'
            
            # Use the findall function to extract all occurrences of the fourth pattern in the 'FILENAME'.
            matches4 = re.findall(pattern4, filename)
            
            # If there are matches for the fourth pattern, return the first occurrence.
            # If there are no matches for any of the patterns, return None.
            return matches4[0].strip() if matches4 else None


In [7]:
def update_polygon(row):
    if row['Project Number'] and row['Polygon']:
        if row['Project Number'] in row['Polygon']:
            row['Polygon'] = row['Polygon'].replace(row['Project Number'], '').strip().strip('.')
    return row

In [8]:
def update_polygon_with_regex(row):
    # Get the 'FILENAME' from the row
    filename = row['FILENAME']
    
    # Define the regular expression pattern to capture everything up until the first dot '.'
    pattern = r'^(.*?)\.'
    
    # Use regex to extract the desired substring from 'FILENAME'
    match = re.match(pattern, filename)
    
    # If the regex matched, update the 'Polygon' column with the extracted substring
    if match:
        row['Polygon'] = match.group(1).strip()
    
    return row

In [9]:
def update_polygon_with_unverified(row):
    if row['Polygon']:
        # Remove 'UNVERIFIED ADDRESS LIST' along with '-' from the 'Polygon' column
        row['Polygon'] = row['Polygon'].replace('UNVERIFIED ADDRESS LIST -', '').strip()
        row['Polygon'] = row['Polygon'].replace('UNVERIFIED ADDRESS LIST', '').strip()
    return row

In [10]:
def update_polygon_remove_copy_of(row):
    if row['FILENAME']:
        # Check if the 'Polygon' column contains the substring 'Copy of'
        if 'Copy of' in row['FILENAME']:
            # Remove 'Copy of' from the 'Polygon' column
            row['FILENAME'] = row['FILENAME'].replace('Copy of', '').strip()
    return row

In [11]:
def remove_project_number_from_filename(row):
    # Get the 'Project Number' from the row
    project_number = row['Project Number']
    
    # Check if the 'Project Number' is not NaN
    if pd.notna(project_number):
        # Get the 'FILENAME' from the row
        filename = row['FILENAME']
        
        # Remove the 'Project Number' from the 'FILENAME' if it exists
        row['FILENAME'] = filename.replace(project_number, '').strip()
    
    return row

In [12]:
df = df.apply(update_polygon_remove_copy_of, axis=1)
# Now, let's apply the 'extract_project_number' function to the 'FILENAME' column in the DataFrame 'df' to create the new 'Project Number' column.
df['Project Number'] = df['FILENAME'].apply(extract_project_number)
# Now, let's apply the 'extract_work_order' function to the 'FILENAME' column in the DataFrame 'df' to create the new 'Work Order' column.
df['Work Order'] = df['FILENAME'].apply(extract_work_order)
# Now, let's apply the 'extract_polygon' function to the 'FILENAME' column in the DataFrame 'df' to create the new 'Polygon' column.
df['Polygon'] = df['FILENAME'].apply(extract_polygon)

df = df.apply(update_polygon, axis=1)
df = df.apply(update_polygon_with_unverified, axis=1)
# Print the updated DataFrame
print(df)

                                             FILENAME  \
0   BURLINGTON ACM_02004 FDH0028 CAB 2 R01941-020 ...   
1   BURLINGTON ACM_2006_FDH0038 CAB-2 R01941-023 (...   
2   Burlington ACM_02004_FDH0023 R01941-026 (438FS...   
3   Burlington ACM_02004_FDH0031 CAB-1 - UNVERIFIE...   
4   Burlington ACM_02004_FDH0031 CAB-2 - UNVERIFIE...   
..                                                ...   
82  R01941-116 MEBANE 85 CABINET 1.(109HTC_LCC) - ...   
83  R01942-013.GOLDSBORO LCP #13B.(1015AS_LCC) - U...   
84  UNVERIFIED ADDRESS LIST BURLINGTON ACM 02006 F...   
85  UNVERIFIED ADDRESS LIST GLEN RAVEN ACM 03007 F...   
86  UNVERIFIED ADDRESS LIST R01941 019 BURLINGTON ...   

                                Polygon Project Number Work Order  
0    BURLINGTON ACM_02004 FDH0028 CAB 2     R01941-020       None  
1     BURLINGTON ACM_2006_FDH0038 CAB-2     R01941-023       None  
2          Burlington ACM_02004_FDH0023     R01941-026       None  
3    Burlington ACM_02004_FDH0031 CAB-1    

In [13]:
# Create a copy of the DataFrame
df_remaining = df.copy()
# Drop rows where 'Polygon' column doesn't have parenthesis
df_remaining = df_remaining[df_remaining['Polygon'].str.contains(r'\(.*\)')]
print(df_remaining)

df_remaining = df_remaining.apply(remove_project_number_from_filename, axis=1)
df_remaining = df_remaining.apply(update_polygon_with_regex, axis=1)
df_remaining = df_remaining[['Polygon']]
df_remaining.set_index('Polygon', inplace=True)
merged_df = df.join(df_remaining, on='Polygon')
#print(merged_df)


                                             FILENAME  \
12  R01941-081#50 C1.(2069BBR_LLC) - unverified AD...   
13  R01941-118 Mebane 88.(615HD_LCC) - UNVERIFIED ...   
34  R01941-032 MEBANE 20 C1.(303GS_LCC) - UNVERIFI...   
66  R01941-081 MEBANE 50 CABINET 2.(1808BBR_LCC) -...   
67  R01941-117 MEBANE 87.(663RL_LCC) - UNVERIFIED ...   
82  R01941-116 MEBANE 85 CABINET 1.(109HTC_LCC) - ...   

                                    Polygon Project Number Work Order  
12  (2069BBR_LLC) - unverified ADDRESS LIST     R01941-081       None  
13                            (615HD_LCC) -     R01941-118       None  
34                     (303GS_LCC) -  (002)     R01941-032       None  
66                          (1808BBR_LCC) -     R01941-081       None  
67                            (663RL_LCC) -     R01941-117       None  
82                           (109HTC_LCC) -     R01941-116       None  


In [14]:
# Create a copy of the DataFrame 'df'.
#df_copy = df.copy()

# Keep only the rows in the copied DataFrame 'df_copy' where the 'Polygon' column is None.
#df_copy = df_copy[df_copy['Polygon'].isnull()]

# Reset the index of 'df_copy' to ensure a consistent index order.
#df_copy.reset_index(drop=True, inplace=True)

# Check the modified DataFrame 'df_copy' with rows filtered.
#print(df_copy)

In [15]:
#print(df)

merged_df.to_csv('Polygon1_Output.csv', index=False)
#complete_values.to_csv('PolygonParser_Complete.csv', index=False)