In [10]:
import re
import pandas as pd
import numpy as np

In [11]:
# Load the raw listing into `df`; later cells parse its 'FILENAME' column.
df = pd.read_csv('PolygonParser.csv')

In [12]:
# Reserve the three output columns up front, filled with missing values.
# Creating them here fixes their position in the frame (Polygon, Project
# Number, Work Order) before the parsing cell overwrites their contents.
for new_column in ('Polygon', 'Project Number', 'Work Order'):
    df[new_column] = pd.NA

In [13]:
def extract_project_number(filename):
    """Return the first project number found in ``filename``.

    A project number is the literal ``R0`` (letter R, digit zero) followed by
    four digits, a hyphen and three digits, e.g. ``R01941-020``.

    Parameters
    ----------
    filename : object
        A value taken from the 'FILENAME' column. Non-string values (for
        example the float NaN pandas produces for an empty CSV cell) are
        treated as containing no project number.

    Returns
    -------
    str or None
        The first occurrence of the pattern, or None when there is none.
    """
    # The `re` functions raise TypeError on non-strings such as NaN, which
    # would abort the whole `.apply` call; report "no match" instead.
    if not isinstance(filename, str):
        return None

    # Only the first occurrence is wanted, so stop scanning as soon as it is found.
    match = re.search(r'R0\d{4}-\d{3}', filename)
    return match.group(0) if match else None


In [14]:
def extract_work_order(filename):
    """Return the first work order found in ``filename``.

    A work order is ``EO`` followed by four or five digits, wrapped in
    parentheses, e.g. ``(EO12345)``. The parentheses are part of the
    returned value.

    Parameters
    ----------
    filename : object
        A value taken from the 'FILENAME' column. Non-string values (for
        example the float NaN pandas produces for an empty CSV cell) are
        treated as containing no work order.

    Returns
    -------
    str or None
        The first occurrence of the pattern, or None when there is none.
    """
    # The `re` functions raise TypeError on non-strings such as NaN, which
    # would abort the whole `.apply` call; report "no match" instead.
    if not isinstance(filename, str):
        return None

    # Only the first occurrence is wanted, so stop scanning as soon as it is found.
    match = re.search(r'\(EO\d{4,5}\)', filename)
    return match.group(0) if match else None

In [19]:
def extract_polygon(filename):
    """Return the polygon name at the start of ``filename``.

    Two patterns are tried in order:

    1. Everything from the start of the string up to and including ``CAB``
       followed by an optional space, optional hyphen, optional space and a
       number (e.g. ``... CAB 2`` or ``... CAB-1``).
    2. Otherwise, everything from the start of the string up to (but not
       including) the first whitespace followed by ``R0``; when no such
       marker exists, the whole string.

    The result is stripped of surrounding whitespace.

    Parameters
    ----------
    filename : object
        A value taken from the 'FILENAME' column. Non-string values (for
        example the float NaN pandas produces for an empty CSV cell) yield
        None.

    Returns
    -------
    str or None
        The polygon name, or None when nothing usable could be extracted
        (non-string input, no match, or an empty result).
    """
    # The `re` functions raise TypeError on non-strings such as NaN, which
    # would abort the whole `.apply` call; report "nothing found" instead.
    if not isinstance(filename, str):
        return None

    cab_match = re.search(r'^(.*?CAB(?:\s?-?\s?\d+))', filename)
    if cab_match:
        polygon = cab_match.group(1).strip()
    else:
        # Fallback: cut at the project-number marker, or take the whole string.
        fallback_match = re.search(r'^(.*?)(?:\sR0|$)', filename)
        polygon = fallback_match.group(1).strip() if fallback_match else ''

    # The fallback can capture an empty string (empty filename, or a filename
    # that starts with whitespace + "R0"). Return None in that case so a null
    # check on the 'Polygon' column actually flags rows that failed to parse.
    return polygon or None


In [20]:
# Fill each output column by running its extractor over every 'FILENAME' value.
# The mapping is iterated in insertion order: 'Project Number', 'Work Order', 'Polygon'.
column_extractors = {
    'Project Number': extract_project_number,
    'Work Order': extract_work_order,
    'Polygon': extract_polygon,
}
for target_column, extractor in column_extractors.items():
    df[target_column] = df['FILENAME'].apply(extractor)

# Preview the first rows of the parsed frame.
df.head()

Unnamed: 0,FILENAME,Polygon,Project Number,Work Order
0,BURLINGTON ACM_02004 FDH0028 CAB 2 R01941-020 ...,BURLINGTON ACM_02004 FDH0028 CAB 2,R01941-020,
1,BURLINGTON ACM_2006_FDH0038 CAB-2 R01941-023 (...,BURLINGTON ACM_2006_FDH0038 CAB-2,R01941-023,
2,Burlington ACM_02004_FDH0023 R01941-026 (438FS...,Burlington ACM_02004_FDH0023,R01941-026,
3,Burlington ACM_02004_FDH0031 CAB-1 - UNVERIFIE...,Burlington ACM_02004_FDH0031 CAB-1,,
4,Burlington ACM_02004_FDH0031 CAB-2 - UNVERIFIE...,Burlington ACM_02004_FDH0031 CAB-2,,


Unnamed: 0,FILENAME,Polygon,Project Number,Work Order
0,BURLINGTON ACM_02004 FDH0028 CAB 2 R01941-020 ...,BURLINGTON ACM_02004 FDH0028 CAB 2,R01941-020,
1,BURLINGTON ACM_2006_FDH0038 CAB-2 R01941-023 (...,BURLINGTON ACM_2006_FDH0038 CAB-2,R01941-023,
2,Burlington ACM_02004_FDH0023 R01941-026 (438FS...,Burlington ACM_02004_FDH0023,R01941-026,
3,Burlington ACM_02004_FDH0031 CAB-1 - UNVERIFIE...,Burlington ACM_02004_FDH0031 CAB-1,,
4,Burlington ACM_02004_FDH0031 CAB-2 - UNVERIFIE...,Burlington ACM_02004_FDH0031 CAB-2,,


In [21]:
# Collect the rows whose 'Polygon' value is missing into a separate frame.
# Boolean-mask selection yields a new frame, so `df` itself is left untouched.
polygon_missing = df['Polygon'].isnull()

# Renumber the surviving rows from 0 and discard the old index.
df_copy = df.loc[polygon_missing].reset_index(drop=True)

# Show the rows that have no polygon.
print(df_copy)

Empty DataFrame
Columns: [FILENAME, Polygon, Project Number, Work Order]
Index: []
Empty DataFrame
Columns: [FILENAME, Polygon, Project Number, Work Order]
Index: []


In [22]:
# Print the parsed frame as plain text for a final visual check.
print(df)

# Write the frame, including the parsed columns, to CSV; index=False keeps the row index out of the file.
df.to_csv('PolygonParser_Output.csv', index=False)
# NOTE(review): `complete_values` is not defined in the visible cells, so this second export stays disabled.
#complete_values.to_csv('PolygonParser_Complete.csv', index=False)

                                             FILENAME  \
0   BURLINGTON ACM_02004 FDH0028 CAB 2 R01941-020 ...   
1   BURLINGTON ACM_2006_FDH0038 CAB-2 R01941-023 (...   
2   Burlington ACM_02004_FDH0023 R01941-026 (438FS...   
3   Burlington ACM_02004_FDH0031 CAB-1 - UNVERIFIE...   
4   Burlington ACM_02004_FDH0031 CAB-2 - UNVERIFIE...   
..                                                ...   
82  R01941-116 MEBANE 85 CABINET 1.(109HTC_LCC) - ...   
83  R01942-013.GOLDSBORO LCP #13B.(1015AS_LCC) - U...   
84  UNVERIFIED ADDRESS LIST BURLINGTON ACM 02006 F...   
85  UNVERIFIED ADDRESS LIST GLEN RAVEN ACM 03007 F...   
86  UNVERIFIED ADDRESS LIST R01941 019 BURLINGTON ...   

                                              Polygon Project Number  \
0                  BURLINGTON ACM_02004 FDH0028 CAB 2     R01941-020   
1                   BURLINGTON ACM_2006_FDH0038 CAB-2     R01941-023   
2                        Burlington ACM_02004_FDH0023     R01941-026   
3                  Burlingt