
This function reads a CSV file and returns a pandas DataFrame.

Parameters:
file_path (str): The path to the CSV file

Returns:
pd.DataFrame: The data from the CSV file, or None if the file is missing or cannot be read

In [None]:
import pandas as pd
import re

def read_csv_file(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters:
    file_path (str): The path to the CSV file

    Returns:
    pd.DataFrame: The parsed data, or None when the file does not exist or
    reading it fails (a message is printed in either case).
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"No file found at {file_path}")
        return None
    except Exception as err:
        # Any other read/parse failure is reported rather than raised.
        print(f"An error occurred: {err}")
        return None

    print("File read successfully!")
    return frame
# Load the assignment CSV into the module-level frame `df`, which the
# extraction step further down operates on. read_csv_file returns None when
# the file cannot be read, in which case None is what gets printed here.
df = read_csv_file('./Assignment 0 Part A.csv')
print(df)


This function processes each row in the DataFrame. We did not find a way to process the fields separately, so all of the extraction logic is placed in this part.
    
Parameters:
df (pd.DataFrame): The DataFrame to process

In [None]:
def extract_data(df):
    """Fill the contact columns of ``df`` from each row's ``Description`` text.

    The frame is modified in place. Missing values are first replaced with
    'Unknown', the output columns are converted to strings, and then every
    row's description is parsed for name, company, address, phone and email.

    Parameters:
    df (pd.DataFrame): The DataFrame to process. It must contain a
        'Description' column and the output columns listed in
        ``text_columns`` below.

    Returns:
    None: the result is written back into ``df``.
    """
    text_columns = [
        'first_name', 'last_name', 'company_name', 'address', 'city',
        'county', 'state', 'phone1', 'phone2', 'email',
    ]
    # Compiled once, outside the row loop.
    phone_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
    # Inside a character class '|' is a literal, so the TLD class is
    # [A-Za-z] rather than [A-Z|a-z].
    email_pattern = re.compile(
        r'(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)'
    )

    # Fill NaN values with a default value
    df.fillna('Unknown', inplace=True)

    # Convert columns to string type
    df[text_columns] = df[text_columns].astype(str)

    for i, row in df.iterrows():
        # Coerce to str so a non-text description cannot break the regexes.
        desc = str(row['Description'])

        # Yue Liu Part
        df.at[i, 'first_name'] = extractFirstName(desc)
        df.at[i, 'last_name'] = extractLastName(desc)
        df.at[i, 'company_name'] = extractCompanyName(desc)

        # Yun Cao Part
        df.at[i, 'address'] = extractAddress(desc)
        df.at[i, 'city'] = extractCity(desc)
        df.at[i, 'county'] = extractCounty(desc)
        df.at[i, 'state'] = extractState(desc)

        # Nan Chen Part
        # Extract phone numbers. A description may hold one, two or more
        # numbers: the first goes to phone1, the second (if any) to phone2,
        # and extras are ignored. Unpacking the list into exactly two targets
        # would raise ValueError for any other count.
        phones = phone_pattern.findall(desc)
        if phones:
            df.at[i, 'phone1'] = phones[0]
        if len(phones) > 1:
            df.at[i, 'phone2'] = phones[1]

        # Extract email
        email_match = email_pattern.search(desc)
        if email_match:
            df.at[i, 'email'] = email_match.group(1)

# Yue Liu Part
def extractFirstName(desc):
    """Return the first standalone capitalized word in ``desc``.

    A word qualifies when it is one uppercase letter followed by zero or more
    lowercase letters. Returns "Unknown" when no such word exists.
    """
    match = re.search(r'\b[A-Z][a-z]*\b', desc)
    if match is None:
        return "Unknown"
    return match.group(0)
 
def extractLastName(desc):
    """Return the capitalized word that sits between a space and a colon.

    Returns "Unknown" when ``desc`` contains no ' Word:' sequence.
    """
    match = re.search(r' ([A-Z][a-z]+):', desc)
    if match is None:
        return "Unknown"
    return match.group(1)
 
def extractCompanyName(desc):
    """Return the first company-like phrase found in ``desc``.

    The phrase must start with a capitalized word and end in one of the
    suffixes Esq, Cpa, Service, Jr or Dimensions. Returns "Unknown" when
    nothing matches.
    """
    match = re.search(
        r'\b[A-Z]\w+\s?\w+\s(?:Esq|Cpa|Service|Jr|Dimensions)\b', desc
    )
    if match is None:
        return "Unknown"
    return match.group(0)

# Yun Cao Part
def extractAddress(desc):
    """Return the first street address found in ``desc``.

    An address is a run of digits, a whitespace character, then comma-free
    text ending in St, Blvd, Rd, Ave, Ln or Dr. Returns the string "None"
    when nothing matches.
    """
    match = re.search(r'\d+\s[^,]+(St|Blvd|Rd|Ave|Ln|Dr)', desc)
    if match is None:
        return "None"
    # The whole match is the address; the captured suffix alone is not used.
    return match.group(0)

def extractCity(desc):
    """Return the first capitalized word that follows 'of', 'from' or 'in'.

    Returns the string "None" when ``desc`` has no such word.
    """
    match = re.search(r'\b(?:of|from|in)\s+([A-Z][a-z]+)\b', desc)
    if match is None:
        return "None"
    return match.group(1)

def extractCounty(desc):
    """Return the capitalized word immediately preceding ' County'.

    Only the single word before 'County' is returned. Returns the string
    "None" when ``desc`` has no such word.
    """
    match = re.search(r'\b[A-Z][a-z]+(?=\sCounty)\b', desc)
    if match is None:
        return "None"
    return match.group(0)

def extractState(desc):
    """Return the first standalone two-uppercase-letter token in ``desc``.

    Returns the string "None" when ``desc`` contains no such token.
    """
    match = re.search(r'\b[A-Z]{2}\b', desc)
    if match is None:
        return "None"
    return match.group(0)

# Run the extraction on the module-level frame loaded earlier. extract_data
# writes its results into `df` in place, so the same frame is printed after.
extract_data(df)
print(df)