
This function reads a CSV file and returns a pandas DataFrame.

Parameters:
file_path (str): The path to the CSV file

Returns:
pd.DataFrame: The data from the CSV file, or None if the file is missing or cannot be read

In [2]:
import pandas as pd
import re

def read_csv_file(file_path):
    """Load a CSV file into a pandas DataFrame and report the outcome.

    A status message is printed for every outcome: success, missing file,
    or any other error raised while reading.

    Parameters:
    file_path (str): The path to the CSV file

    Returns:
    pd.DataFrame: The parsed data, or None when the file is missing or
    could not be read.
    """
    try:
        frame = pd.read_csv(file_path)
        print("File read successfully!")
        return frame
    except FileNotFoundError:
        print(f"No file found at {file_path}")
    except Exception as err:
        print(f"An error occurred: {err}")
    # Both failure paths fall through to here.
    return None
# Load the assignment CSV from the working directory and display it
# (prints None if read_csv_file could not read the file).
df = read_csv_file('./Assignment 0 Part A.csv')
print(df)

File read successfully!
   first_name  last_name  company_name  address  city  county  state  phone1  \
0         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
1         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
2         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
3         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
4         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   

   phone2  email                                        Description  
0     NaN    NaN  James Butt: An avid historian, James Butt from...  
1     NaN    NaN  Josephine Darakjy: Amidst the jazz-filled stre...  
2     NaN    NaN  Art Venere: Art Venere, a nature enthusiast at...  
3     NaN    NaN  Lenna Paprocki: While renovating their office ...  
4     NaN    NaN  Donette Foller: In the tech hub of Hamilton, D...  



This function processes each row in the DataFrame. We did not find a way to process the fields separately, so the entire extraction process is placed in this part.
    
Parameters:
df (pd.DataFrame): The DataFrame to process

In [3]:
def extract_data(df):
    """Fill the contact columns of ``df`` from each row's ``Description`` text.

    The DataFrame is modified in place. Missing values are first replaced
    with 'Unknown' and the contact columns are converted to strings. Then,
    for every row, the name, company, address, city, county and state
    columns are overwritten with the results of the extract* helpers, and
    the phone and email columns are overwritten only when a match is found
    in the description (otherwise they keep the 'Unknown' placeholder).

    Parameters:
    df (pd.DataFrame): The DataFrame to process. It must contain a
        'Description' column and the ten contact columns listed below.

    Returns:
    None: ``df`` is updated in place.
    """
    contact_columns = [
        'first_name', 'last_name', 'company_name', 'address', 'city',
        'county', 'state', 'phone1', 'phone2', 'email',
    ]

    # Fill NaN values with a default value
    df.fillna('Unknown', inplace=True)

    # Convert columns to string type
    df[contact_columns] = df[contact_columns].astype(str)

    # Compile the patterns once rather than on every row.
    phone_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
    # Inside a character class '|' is a literal character, so the
    # top-level-domain class is spelled [A-Za-z] to accept letters only.
    email_pattern = re.compile(
        r'(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)'
    )

    for i, row in df.iterrows():
        desc = row['Description']

        # Yue Liu Part
        df.at[i, 'first_name'] = extractFirstName(desc)
        df.at[i, 'last_name'] = extractLastName(desc)
        df.at[i, 'company_name'] = extractCompanyName(desc)

        # Yun Cao Part
        df.at[i, 'address'] = extractAddress(desc)
        df.at[i, 'city'] = extractCity(desc)
        df.at[i, 'county'] = extractCounty(desc)
        df.at[i, 'state'] = extractState(desc)

        # Nan Chen Part
        # Extract phone numbers. zip() stops at the shorter sequence, so a
        # single number fills only phone1 and any numbers beyond the second
        # are ignored, instead of failing on a fixed two-way unpack.
        phones = phone_pattern.findall(desc)
        for column, number in zip(('phone1', 'phone2'), phones):
            df.at[i, column] = number

        # Extract email
        email_match = email_pattern.search(desc)
        if email_match:
            df.at[i, 'email'] = email_match.group(1)

# Yue Liu Part
def extractFirstName(desc):
    """Return the first word in ``desc`` that is one uppercase letter
    followed only by lowercase letters, or "Unknown" if there is none."""
    match = re.search(r'\b[A-Z][a-z]*\b', desc)
    if match is None:
        return "Unknown"
    return match.group(0)
 
def extractLastName(desc):
    """Return the capitalised word that sits between a space and the first
    matching colon in ``desc``, or "Unknown" if no such word exists."""
    match = re.search(r' ([A-Z][a-z]+):', desc)
    if match is None:
        return "Unknown"
    return match.group(1)
 
def extractCompanyName(desc):
    """Return the first company-like phrase in ``desc``, or "Unknown".

    A phrase qualifies when it starts with a capitalised word and ends with
    one of the recognised suffixes: Esq, Cpa, Service, Jr or Dimensions.
    """
    company_pattern = r'\b[A-Z]\w+\s?\w+\s(?:Esq|Cpa|Service|Jr|Dimensions)\b'
    match = re.search(company_pattern, desc)
    if match is None:
        return "Unknown"
    return match.group(0)

# Yun Cao Part
def extractAddress(desc):
    """Return the first street address found in ``desc``, or "None".

    An address is a run of digits, a whitespace character, then comma-free
    text ending in one of St, Blvd, Rd, Ave, Ln or Dr.
    """
    match = re.search(r'\d+\s[^,]+(St|Blvd|Rd|Ave|Ln|Dr)', desc)
    if match is None:
        return "None"
    # group() with no argument is the whole match, not just the suffix group.
    return match.group()

def extractCity(desc):
    """Return the first capitalised word that follows "of", "from" or "in"
    in ``desc``, or "None" when there is no such word."""
    match = re.search(r'\b(?:of|from|in)\s+([A-Z][a-z]+)\b', desc)
    if match is None:
        return "None"
    return match.group(1)

def extractCounty(desc):
    """Return the capitalised word immediately before the first " County"
    in ``desc``, or "None" when no county is mentioned."""
    match = re.search(r'\b[A-Z][a-z]+(?=\sCounty)\b', desc)
    if match is None:
        return "None"
    return match.group(0)

def extractState(desc):
    """Return the first standalone two-letter uppercase word in ``desc``
    (e.g. a state abbreviation), or "None" when there is none."""
    codes = re.findall(r'\b[A-Z]{2}\b', desc)
    if not codes:
        return "None"
    return codes[0]

# Populate the contact columns of df in place, then display the result.
extract_data(df)
print(df)

  first_name last_name            company_name              address  \
0      James      Butt               John B Jr   6649 N Blue Gum St   
1  Josephine   Darakjy           Jeffrey A Esq  4 B Blue Ridge Blvd   
2        Art    Venere             James L Cpa                 None   
3      Lenna  Paprocki  Feltz Printing Service          639 Main St   
4    Donette    Foller     Printing Dimensions         34 Center St   

         city      county state        phone1        phone2  \
0      Benton     Orleans    LA  504-621-8927  504-845-1427   
1    Brighton  Livingston    MI  810-292-9388  810-374-9840   
2  Bridgeport  Gloucester  None  856-636-8749  856-264-4130   
3       Feltz        None    AK  907-385-4412  907-921-2010   
4    Hamilton        None  None       Unknown       Unknown   

                           email  \
0                jbutt@gmail.com   
1  josephine_darakjy@darakjy.org   
2                        Unknown   
3          lpaprocki@hotmail.com   
4             

  df.fillna('Unknown', inplace=True)
