# Github: [6120NLP](https://github.com/awakn123/CS6120NLP/tree/main)
Group members: Yun Cao, Yue Liu, Nan Chen

# Part A

This function reads a CSV file and returns a pandas DataFrame.

Parameters:
file_path (str): The path to the CSV file

Returns:
pd.DataFrame: The data from the CSV file

In [1]:
import pandas as pd
import re

def read_csv_file(file_path):
    try:
        df = pd.read_csv(file_path)
        print("File read successfully!")
        return df
    except FileNotFoundError:
        print(f"No file found at {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
df = read_csv_file('./Assignment 0 Part A.csv')
print(df)

File read successfully!
   first_name  last_name  company_name  address  city  county  state  phone1  \
0         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
1         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
2         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
3         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   
4         NaN        NaN           NaN      NaN   NaN     NaN    NaN     NaN   

   phone2  email                                        Description  
0     NaN    NaN  James Butt: An avid historian, James Butt from...  
1     NaN    NaN  Josephine Darakjy: Amidst the jazz-filled stre...  
2     NaN    NaN  Art Venere: Art Venere, a nature enthusiast at...  
3     NaN    NaN  Lenna Paprocki: While renovating their office ...  
4     NaN    NaN  Donette Foller: In the tech hub of Hamilton, D...  



This function processes each row in the DataFrame. We don't find a way to process them separately, so all of extraction process are placed in this part.
    
Parameters:
df (pd.DataFrame): The DataFrame to process

In [2]:
def extract_data(df):
    
    # Fill NaN values with a default value
    df.fillna('Unknown', inplace=True)
    
    # Convert columns to string type
    df[['first_name', 'last_name', 'company_name', 'address', 'city', 'county', 'state', 'phone1', 'phone2', 'email']] = df[['first_name', 'last_name', 'company_name', 'address', 'city', 'county', 'state', 'phone1', 'phone2', 'email']].astype(str)

    for i, row in df.iterrows():
        desc = row['Description']
        # Yue Liu Part
        df.at[i, 'first_name'] = extractFirstName(desc)
        df.at[i, 'last_name'] = extractLastName(desc)
        df.at[i, 'company_name'] = extractCompanyName(desc)
        
				# Yun Cao Part
        df.at[i, 'address'] = extractAddress(desc)
        df.at[i, 'city'] = extractCity(desc)
        df.at[i, 'county'] = extractCounty(desc)
        df.at[i, 'state'] = extractState(desc)
        
				# Nan Chen Part
        # Extract phone numbers
        phones = re.findall(r'\d{3}-\d{3}-\d{4}', desc)
        if phones:
            df.at[i, 'phone1'], df.at[i, 'phone2'] = phones
        
        # Extract email
        email_match = re.search(r'(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b)', desc)
        if email_match:
            df.at[i, 'email'] = email_match.group(1)

# Yue Liu Part
def extractFirstName(desc):
    result = re.search(r'\b[A-Z][a-z]*\b', desc)
    return result.group(0) if result else "Unknown"
 
def extractLastName(desc):
    result = re.search(r' ([A-Z][a-z]+):', desc)
    return result.group(1) if result else "Unknown"
 
def extractCompanyName(desc):
    result = re.search(r'\b[A-Z]\w+\s?\w+\s(?:Esq|Cpa|Service|Jr|Dimensions)\b', desc)
    return result.group(0) if result else "Unknown"

# Yun Cao Part
def extractAddress(desc):
    result = re.search(r'\d+\s[^,]+(St|Blvd|Rd|Ave|Ln|Dr)',desc)
    return result.group() if result else "None"

def extractCity(desc):
    result = re.findall(r'\b(?:of|from|in)\s+([A-Z][a-z]+)\b', desc)
    return result[0] if result else "None"

def extractCounty(desc):
    result = re.findall(r'\b[A-Z][a-z]+(?=\sCounty)\b', desc)
    return result[0] if result else "None"

def extractState(desc):
    result = re.findall(r'\b[A-Z]{2}\b',desc)
    return result[0] if result else "None"

extract_data(df)
print(df)

  first_name last_name            company_name              address  \
0      James      Butt               John B Jr   6649 N Blue Gum St   
1  Josephine   Darakjy           Jeffrey A Esq  4 B Blue Ridge Blvd   
2        Art    Venere             James L Cpa                 None   
3      Lenna  Paprocki  Feltz Printing Service          639 Main St   
4    Donette    Foller     Printing Dimensions         34 Center St   

         city      county state        phone1        phone2  \
0      Benton     Orleans    LA  504-621-8927  504-845-1427   
1    Brighton  Livingston    MI  810-292-9388  810-374-9840   
2  Bridgeport  Gloucester  None  856-636-8749  856-264-4130   
3       Feltz        None    AK  907-385-4412  907-921-2010   
4    Hamilton        None  None       Unknown       Unknown   

                           email  \
0                jbutt@gmail.com   
1  josephine_darakjy@darakjy.org   
2                        Unknown   
3          lpaprocki@hotmail.com   
4             

  df.fillna('Unknown', inplace=True)


# Part B
List 5 book names in program.

In [3]:
book_filenames = ['./vol1.txt', './vol2.txt', './vol3.txt', './vol4.txt', './vol5.txt']

Count 1-8 by Yun Cao.

In [4]:
#1-8
counts_heart_disease = []
counts_cancer = []
counts_stroke = []
counts_respiratory = []
counts_alzheimer = []
counts_diabetes = []
counts_influenza = []
counts_kidney = []
for filename in book_filenames:
	with open(filename, 'r', encoding='utf-8') as file:
		book_text = file.read()
		# 1-8
		count_heart_disease = len(re.findall(r'(?i)heart\s*disease', book_text))
		count_cancer = len(re.findall(r'(?i)cancer', book_text))
		count_stroke = len(re.findall(r'(?i)stroke', book_text))
		count_respiratory = len(re.findall(r'(?i)respiratory', book_text))
		count_alzheimer = len(re.findall(r'(?i)(?:alzheimer|senile\s*dementia)', book_text))
		count_diabetes = len(re.findall(r'(?i)diabetes', book_text))
		count_influenza = len(re.findall(r'(?i)influenza|pneumonia', book_text))
		count_kidney = len(re.findall(r'(?i)kidney', book_text))
		counts_heart_disease.append(count_heart_disease)
		counts_cancer.append(count_cancer)
		counts_stroke.append(count_stroke)
		counts_respiratory.append(count_respiratory)
		counts_alzheimer.append(count_alzheimer)
		counts_diabetes.append(count_diabetes)
		counts_influenza.append(count_influenza)
		counts_kidney.append(count_kidney)

# Create a dataframe with the counts
df_1to8 = pd.DataFrame({
    'Book': book_filenames, 
    'Heart disease': counts_heart_disease, 
    'Cancer': counts_cancer,
    'Stroke': counts_stroke, 
    'Respiratory diseases': counts_respiratory,
    'Alzheimer\'s disease': counts_alzheimer, 
    'Diabetes': counts_diabetes,
    'Influenza and Pneumonia': counts_influenza, 
    'Kidney diseases': counts_kidney,
})


# Print the dataframe
print(df_1to8)

         Book  Heart disease  Cancer  Stroke  Respiratory diseases  \
0  ./vol1.txt              4      97       9                    79   
1  ./vol2.txt             29    1335      10                    41   
2  ./vol3.txt             70     224      12                   258   
3  ./vol4.txt             16     116       3                    11   
4  ./vol5.txt             10      41      97                    55   

   Alzheimer's disease  Diabetes  Influenza and Pneumonia  Kidney diseases  
0                    0         9                      374              182  
1                    1       227                      105              173  
2                    1         5                     1066               90  
3                    0        47                       25              680  
4                    9        33                       58               72  


Count 9-16 by Nan Chen.

In [5]:
#9-16
counts_septicemia = []
counts_liver_disease = []
counts_hypertension = []
counts_parkinsons_disease = []
counts_chronic_lower_respiratory_disease = []
counts_accidents_injuries = []
counts_osteoporosis = []
counts_asthma = []
for filename in book_filenames:
	with open(filename, 'r', encoding='utf-8') as file:
		book_text = file.read()

		#9-16
		count_septicemia = len(re.findall(r'(?i)septic', book_text))
		#count_liver_disease = len(re.findall(r'(?i)liver\s*disease', book_text))
		count_liver_disease = len(re.findall(r'(?i)liver', book_text))
		count_hypertension = len(re.findall(r'(?i)hypertension', book_text)) # Tried '(?i)high\s*blood\s*pressure', '(?i)high\s*blood', '(?i)hyper\s*tension'
		#count_parkinsons_disease = len(re.findall(r'(?i)parkinson\'s\s*disease', book_text))
		count_parkinsons_disease = len(re.findall(r'(?i)parkinson|shaking\s*palsy', book_text))
		count_chronic_lower_respiratory_disease = len(re.findall(r'(?i)chronic\s*(?:lower\s*respiratory|obstructive\s*pulmonary|bronchitis)|copd|emphysema|asthma', book_text))
		#count_accidents_injuries = len(re.findall(r'(?i)accidents\/injuries', book_text))
		count_accidents_injuries = len(re.findall(r'(?i)accidents', book_text))+len(re.findall(r'(?i)injuries', book_text))
		count_osteoporosis = len(re.findall(r'(?i)osteop|bone\s*loss|fragile\s*bones|bone\s*fragil', book_text))
		count_asthma = len(re.findall(r'(?i)asthma', book_text))
		counts_septicemia.append(count_septicemia)
		counts_liver_disease.append(count_liver_disease)
		counts_hypertension.append(count_hypertension)
		counts_parkinsons_disease.append(count_parkinsons_disease)
		counts_chronic_lower_respiratory_disease.append(count_chronic_lower_respiratory_disease)
		counts_accidents_injuries.append(count_accidents_injuries)
		counts_osteoporosis.append(count_osteoporosis)
		counts_asthma.append(count_asthma)
# Create a dataframe with the counts
df_9to16 = pd.DataFrame({
    'Septicemia': counts_septicemia,
    'Liver Disease': counts_liver_disease,
    'Hypertension': counts_hypertension,
    'Parkinson\'s Disease': counts_parkinsons_disease,
    'Chronic Lower Respiratory Disease': counts_chronic_lower_respiratory_disease,
    'Accidents/Injuries': counts_accidents_injuries,
    'Osteoporosis': counts_osteoporosis,
    'Asthma': counts_asthma,
})


# Print the dataframe
print(df_9to16)

   Septicemia  Liver Disease  Hypertension  Parkinson's Disease  \
0         510            231             0                    0   
1          81           1469             0                    0   
2         133            187             0                    0   
3         122            117             0                    0   
4          40            138             0                    5   

   Chronic Lower Respiratory Disease  Accidents/Injuries  Osteoporosis  Asthma  
0                                 32                  36             0       6  
1                                 38                  55            15      10  
2                                782                  79             0     383  
3                                 15                  56             0       1  
4                                 20                 172             4      15  


Count 17-25 by Yue Liu.

In [6]:
#17-25
counts_depression = []
counts_oral_health_issues = []
counts_hiv_aids = []
counts_tuberculosis = []
counts_malaria = []
counts_dengue_fever = []
counts_hepatitis = []
counts_epilepsy = []
counts_multiple_sclerosis = []

for filename in book_filenames:
	with open(filename, 'r', encoding='utf-8') as file:
		book_text = file.read()
		#17-25
		count_depression = len(re.findall(r'(?i)depression', book_text))
		count_oral_health_issues = len(re.findall(r'(?i)oral', book_text))
		#count_hiv_aids = len(re.findall(r'(?i)HIV/AIDS', book_text))
		count_hiv_aids = len(re.findall(r'(?i)\bHIV\b|\bAIDS\b', book_text))
		count_tuberculosis = len(re.findall(r'(?i)tuberculosis', book_text))
		count_malaria = len(re.findall(r'(?i)malaria', book_text))
		count_dengue_fever = len(re.findall(r'(?i)dengue', book_text))
		count_hepatitis = len(re.findall(r'(?i)hepatitis', book_text))
		count_epilepsy = len(re.findall(r'(?i)epilepsy', book_text))
		count_multiple_sclerosis = len(re.findall(r'(?i)disseminated\s*sclerosis', book_text))
		counts_depression.append(count_depression)
		counts_oral_health_issues.append(count_oral_health_issues)
		counts_hiv_aids.append(count_hiv_aids)
		counts_tuberculosis.append(count_tuberculosis)
		counts_malaria.append(count_malaria)
		counts_dengue_fever.append(count_dengue_fever)
		counts_hepatitis.append(count_hepatitis)
		counts_epilepsy.append(count_epilepsy)
		counts_multiple_sclerosis.append(count_multiple_sclerosis)
# Create a dataframe with the counts
df_17to25 = pd.DataFrame({
    'Book': book_filenames, 
	  'Depression': counts_depression,
    'Oral Health Issues': counts_oral_health_issues,
    'HIV/AIDS': counts_hiv_aids,
    'Tuberculosis': counts_tuberculosis,
    'Malaria': counts_malaria,
    'Dengue Fever': counts_dengue_fever,
    'Hepatitis': counts_hepatitis,
    'Epilepsy': counts_epilepsy,
    'Multiple Sclerosis': counts_multiple_sclerosis
})


# Print the dataframe
print(df_17to25)


         Book  Depression  Oral Health Issues  HIV/AIDS  Tuberculosis  \
0  ./vol1.txt          41                  79         7           125   
1  ./vol2.txt          93                  91        19           130   
2  ./vol3.txt          73                  56        12           192   
3  ./vol4.txt          27                  88         7            37   
4  ./vol5.txt         124                 460         6            36   

   Malaria  Dengue Fever  Hepatitis  Epilepsy  Multiple Sclerosis  
0      269            78          0         9                   1  
1      142             3         71        12                   2  
2       83             0          2         4                   0  
3       70             0          2        35                   0  
4       57             1         17       530                  85  


All count dataframe.

In [7]:
# Create a dataframe with the counts
df = pd.DataFrame({
    'Book': book_filenames, 
    'Heart disease': counts_heart_disease, 
    'Cancer': counts_cancer,
    'Stroke': counts_stroke, 
    'Respiratory diseases': counts_respiratory,
    'Alzheimer\'s disease': counts_alzheimer, 
    'Diabetes': counts_diabetes,
    'Influenza and Pneumonia': counts_influenza, 
    'Kidney diseases': counts_kidney,
    'Septicemia': counts_septicemia,
    'Liver Disease': counts_liver_disease,
    'Hypertension': counts_hypertension,
    'Parkinson\'s Disease': counts_parkinsons_disease,
    'Chronic Lower Respiratory Disease': counts_chronic_lower_respiratory_disease,
    'Accidents/Injuries': counts_accidents_injuries,
    'Osteoporosis': counts_osteoporosis,
    'Asthma': counts_asthma,
	  'Depression': counts_depression,
    'Oral Health Issues': counts_oral_health_issues,
    'HIV/AIDS': counts_hiv_aids,
    'Tuberculosis': counts_tuberculosis,
    'Malaria': counts_malaria,
    'Dengue Fever': counts_dengue_fever,
    'Hepatitis': counts_hepatitis,
    'Epilepsy': counts_epilepsy,
    'Multiple Sclerosis': counts_multiple_sclerosis
})


# Print the dataframe
print(df)

         Book  Heart disease  Cancer  Stroke  Respiratory diseases  \
0  ./vol1.txt              4      97       9                    79   
1  ./vol2.txt             29    1335      10                    41   
2  ./vol3.txt             70     224      12                   258   
3  ./vol4.txt             16     116       3                    11   
4  ./vol5.txt             10      41      97                    55   

   Alzheimer's disease  Diabetes  Influenza and Pneumonia  Kidney diseases  \
0                    0         9                      374              182   
1                    1       227                      105              173   
2                    1         5                     1066               90   
3                    0        47                       25              680   
4                    9        33                       58               72   

   Septicemia  ...  Asthma  Depression  Oral Health Issues  HIV/AIDS  \
0         510  ...       6          41