In [1]:
# %%
import csv

def parse_pis(file_path):
    """Parse a plain-text listing of principal investigators into dicts.

    The file is read as UTF-8; every line is stripped and blank lines are
    dropped.  A line consisting solely of ``PI`` separates entries.  Within
    one entry the lines are interpreted in order:

    1. an optional tag (``Core``, ``Dual``, ``Core/Dual`` or ``Dual/Core``),
       which is discarded;
    2. the investigator's name;
    3. up to two further header lines (position and repeated name in the
       expected layout), which are discarded;
    4. any number of contact pairs: a marker line ``e``, ``p`` or ``r``
       followed by a line holding the e-mail, phone or room value;
    5. the first remaining line, stored as the research interests.  Lines
       after that one are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one dict per entry, each having the keys
        'Principal Investigator', 'Email', 'Phone', 'Room No' and
        'Research Interests'.  Fields not found in an entry are ''.
    """
    # Tags that may precede the name and carry no information we keep.
    identifiers = {'Core/Dual', 'Dual/Core', 'Core', 'Dual'}
    # Contact marker line -> output field filled by the line that follows it.
    contact_fields = {'e': 'Email', 'p': 'Phone', 'r': 'Room No'}

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file if line.strip()]

    # Split lines into blocks at 'PI'; the separator itself is never stored,
    # and empty blocks (e.g. consecutive separators) are not emitted.
    blocks = []
    current_block = []
    for line in lines:
        if line == 'PI':
            if current_block:
                blocks.append(current_block)
                current_block = []
        else:
            current_block.append(line)
    if current_block:
        blocks.append(current_block)

    pis = []
    for block in blocks:
        pi = {
            'Principal Investigator': '',
            'Email': '',
            'Phone': '',
            'Room No': '',
            'Research Interests': ''
        }
        i = 0
        # Optional tag before the name.
        if i < len(block) and block[i] in identifiers:
            i += 1
        # Name
        if i < len(block):
            pi['Principal Investigator'] = block[i]
            i += 1
        # Skip at most two header lines (position, repeated name).  Stop early
        # at a contact marker so an entry missing one of these lines does not
        # have its contact details swallowed as header text.
        for _ in range(2):
            if i >= len(block) or block[i] in contact_fields:
                break
            i += 1
        # Contact info: each marker line is followed by its value line.
        while i < len(block) and block[i] in contact_fields:
            field = contact_fields[block[i]]
            i += 1
            if i < len(block):
                pi[field] = block[i]
                i += 1
        # Research interests: first line after the contact section, if any.
        if i < len(block):
            pi['Research Interests'] = block[i]
        pis.append(pi)

    return pis

def write_csv(pis, output_file):
    """Write investigator records to a UTF-8 CSV file with a header row.

    Args:
        pis: Iterable of dicts keyed by the column names listed below.
        output_file: Path of the CSV file to create or overwrite.
    """
    columns = [
        'Principal Investigator',
        'Email',
        'Phone',
        'Room No',
        'Research Interests',
    ]
    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        table.writerows(pis)

if __name__ == "__main__":
    # Script entry point: convert the text listing into a CSV file.
    # Both paths are relative to the working directory; point input_file at
    # your actual listing before running.
    input_file, output_csv = 'mit_csail_pis.txt', 'mit_csail_pis.csv'

    pis = parse_pis(input_file)
    write_csv(pis, output_csv)

    # Report how many entries were written.
    print(f"CSV file '{output_csv}' has been created successfully with {len(pis)} entries.")


CSV file 'mit_csail_pis.csv' has been created successfully with 131 entries.
