In [1]:
!pip install pdfplumber



In [2]:
import pandas as pd
import re
import pdfplumber

def extract_chromatogram_peaks(text):
    """
    Extract chromatogram peak data from PDF text.
    
    Parameters:
    -----------
    text : str
        Full text from PDF
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with Peak, Start, RT, End, Height, Area, Area%, SNR
    """
    peaks = []
    lines = text.split('\n')
    
    in_peak_section = False
    
    for line in lines:
        if "Chromatogram Peaks" in line:
            in_peak_section = True
            continue
        
        if in_peak_section:
            # Match peak data: Peak Start RT End Height Area Area% SNR
            match = re.match(
                r'^(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+'
                r'([\d.]+)\s+([\d.]+)\s+([\d.]+)',
                line
            )
            
            if match:
                peaks.append({
                    'Peak': int(match.group(1)),
                    'Peak_Start': float(match.group(2)),
                    'Peak_RT': float(match.group(3)),
                    'Peak_End': float(match.group(4)),
                    'Height': float(match.group(5)),
                    'Area': float(match.group(6)),
                    'Area_Pct': float(match.group(7))
                })
            
            # Stop when we hit another section
            elif line.strip() and peaks and not line.startswith('Peak'):
                if 'Sample Spectra' in line or 'Spectrum Peaks' in line:
                    break
    
    return pd.DataFrame(peaks)


def match_compounds_to_peaks(compounds_df, peaks_df, rt_tolerance=0.0001):
    """
    Match compounds to chromatogram peaks based on RT values.
    
    Parameters:
    -----------
    compounds_df : pd.DataFrame
        Compounds dataframe
    peaks_df : pd.DataFrame
        Chromatogram peaks dataframe
    rt_tolerance : float
        RT tolerance for matching in minutes (default 0.5 min)
        
    Returns:
    --------
    pd.DataFrame
        Compounds with matched area data
    """
    compounds_with_area = compounds_df.copy()
    compounds_with_area['Area'] = None
    compounds_with_area['Area_Pct'] = None
    compounds_with_area['Peak_Height'] = None
    
    for idx, compound in compounds_with_area.iterrows():
        compound_rt = compound['RT']
        
        # Find peaks within RT tolerance
        matching_peaks = peaks_df[
            (peaks_df['Peak_RT'] >= compound_rt - rt_tolerance) &
            (peaks_df['Peak_RT'] <= compound_rt + rt_tolerance)
        ]
        
        if len(matching_peaks) > 0:
            # If multiple matches, take the closest one
            matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
            best_match = matching_peaks.loc[matching_peaks['RT_Diff'].idxmin()]
            
            compounds_with_area.at[idx, 'Area'] = best_match['Area']
            compounds_with_area.at[idx, 'Area_Pct'] = best_match['Area_Pct']
            compounds_with_area.at[idx, 'Peak_Height'] = best_match['Height']
    
    return compounds_with_area


def extract_compound_summary(pdf_path, rt_tolerance=0.0001):
    """
    Extract Compound Summary table from MassHunter Analysis Report PDF
    and cross-reference with chromatogram peak areas.
    
    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    rt_tolerance : float
        RT tolerance for matching compounds to peaks (minutes)
        
    Returns:
    --------
    pd.DataFrame
        DataFrame containing compound data with matched peak areas
    """
    
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        compounds = []
        
        for page in pdf.pages:
            text = page.extract_text()
            full_text += text + "\n"
            
            # Find the Compound Summary section
            if "Compound Summary" in text:
                lines = text.split('\n')
                
                # Find where compound data starts
                start_idx = None
                for i, line in enumerate(lines):
                    if re.match(r'^\d+\s+C\d+\s+H\d+', line):
                        start_idx = i
                        break
                
                if start_idx:
                    # Parse each compound line
                    for line in lines[start_idx:]:
                        # Match pattern: number, formula, RT, Mass, optional CAS, Source, Scores
                        match = re.match(
                            r'^(\d+)\s+(C\d+\s+H\d+(?:\s+[A-Z][a-z]?\d*)*)\s+'
                            r'([\d.]+)\s+([\d.]+)\s+'
                            r'(?:([\d-]+)\s+)?'  # Optional CAS ID
                            r'(\w+)\s+([\d.]+)\s+(\w+)',
                            line
                        )
                        
                        if match:
                            cpd_num = int(match.group(1))
                            formula = match.group(2).replace(' ', '')
                            rt = float(match.group(3))
                            mass = float(match.group(4))
                            cas_id = match.group(5) if match.group(5) else None
                            source = match.group(6)
                            score = float(match.group(7))
                            algorithm = match.group(8)
                            
                            compounds.append({
                                'Cpd': cpd_num,
                                'Formula': formula,
                                'RT': rt,
                                'Mass': mass,
                                'CAS_ID': cas_id,
                                'Source': source,
                                'Score': score,
                                'Algorithm': algorithm
                            })
    
    compounds_df = pd.DataFrame(compounds)
    
    # Extract chromatogram peaks
    peaks_df = extract_chromatogram_peaks(full_text)
    
    # Match compounds to peaks
    if len(peaks_df) > 0:
        compounds_with_area = match_compounds_to_peaks(
            compounds_df, peaks_df, rt_tolerance
        )
        
        print(f"Extracted {len(compounds_df)} compounds")
        print(f"Found {len(peaks_df)} chromatogram peaks")
        print(f"Matched {compounds_with_area['Area'].notna().sum()} compounds to peaks")
        
        return compounds_with_area
    else:
        print(f"Warning: No chromatogram peaks found in {pdf_path}")
        compounds_df['Area'] = None
        compounds_df['Area_Pct'] = None
        compounds_df['Peak_Height'] = None
        return compounds_df


In [3]:
import pandas as pd
from pathlib import Path

# Assuming you have extract_compound_summary defined/imported
# e.g., from your_module import extract_compound_summary

# Path to the folder containing PDFs (including subdirectories)
pdf_folder = Path('/Users/yitong/Documents/GitHub/shell_diaries/TRYING_OUT_MS_RESULTS')  

# Find all PDF files recursively in the folder
pdf_files = list(pdf_folder.rglob('*.pdf'))

all_in_one_place = []

# Loop through each PDF file
for pdf_file in pdf_files:
    print(f"\nProcessing {pdf_file}...")
    
    # Extract compound summary
    df = extract_compound_summary(str(pdf_file))
    
    # Display some results
    print(df.head(10))
    print(f"Total compounds extracted: {len(df)}")
    
    print("\nFormula value counts:")
    print(df['Formula'].value_counts().head())
    
    print("\nRT statistics:")
    print(df['RT'].describe())
    
    # Filter for rows with peak area
    has_peak_df = df[~df['Area'].isna()]
    has_peak_df_sorted = has_peak_df.sort_values(by='Area_Pct', ascending=False)
    print(f"After filtering for areas : {len(has_peak_df)}")
    print(has_peak_df_sorted.head())

    all_in_one_place.append((pdf_file, df, has_peak_df_sorted))
    


Processing /Users/yitong/Documents/GitHub/shell_diaries/TRYING_OUT_MS_RESULTS/70425_shell2_Analysis.pdf...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] -

Extracted 78 compounds
Found 47 chromatogram peaks
Matched 14 compounds to peaks
   Cpd     Formula      RT      Mass CAS_ID Source  Score Algorithm  \
0    1  C22H20N2O7   7.270  424.1280   None    FBF  81.73       FBF   
1    2   C21H21NO6   9.921  383.1337   None    FBF  59.97       FBF   
2    3  C20H24N2O2   4.307  324.1822   None    FBF  57.62       FBF   
3    4   C21H23NO5   7.841  369.1596   None    FBF  87.44       FBF   
4    5    C10H15NO   0.384  165.1149   None    FBF  78.64       FBF   
5    6  C21H24N2O2   7.062  336.1817   None    FBF  55.11       FBF   
6    7    C18H20N2   0.436  264.1634   None    FBF  77.75       FBF   
7    8   C12H16N2O   7.945  204.1268   None    FBF  51.72       FBF   
8    9    C26H44N2  13.456  384.3498   None    FBF  61.71       FBF   
9   10   C18H35NO2   6.516  297.2664   None    FBF  98.96       FBF   

         Area Area_Pct Peak_Height  
0   4167504.0     1.52    891900.0  
1        None     None        None  
2        None     None    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] -

Extracted 5266 compounds
Found 7 chromatogram peaks
Matched 105 compounds to peaks
   Cpd       Formula      RT      Mass CAS_ID Source  Score Algorithm  Area  \
0    1    C22H20N2O7   7.218  424.1302   None    FBF  58.53       FBF  None   
1    2     C21H23NO5   7.764  369.1602   None    FBF  66.88       FBF  None   
2    3   C46H54N4O10  12.833  822.3791   None    FBF  53.21       FBF  None   
3    4    C20H26N2O4  17.766  358.1913   None    FBF  69.19       FBF  None   
4    5      C20H33N3  11.481  315.2659   None    FBF  89.17       FBF  None   
5    6    C27H48N2O2  19.792  432.3754   None    FBF  53.46       FBF  None   
6    7   C18H20FN3O4  12.625  361.1438   None    FBF  58.66       FBF  None   
7    8    C25H20N4O2   7.218  408.1557   None    FBF  74.00       FBF  None   
8    9  C16H13ClN2O2   5.712  300.0677   None    FBF  65.28       FBF  None   
9   10     C37H50N2O  19.403  538.3917   None    FBF  51.28       FBF  None   

  Area_Pct Peak_Height  
0     None        None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] -

Extracted 8346 compounds
Found 47 chromatogram peaks
Matched 597 compounds to peaks
   Cpd     Formula      RT      Mass CAS_ID Source  Score Algorithm  \
0    1  C22H20N2O7   7.270  424.1280   None    FBF  81.73       FBF   
1    2   C21H21NO6   9.921  383.1337   None    FBF  59.97       FBF   
2    3  C20H24N2O2   4.307  324.1822   None    FBF  57.62       FBF   
3    4   C21H23NO5   7.841  369.1596   None    FBF  87.44       FBF   
4    5    C10H15NO   0.384  165.1149   None    FBF  78.64       FBF   
5    6  C21H24N2O2   7.062  336.1817   None    FBF  55.11       FBF   
6    7    C18H20N2   0.436  264.1634   None    FBF  77.75       FBF   
7    8   C12H16N2O   7.945  204.1268   None    FBF  51.72       FBF   
8    9    C26H44N2  13.456  384.3498   None    FBF  61.71       FBF   
9   10   C18H35NO2   6.516  297.2664   None    FBF  98.96       FBF   

         Area Area_Pct Peak_Height  
0   4167504.0     1.52    891900.0  
1        None     None        None  
2        None     None 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] -

Extracted 5266 compounds
Found 7 chromatogram peaks
Matched 105 compounds to peaks
   Cpd       Formula      RT      Mass CAS_ID Source  Score Algorithm  Area  \
0    1    C22H20N2O7   7.218  424.1302   None    FBF  58.53       FBF  None   
1    2     C21H23NO5   7.764  369.1602   None    FBF  66.88       FBF  None   
2    3   C46H54N4O10  12.833  822.3791   None    FBF  53.21       FBF  None   
3    4    C20H26N2O4  17.766  358.1913   None    FBF  69.19       FBF  None   
4    5      C20H33N3  11.481  315.2659   None    FBF  89.17       FBF  None   
5    6    C27H48N2O2  19.792  432.3754   None    FBF  53.46       FBF  None   
6    7   C18H20FN3O4  12.625  361.1438   None    FBF  58.66       FBF  None   
7    8    C25H20N4O2   7.218  408.1557   None    FBF  74.00       FBF  None   
8    9  C16H13ClN2O2   5.712  300.0677   None    FBF  65.28       FBF  None   
9   10     C37H50N2O  19.403  538.3917   None    FBF  51.28       FBF  None   

  Area_Pct Peak_Height  
0     None        None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] -

Extracted 7752 compounds
Found 41 chromatogram peaks
Matched 562 compounds to peaks
   Cpd     Formula     RT      Mass CAS_ID Source  Score Algorithm  \
0    1  C22H20N2O7  7.244  424.1283   None    FBF  80.84       FBF   
1    2    C16H18N2  6.621  238.1489   None    FBF  59.55       FBF   
2    3   C21H23NO5  7.790  369.1593   None    FBF  85.46       FBF   
3    4    C10H15NO  4.411  165.1141   None    FBF  62.69       FBF   
4    5  C21H24N2O2  4.437  336.1859   None    FBF  63.83       FBF   
5    6    C18H20N2  4.281  264.1609   None    FBF  61.60       FBF   
6    7  C15H22N2O2  3.943  262.1680   None    FBF  59.15       FBF   
7    8    C14H8N2O  2.305  220.0624   None    FBF  60.43       FBF   
8    9   C18H35NO2  6.491  297.2665   None    FBF  96.14       FBF   
9   10      C8H17N  0.409  127.1362   None    FBF  86.17       FBF   

         Area Area_Pct Peak_Height  
0   1655591.0      1.2    404359.0  
1        None     None        None  
2        None     None        None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] - compound_rt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_peaks['RT_Diff'] = abs(matching_peaks['Peak_RT'] -

Extracted 7752 compounds
Found 41 chromatogram peaks
Matched 562 compounds to peaks
   Cpd     Formula     RT      Mass CAS_ID Source  Score Algorithm  \
0    1  C22H20N2O7  7.244  424.1283   None    FBF  80.84       FBF   
1    2    C16H18N2  6.621  238.1489   None    FBF  59.55       FBF   
2    3   C21H23NO5  7.790  369.1593   None    FBF  85.46       FBF   
3    4    C10H15NO  4.411  165.1141   None    FBF  62.69       FBF   
4    5  C21H24N2O2  4.437  336.1859   None    FBF  63.83       FBF   
5    6    C18H20N2  4.281  264.1609   None    FBF  61.60       FBF   
6    7  C15H22N2O2  3.943  262.1680   None    FBF  59.15       FBF   
7    8    C14H8N2O  2.305  220.0624   None    FBF  60.43       FBF   
8    9   C18H35NO2  6.491  297.2665   None    FBF  96.14       FBF   
9   10      C8H17N  0.409  127.1362   None    FBF  86.17       FBF   

         Area Area_Pct Peak_Height  
0   1655591.0      1.2    404359.0  
1        None     None        None  
2        None     None        None

In [4]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:

# Method 1: Direct from PDF
df = extract_compound_summary('MS_RESULTS/TINY_shell_Analysis.pdf')

# Display results
print(df.head(10))
print(f"\nTotal compounds extracted: {len(df)}")

# Save to CSV
df.to_csv('compound_summary.csv', index=False)

# Optional: Additional analysis
print("\nFormula value counts:")
print(df['Formula'].value_counts().head())

print("\nRT statistics:")
print(df['RT'].describe())

has_peak_df = df[~df['Area'].isna()]
has_peak_df
has_peak_df.sort_values(by='Area_Pct',ascending=False)


In [None]:
# Method 1: Direct from PDF
df = extract_compound_summary('MS_RESULTS/TINY_shell_Analysis.pdf')

# Display results
print(df.head(10))
print(f"\nTotal compounds extracted: {len(df)}")

# Save to CSV
df.to_csv('compound_summary.csv', index=False)

# Optional: Additional analysis
print("\nFormula value counts:")
print(df['Formula'].value_counts().head())

print("\nRT statistics:")
print(df['RT'].describe())

has_peak_df = df[~df['Area'].isna()]
has_peak_df
has_peak_df.sort_values(by='Area_Pct',ascending=False)


KeyboardInterrupt: 

In [None]:
has_peak_df = df[~df['Area'].isna()]
has_peak_df
has_peak_df.sort_values(by='Area_Pct',ascending=False)

Unnamed: 0,Cpd,Formula,RT,Mass,CAS_ID,Source,Score,Algorithm,Area,Area_Pct,Peak_Height
824,827,C24H43N3O4,3.267,437.3214,,FBF,55.54,FBF,137818488.0,100.0,8741437.0
5689,5692,C12H10O4,3.267,218.0567,,FBF,71.58,FBF,137818488.0,100.0,8741437.0
5733,5736,C9H16ClN3O2,3.267,233.0922,,FBF,50.22,FBF,137818488.0,100.0,8741437.0
5541,5544,C16H28N4O7,3.007,388.1948,,FBF,93.81,FBF,135352463.0,98.21,8155244.0
5476,5479,C19H28N4O5,3.007,392.2022,,FBF,65.66,FBF,135352463.0,98.21,8155244.0
...,...,...,...,...,...,...,...,...,...,...,...
225,226,C21H24O8,7.244,404.1475,,FBF,94.95,FBF,1655591.0,1.2,404359.0
5320,5323,C40H53N7O5S2,7.244,775.3564,,FBF,70.65,FBF,1655591.0,1.2,404359.0
5339,5342,C20H29N7O4,7.244,431.2305,,FBF,86.31,FBF,1655591.0,1.2,404359.0
5353,5356,C14H26N4O6,7.244,346.1823,,FBF,69.61,FBF,1655591.0,1.2,404359.0


In [None]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# import pandas as pd
# import re
# import pdfplumber

# def extract_compound_summary(pdf_path):
#     """
#     Extract Compound Summary table from MassHunter Analysis Report PDF.
    
#     Parameters:
#     -----------
#     pdf_path : str
#         Path to the PDF file
        
#     Returns:
#     --------
#     pd.DataFrame
#         DataFrame containing the compound summary data
#     """
    
#     # Method 1: Using pdfplumber (recommended for tables)
#     with pdfplumber.open(pdf_path) as pdf:
#         compounds = []
        
#         for page in pdf.pages:
#             text = page.extract_text()
            
#             # Find the Compound Summary section
#             if "Compound Summary" in text:
#                 lines = text.split('\n')
                
#                 # Find where compound data starts
#                 start_idx = None
#                 for i, line in enumerate(lines):
#                     if re.match(r'^\d+\s+C\d+\s+H\d+', line):
#                         start_idx = i
#                         break
                
#                 if start_idx:
#                     # Parse each compound line
#                     for line in lines[start_idx:]:
#                         # Match pattern: number, formula, RT, Mass, optional CAS, Source, Scores
#                         match = re.match(
#                             r'^(\d+)\s+(C\d+\s+H\d+(?:\s+[A-Z][a-z]?\d*)*)\s+'
#                             r'([\d.]+)\s+([\d.]+)\s+'
#                             r'(?:([\d-]+)\s+)?'  # Optional CAS ID
#                             r'(\w+)\s+([\d.]+)\s+(\w+)',
#                             line
#                         )
                        
#                         if match:
#                             cpd_num = int(match.group(1))
#                             formula = match.group(2).replace(' ', '')
#                             rt = float(match.group(3))
#                             mass = float(match.group(4))
#                             cas_id = match.group(5) if match.group(5) else None
#                             source = match.group(6)
#                             score = float(match.group(7))
#                             algorithm = match.group(8)
                            
#                             compounds.append({
#                                 'Cpd': cpd_num,
#                                 'Formula': formula,
#                                 'RT': rt,
#                                 'Mass': mass,
#                                 'CAS_ID': cas_id,
#                                 'Source': source,
#                                 'Score': score,
#                                 'Algorithm': algorithm
#                             })
    
#     return pd.DataFrame(compounds)


# def extract_from_text(text_content):
#     """
#     Alternative method: Extract from already extracted text content.
#     Use this if you already have the PDF text extracted.
    
#     Parameters:
#     -----------
#     text_content : str
#         Text content from the PDF
        
#     Returns:
#     --------
#     pd.DataFrame
#         DataFrame containing the compound summary data
#     """
    
#     compounds = []
#     lines = text_content.split('\n')
    
#     # Find the Compound Summary section
#     in_compound_section = False
    
#     for line in lines:
#         if "Compound Summary" in line:
#             in_compound_section = True
#             continue
        
#         if in_compound_section:
#             # Match compound data lines
#             match = re.match(
#                 r'^(\d+)\s+(C\d+\s+H\d+(?:\s+[A-Z][a-z]?\d*)*)\s+'
#                 r'([\d.]+)\s+([\d.]+)\s+'
#                 r'(?:([\d-]+)\s+)?'  # Optional CAS ID
#                 r'(\w+)\s+([\d.]+)\s+(\w+)',
#                 line
#             )
            
#             if match:
#                 compounds.append({
#                     'Cpd': int(match.group(1)),
#                     'Formula': match.group(2).replace(' ', ''),
#                     'RT': float(match.group(3)),
#                     'Mass': float(match.group(4)),
#                     'CAS_ID': match.group(5),
#                     'Source': match.group(6),
#                     'Score': float(match.group(7)),
#                     'Algorithm': match.group(8)
#                 })
            
#             # Stop if we hit another section
#             elif line.strip() and not line.startswith('Cpd'):
#                 if compounds:  # Only stop if we've found some compounds
#                     break
    
#     return pd.DataFrame(compounds)



In [None]:
df = extract_compound_summary('/Users/yitong/Documents/GitHub/shell_diaries/MS_RESULTS/BIGMED_shell_Analysis.pdf')
df

Unnamed: 0,Cpd,Formula,RT,Mass,CAS_ID,Source,Score,Algorithm
0,1,C20H26N2O2,21.951,326.1965,,FBF,55.97,FBF
1,2,C21H23NO5,7.814,369.1589,,FBF,76.93,FBF
2,3,C15H25NO4,10.803,283.1766,,FBF,80.38,FBF
3,4,C14H8N2O2,0.933,236.0609,,FBF,59.77,FBF
4,5,C35H40N4O3,18.131,564.3063,,FBF,63.21,FBF
...,...,...,...,...,...,...,...,...
4766,4769,C29H52O8,10.258,528.3640,,FBF,63.56,FBF
4767,4770,C29H44O,14.313,408.3407,,FBF,56.25,FBF
4768,4771,C29H42O,17.093,406.3275,,FBF,51.19,FBF
4769,4772,C29H42O2,9.140,422.3217,,FBF,59.13,FBF


In [None]:

# Example usage:
if __name__ == "__main__":
    # Method 1: Direct from PDF
    df = extract_compound_summary('/Users/yitong/Documents/GitHub/shell_diaries/MS_RESULTS/BIGMED_shell_Analysis.pdf')
    
    # Method 2: From your provided text
    text = """
    [Your PDF text content here]
    """
    
    df = extract_from_text(text)
    
    # Display results
    print(df.head(10))
    print(f"\nTotal compounds extracted: {len(df)}")
    
    # Save to CSV
    df.to_csv('compound_summary.csv', index=False)
    
    # Optional: Additional analysis
    print("\nFormula value counts:")
    print(df['Formula'].value_counts().head())
    
    print("\nRT statistics:")
    print(df['RT'].describe())