# To create calibration curves for each standard (HM, Mel, C, CCD8)
Running this series of commands lets you filter the PDFs by file name.

In [284]:
# Extract the tables from the PDF text: pass START_MARKERS and END_MARKERS as arguments
import pandas as pd

def ExtractTable(data_array, START_MARKERS, END_MARKERS):
    """Collect the text spans lying between a start marker and an end marker.

    For every string in ``data_array`` and every marker in ``START_MARKERS``,
    the first occurrence of the start marker is located with ``str.find``.
    For each marker in ``END_MARKERS`` that occurs at or after that position,
    the slice running from the start marker (inclusive) up to the end marker
    (exclusive) is appended to the result.

    Args:
        data_array: Iterable of strings to search.
        START_MARKERS: Iterable of strings marking where a span begins.
        END_MARKERS: Iterable of strings marking where a span ends.

    Returns:
        list[str]: Every extracted span, in the order the input strings were
        visited. Empty when ``data_array`` is empty or nothing matches.
    """
    # Initialised once, before the loop: resetting it per input string would
    # throw away the matches from every string except the last one, and
    # would leave the name unbound when ``data_array`` is empty.
    extracted_text = []

    for text in data_array:
        for start_marker in START_MARKERS:
            start_index = text.find(start_marker)
            if start_index == -1:
                continue

            for end_marker in END_MARKERS:
                # Only look for the end marker after the start marker.
                end_index = text.find(end_marker, start_index)
                if end_index != -1:
                    extracted_text.append(text[start_index:end_index])

    return extracted_text

def CreateDataFrame(str_table):
    """Build a DataFrame from one or more extracted table text blocks.

    Each block is stripped and split into lines. The third line (index 2)
    is split on whitespace to form the column names, and its seventh entry
    is renamed to ``'Area[%]'``. Every line from index 5 onward is split on
    whitespace into one data row.

    Rows from all blocks are accumulated into a single frame. The column
    names come from the last block processed, so all blocks are assumed to
    share one layout.

    Args:
        str_table: Iterable of strings, each holding one table as text.

    Returns:
        pandas.DataFrame: One row per data line; all values are strings.
        An empty frame when ``str_table`` is empty.

    Raises:
        IndexError: If a block has fewer than three lines or its header
            line has fewer than seven whitespace-separated fields.
    """
    header = []
    rows = []

    for table_text in str_table:
        table_lines = table_text.strip().split('\n')
        header = table_lines[2].split()
        # The header has two fields that both split to "Area" (see the
        # 'Area' / 'Area[%]' columns used by the caller); rename the second.
        header[6] = 'Area[%]'
        # Extend instead of reassigning so earlier tables are not discarded.
        rows.extend(row.split() for row in table_lines[5:])

    return pd.DataFrame(rows, columns=header)


In [285]:
import os
import re
import PyPDF2

# Folder that holds the PDF reports to process.
DIRECTORY_PATH = '../pdfs/'

pdf_files=[file for file in os.listdir(DIRECTORY_PATH) if file.endswith('.pdf')]

# Empty template that fixes the column order of the combined result.
dataframe = pd.DataFrame(columns=['Peak', 'RetTime', 'Type', 'Width', 'Area', 'Height', 'Area[%]', 'Sample'])

# One frame of selected rows per processed PDF. They are concatenated once
# after the loop instead of growing `dataframe` on every iteration.
selected_rows = []

for file_name in pdf_files:
    # group(2) is the part of the file name between the date and ".pdf".
    match = re.search(r'(.+ \d{4}-\d{2}-\d{2}|\d{2}-\d{2}-\d{2}) (.+?)\.pdf', file_name)
    if not match:
        continue

    common_part = match.group(2)
    # NOTE(review): 'C' is a substring test, so any name containing an
    # uppercase "C" passes this filter (which also makes 'CCD8' redundant).
    if not any(value in common_part for value in ['HM', 'Mel', 'C', 'CCD8']):
        continue

    with open(os.path.join(DIRECTORY_PATH, file_name), 'rb') as pdf_file:
        # TEXT EXTRACTION FROM PDF
        # Must happen while the file is still open.
        # NOTE(review): range starts at 1, so the first page (index 0) is
        # never read - presumably a cover page; confirm.
        pdf = PyPDF2.PdfReader(pdf_file)
        pdf_pages = [pdf.pages[i].extract_text() for i in range(1, len(pdf.pages))]

    # DATAFRAME CREATION
    # Mel and HM samples are read from the "Signal 1:" table, all others
    # from "Signal 2:".
    start_marker = ['Signal 1:'] if 'Mel' in common_part or 'HM' in common_part else ['Signal 2:']
    str_table = ExtractTable(pdf_pages, start_marker, ['Totals :'])
    df = CreateDataFrame(str_table)

    # SELECTING ONLY THE DESIRED ROW BY RETENTION TIME
    df['RetTime'] = pd.to_numeric(df['RetTime'], errors='coerce')
    if 'Mel' in common_part:
        ret_time = 1.7
    elif 'HM' in common_part:
        ret_time = 1.5
    elif 'CCD8' in common_part:
        ret_time = 2.4
    else:
        ret_time = 1.9
    row_to_add = df[round(df['RetTime'], 1) == ret_time].copy()

    # CREATING NEW COLUMN WITH SAMPLE NAME
    common_part = common_part.replace(' X2', '').replace(' ', '').upper()
    row_to_add['Sample'] = common_part

    selected_rows.append(row_to_add)

# CREATING MAIN DATAFRAME
dataframe = pd.concat([dataframe, *selected_rows], ignore_index=True)

# CALCULATE THE AREA MEAN AND RETENTION TIME
# Done once, after all files are processed, so `avg_df` no longer depends on
# the loop body having executed at least once.
dataframe['Area'] = pd.to_numeric(dataframe['Area'], errors='coerce')
dataframe['RetTime'] = pd.to_numeric(dataframe['RetTime'], errors='coerce')
avg_df = dataframe.groupby('Sample').agg({'Area': 'mean', 'RetTime': 'mean'}).reset_index()

print(avg_df)


        Sample         Area  RetTime
0      C0.1PPM     7.512800   1.8905
1      C0.5PPM    33.417930   1.8905
2       C100PM  1706.163820   1.9040
3      C100PPM  1561.726440   1.9040
4       C10PPM   644.952270   1.8915
5        C1PPM    71.967610   1.8920
6       C25PPM   409.115050   1.9030
7       C50PPM   852.483340   1.9035
8   CCD80.1PPM     5.507960   2.4090
9   CCD80.5PPM    11.354160   2.4090
10   CCD810PPM   127.533410   2.4230
11    CCD81PPM    71.066930   2.4100
12   CCD825PPM   445.546140   2.4240
13   CCD850PPM   839.250430   2.4220
14    CCD85PPM   127.842270   2.4110
15    HM0.1PPM    10.434045   1.5340
16    HM0.5PPM    52.797825   1.5345
17     HM10PPM   842.855500   1.4990
18      HM1PPM   103.885330   1.5355
19     HM20PPM  1374.575625   1.4980
20     HM50PPM  4187.855715   1.5000
21      HM5PPM   608.764830   1.5360
22   MEL0.1PPM    11.623940   1.7125
23   MEL0.5PPM    57.499700   1.7135
24    MEL10PPM   414.175875   1.7245
25     MEL1PPM   124.793235   1.7140
2

In [289]:

# Group 1 captures an optional standard prefix (MEL, CCD8, C or HM);
# group 2 captures the number that follows it (decimal or integer).
pattern = r'((?:MEL|CCD8|C|HM)?)(\d+\.\d+|\d+)\w'

# Run the extraction once; column 0 holds group 1 and column 1 holds group 2.
sample_parts = avg_df['Sample'].str.extract(pattern)
avg_df['NewSample'] = sample_parts[0]
avg_df['Estandar'] = sample_parts[1]

# print(avg_df['Sample'], avg_df['Estandar'])
print(avg_df)

# for sample in samples:
#     for index, row in averages_dataframes.iterrows():
#         if sample in row['Sample']:
#             print(sample, row['Sample'])

# for index, row in avg_df.iterrows():
#     matched_samples = [sample for sample in samples if sample in row['Sample']]
#     print(matched_samples, row['Sample'])
    



        Sample         Area  RetTime Estandar NewSample
0      C0.1PPM     7.512800   1.8905      0.1         C
1      C0.5PPM    33.417930   1.8905      0.5         C
2       C100PM  1706.163820   1.9040      100         C
3      C100PPM  1561.726440   1.9040      100         C
4       C10PPM   644.952270   1.8915       10         C
5        C1PPM    71.967610   1.8920        1         C
6       C25PPM   409.115050   1.9030       25         C
7       C50PPM   852.483340   1.9035       50         C
8   CCD80.1PPM     5.507960   2.4090      0.1      CCD8
9   CCD80.5PPM    11.354160   2.4090      0.5      CCD8
10   CCD810PPM   127.533410   2.4230       10      CCD8
11    CCD81PPM    71.066930   2.4100        1      CCD8
12   CCD825PPM   445.546140   2.4240       25      CCD8
13   CCD850PPM   839.250430   2.4220       50      CCD8
14    CCD85PPM   127.842270   2.4110        5      CCD8
15    HM0.1PPM    10.434045   1.5340      0.1        HM
16    HM0.5PPM    52.797825   1.5345      0.5   