# To create calibration curves of each standard (HM, MeL, C, CCD8)
When running these series of commands you will be able to filter the pdfs based on their name

In [1]:
# To extract the tables from the PDFS in the iteration, pass down the START_MARKERS and END_MARKERS as props
import pandas as pd

def ExtractTable(data_array, START_MARKERS, END_MARKERS):
    for text in data_array:
        extracted_text = []

        for start_marker in START_MARKERS:
            start_index = text.find(start_marker) 

            if start_index != -1:
                for end_marker in END_MARKERS:
                    end_index = text.find(end_marker, start_index)
                    if end_index != -1:
                        extracted_text.append(text[start_index:end_index])

    return extracted_text

def CreateDataFrame(str_table):
    count = 0
    data = []
    header=[]

    for line in str_table:
        count += 1
        lines = line.strip().split('\n')
        header = lines[2].split()
        header[6] = 'Area[%]'
        data = [line.split() for line in lines[5:]]
        
    return pd.DataFrame(data, columns=header)


In [2]:
import os
import re
import PyPDF2

DIRECTORY_PATH = '../pdfs/'

pdf_files=[file for file in os.listdir(DIRECTORY_PATH) if file.endswith('.pdf')]

dataframe = pd.DataFrame(columns=['Peak', 'RetTime', 'Type', 'Width', 'Area', 'Height', 'Area[%]', 'Sample'])

for file_name in pdf_files:
    match = re.search(r'(.+ \d{4}-\d{2}-\d{2}|\d{2}-\d{2}-\d{2}) (.+?)\.pdf', file_name)
    if match:
        common_part = match.group(2)
        if any(value in common_part for value in ['HM', 'Mel', 'C', 'CCD8']):
            with open(DIRECTORY_PATH + file_name, 'rb') as pdf_file:
                # TEXT EXTRACTION FROM PDF
                pdf = PyPDF2.PdfReader(pdf_file)
                pdf_pages = [pdf.pages[i].extract_text() for i in range(1, len(pdf.pages))]

                # DATAFRAME CREATION
                start_marker = ['Signal 1:'] if 'Mel' in common_part or 'HM' in common_part else ['Signal 2:']
                str_table = ExtractTable(pdf_pages, start_marker, ['Totals :'])  
                df = CreateDataFrame(str_table)

                # SELECTING ONLY THE DESIRED ROW BY RETENTION TIME
                df['RetTime'] = pd.to_numeric(df['RetTime'], errors='coerce')
                ret_time = 1.7 if 'Mel' in common_part else (1.5 if 'HM' in common_part else(2.4 if 'CCD8' in common_part else 1.9))
                row_to_add = df[round(df['RetTime'], 1) == ret_time].copy()

                # CREATING NEW COLUMN WITH SAMPLE NAME
                common_part = common_part.replace(' X2', '').replace(' ', '').upper()
                row_to_add['Sample'] = common_part

                # CREATING MAIN DATAFRAME
                dataframe = pd.concat([dataframe, row_to_add], ignore_index=True)

                # CALCULATE THE AREA MEAN AND RETENTION TIME
                dataframe['Area'] = pd.to_numeric(dataframe['Area'], errors='coerce')
                dataframe['RetTime'] = pd.to_numeric(dataframe['RetTime'], errors='coerce')
                avg_df = dataframe.groupby('Sample').agg({'Area': 'mean', 'RetTime': 'mean'}).reset_index()


pattern = r'((?:MEL|CCD8|C|HM)?)(\d+\.\d+|\d+)\w*'

avg_df['NewSample'] = avg_df['Sample'].str.extract(pattern)[0]
avg_df['Standard'] = avg_df['Sample'].str.extract(pattern)[1]
avg_df = avg_df.drop(columns=['Sample'])

           Area  RetTime NewSample Standard
0      7.512800   1.8905         C      0.1
1     33.417930   1.8905         C      0.5
2   1706.163820   1.9040         C      100
3   1561.726440   1.9040         C      100
4    644.952270   1.8915         C       10
5     71.967610   1.8920         C        1
6    409.115050   1.9030         C       25
7    852.483340   1.9035         C       50
8      5.507960   2.4090      CCD8      0.1
9     11.354160   2.4090      CCD8      0.5
10   127.533410   2.4230      CCD8       10
11    71.066930   2.4100      CCD8        1
12   445.546140   2.4240      CCD8       25
13   839.250430   2.4220      CCD8       50
14   127.842270   2.4110      CCD8        5
15    10.434045   1.5340        HM      0.1
16    52.797825   1.5345        HM      0.5
17   842.855500   1.4990        HM       10
18   103.885330   1.5355        HM        1
19  1374.575625   1.4980        HM       20
20  4187.855715   1.5000        HM       50
21   608.764830   1.5360        

In [4]:
# Create empty DataFrames for each sample
columns = ['Area', 'RetTime', 'NewSample', 'Standard']

mel_df = pd.DataFrame(columns = columns)
hm_df = pd.DataFrame(columns = columns)
c_df = pd.DataFrame(columns = columns)
ccd8_df = pd.DataFrame(columns = columns)


# iterate through the avg_df to get a dataframe for each standard
for index, row in avg_df.iterrows():
    if row['NewSample'] == 'MEL':
        mel_df = pd.concat([mel_df, row.to_frame().T], ignore_index=True)
        mel_df['Standard'] = pd.to_numeric(mel_df['Standard'], errors='coerce')

    elif row['NewSample'] == 'HM':
        hm_df = pd.concat([hm_df, row.to_frame().T], ignore_index=True)
        hm_df['Standard'] = pd.to_numeric(hm_df['Standard'], errors='coerce')

    elif row['NewSample'] == 'C':
        c_df = pd.concat([c_df, row.to_frame().T], ignore_index=True)
        c_df['Standard'] = pd.to_numeric(c_df['Standard'], errors='coerce')

    elif row['NewSample'] == 'CCD8':
        ccd8_df = pd.concat([ccd8_df, row.to_frame().T], ignore_index=True)
        ccd8_df['Standard'] = pd.to_numeric(ccd8_df['Standard'], errors='coerce')



mel_df = mel_df.sort_values(by='Standard', ascending=True)
hm_df = hm_df.sort_values(by='Standard', ascending=True)
c_df = c_df.sort_values(by='Standard', ascending=True)
ccd8_df = ccd8_df.sort_values(by='Standard', ascending=True)

print(mel_df)

          Area RetTime NewSample  Standard
0     11.62394  1.7125       MEL       0.1
1      57.4997  1.7135       MEL       0.5
3   124.793235   1.714       MEL       1.0
6   614.177275   1.713       MEL       5.0
2   414.175875  1.7245       MEL      10.0
4   697.044675  1.7265       MEL      20.0
5  3374.338015   1.722       MEL      50.0


In [14]:
import numpy as np
import matplotlib.pyplot as plt




x = mel_df['Area']
y= mel_df['Standard']
# slope, intercept = np.polyfit(x,y)


# # Create the y = mx + b equation
# equation = f'y = {slope:.4f}x + {intercept:.4f}'

# # Create the linear regression line
# regression_line = slope * x + intercept

# # Plot the data points and the regression line
# plt.scatter(x, y, label='Data')
# plt.plot(x, regression_line, color='red', label='Regression Line')
# plt.xlabel('Area')
# plt.ylabel('Standard')
# plt.title('Linear Regression')
# plt.legend()
# plt.grid(True)

# # Show the plot
# plt.show()


# print(type(y[0]))
# slope

TypeError: _polyfit_dispatcher() missing 1 required positional argument: 'deg'