In [1]:
# -*- coding: UTF-8 -*-
from glob import glob
import sys
import numpy as np
import pandas as pd
from scipy.stats import t
import tabula

In [2]:
def read_table_from_pdf(pdf_name: str) -> pd.DataFrame:
    """Parse page 1 of a PDF report and return the first table tabula finds.

    :param pdf_name: path of the PDF file handed to ``tabula.io.read_pdf``.
    :return: the first parsed table; this is the one expected to hold the XRD peak data.
    """
    parsed_tables = tabula.io.read_pdf(
        pdf_name,
        pages='1',
        # header=None is forwarded to pandas -- presumably so that no data row
        # is consumed as column names (columns end up labelled 0, 1, 2, ...).
        pandas_options={'header': None},
    )
    return parsed_tables[0]

In [3]:
def read_sample_name_from_pdf(pdf_name: str) -> tuple[str, str]:
    """Read the sample data and the sample name from page 2 of a PDF report.

    :param pdf_name: path of the PDF file handed to ``tabula.io.read_pdf``.
    :return: a 2-tuple ``(sample_data, sample_name)`` taken from rows 2 and 3
        of the first column of the second table parsed on page 2.
    """
    parsed_tables = tabula.io.read_pdf(
        pdf_name,
        pages='2',
        lattice=True,
        pandas_options={'header': None},
    )
    # The sample description lives in the second table of the page
    # (index 1), first column, rows 2 and 3.
    info_table = parsed_tables[1]
    sample_data = info_table.at[2, 0]
    sample_name = info_table.at[3, 0]
    return sample_data, sample_name

In [4]:
def clean_df(df: pd.DataFrame, min_intensity: float = 10) -> pd.DataFrame:
    """Clean the raw XRD table and keep only the sufficiently intense peaks.

    The useful data of the XRD measurement is represented as the columns
    '2Theta', 'd', 'I/I1', 'FWHM' and 'Integrated Int'.

    The input frame is NOT modified; a new, independent DataFrame is returned,
    so later column assignments on the result do not raise
    ``SettingWithCopyWarning``.

    :param df: raw table with integer row and column labels (as produced by
        parsing with ``header=None``); it must have 7 columns and rows 0-5.
    :param min_intensity: peaks whose relative intensity 'I/I1' is not strictly
        greater than this value are discarded (default 10).
    :return: cleaned float DataFrame; the index keeps the positions the peaks
        had before the low-intensity rows were removed.
    :raises KeyError: if the expected row or column labels are missing.
    :raises ValueError: if a remaining cell cannot be converted to float.
    """
    cleaned = (
        # rows 0-5 are presumably the table's header lines -- not numeric data
        df.drop(index=[0, 1, 2, 3, 4, 5])
        .reset_index(drop=True)
        # columns 0 and 2 are not used by the analysis
        .drop(columns=[0, 2])
    )
    cleaned.columns = ['2Theta', 'd', 'I/I1', 'FWHM', 'Integrated Int']
    cleaned = cleaned.astype(float, errors='raise')
    # discard low intensity data; .copy() detaches the result from `cleaned`
    return cleaned[cleaned['I/I1'] > min_intensity].copy()

In [23]:
def get_user_input(k=0.94, l=1.54056) -> tuple:
    """Ask the user whether to keep the default Scherrer parameters or enter new ones.

    :param k: Scherrer's constant returned when the user keeps the defaults.
        k varies from 0.68 to 2.08; k = 0.94 for spherical crystallites with cubic symmetry.
    :param l: X-ray wavelength (in Å) returned when the user keeps the defaults.
    :return: tuple ``(k, l)``. The custom values are used only when BOTH were
        entered as valid numbers; otherwise the defaults are returned unchanged.
    """
    answer = input('Do you want to use the default values of k=0.94 (for spherical crystallites with cubic symmetry) and λ=1.54056 Å? Enter y/n.')
    # tolerate surrounding whitespace and any letter case, e.g. ' No '
    answer = answer.strip().lower()
    if answer in ('n', 'no'):
        try:
            custom_k = float(input("Enter K - Scherrer's  constant. K varies from 0.68 to 2.08. K = 0.94 for spherical crystallites with cubic symmetry"))
            custom_l = float(input('Enter λ - X-ray wavelength.'))
        except ValueError:
            print("Please enter the values with dot only e.g. 0.94 or 1.54056.")
        else:
            # Assign only after both values parsed, so a bad second entry
            # cannot leave a custom K paired with the default wavelength.
            k, l = custom_k, custom_l
    elif answer not in ('yes', 'y'):
        print('Please enter y/n.')
    print(f'The program is using the following values: k={k} and λ={l} Å,')
    return k, l

In [13]:
def calc_size(df: pd.DataFrame, k: float, l:float) -> pd.DataFrame:
    """Add a 'particle_size, nm' column computed with the Scherrer equation.

    size = k * l / (beta * cos(theta)), where theta is half of the '2Theta'
    angle and beta is the 'FWHM' value, both converted from degrees to radians.
    The extra factor 10 in the denominator turns the result into the nanometres
    named by the column, assuming `l` is given in Å.

    :param df: frame with '2Theta' and 'FWHM' columns; it is modified in place.
    :param k: Scherrer's constant.
    :param l: X-ray wavelength.
    :return: the same DataFrame object, now holding the new column.
    """
    # 2Theta [deg] -> theta [rad]: halve the angle and convert in one step
    theta = df['2Theta'] * np.pi / 360
    # FWHM [deg] -> [rad], scaled by 10 and by cos(theta)
    broadening = df['FWHM'] * np.pi * 10 * np.cos(theta) / 180
    df['particle_size, nm'] = (k * l) / broadening
    return df

In [14]:
def calc_mean_size(df: pd.DataFrame) -> float:
    """Return the mean of the 'particle_size, nm' column, rounded to a whole number.

    :param df: frame that holds a 'particle_size, nm' column.
    :return: the rounded arithmetic mean of that column.
    """
    sizes = df['particle_size, nm']
    average = sizes.mean()
    return average.round()

In [15]:
def calc_error(df: pd.DataFrame, alpha=0.05) -> float:
    """Return the rounded half-width of the confidence interval of the mean particle size.

    error = s * t(1 - alpha/2, n - 1) / sqrt(n), where s is the sample standard
    deviation of the 'particle_size, nm' column and n is the number of points.

    :param df: frame that holds a 'particle_size, nm' column.
    :param alpha: significance level, 5% by default.
    :return: the error rounded to a whole number.
    """
    sizes = df['particle_size, nm']
    # number of measurements i.e. points
    count = len(sizes)
    # two-sided Student's t critical value for count - 1 degrees of freedom
    t_critical = t.ppf(1 - alpha/2, (count - 1))
    half_width = sizes.std() * t_critical / np.sqrt(count)
    return half_width.round()

In [88]:
# def create_blank_csv():
#     """ Create blank csv file for storing data"""
#     df_empty = pd.DataFrame({'sample_name' : [], 'sample_data' : [], 'mean_size, nm' : [], 'error, nm' : []})
#     df_empty.to_csv('sherrer_size_data.csv')
#     return df_empty

In [92]:
def write_data_in_file(sample_name, sample_data, mean_size, error, file_path='sherrer_size_data.csv'):
    """Append one sample's result as a row of a CSV file.

    The file is opened in append mode so that results of several samples
    processed one after another accumulate instead of overwriting each other.
    The header line is written only when the file does not exist yet or is
    empty. Rows from earlier runs are kept, so delete the file first if a
    fresh table is wanted.

    :param sample_name: value stored in the 'sample_name' column.
    :param sample_data: value stored in the 'sample_data' column.
    :param mean_size: value stored in the 'mean_size, nm' column.
    :param error: value stored in the 'error, nm' column.
    :param file_path: destination CSV file (default 'sherrer_size_data.csv').
    """
    import os  # local import: only this function needs it

    row = pd.DataFrame({
        'sample_name': [sample_name],
        'sample_data': [sample_data],
        'mean_size, nm': [mean_size],
        'error, nm': [error],
    })
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    # The index column is still written (always 0 for a one-row frame) so the
    # layout stays compatible with files produced before this change.
    row.to_csv(file_path, mode='a', header=needs_header)

In [93]:
def main(file_name: str):
    """Process one PDF report: parse it, compute the crystallite size and store the result.

    :param file_name: path of the PDF file to process.
    """
    raw_table = read_table_from_pdf(file_name)
    sample_data, sample_name = read_sample_name_from_pdf(file_name)
    print(sample_data, sample_name)

    peaks = clean_df(raw_table)
    print(f'Data after cleaning: \n{peaks}')

    # the Scherrer parameters are asked for interactively for every file
    k, l = get_user_input()
    peaks = calc_size(peaks, k, l)

    mean_size = calc_mean_size(peaks)
    error = calc_error(peaks)
    print(f'The average crystallite size of the sample is {mean_size} nm, the error confidence interval is {error} nm\n')

    write_data_in_file(sample_name, sample_data, mean_size, error)



In [94]:
if __name__ == '__main__':
    # Process every PDF in the current directory; glob returns matches in
    # arbitrary order, so sort them to make the processing order reproducible.
    file_names = sorted(glob('*.pdf'))
    if not file_names:
        # Fall back to paths given on the command line. Inside a Jupyter kernel
        # sys.argv[1:] holds the kernel's own launch flags (e.g. '-f' and a
        # connection file), so keep only arguments that look like PDF files.
        file_names = [arg for arg in sys.argv[1:] if arg.lower().endswith('.pdf')]
    if not file_names:
        print('No PDF files found in the current directory or on the command line.')
    for name in file_names:
        main(file_name=name)


Data                : 8717 Sample Nmae         : Bi1.5 Sc0.5 O7_650_6h
Data after cleaning: 
     2Theta        d   I/I1    FWHM  Integrated Int
1   14.8695  5.95300   23.0  0.2621      11507438.0
2   28.6914  3.10891   18.0  0.2516       9146779.0
3   30.0005  2.97616  100.0  0.2587     497230830.0
5   34.7713  2.57796   26.0  0.2798      13149738.0
6   38.0086  2.36550   17.0  0.2952       8456391.0
8   49.9674  1.82380   24.0  0.3729     121110823.0
10  59.3395  1.55616   16.0  0.3906       7986966.0
The program is using the following values: k=0.94 and λ=1.54056 Å,
The average crystallite size of the sample is 30.0 nm, the error confidence interval is 4.0 nm

Data                : 9991 Sample Nmae         : KBi12TiO20_600_5h
Data after cleaning: 
     2Theta        d   I/I1    FWHM  Integrated Int
1   24.7040  3.60093   21.0  0.1520    1.267746e+09
3   27.6776  3.22044  100.0  0.1507    6.057520e+10
5   30.3825  2.93961   23.0  0.1649    1.368653e+09
6   32.8854  2.72137   68.0  0.