In [24]:
# -*- coding: UTF-8 -*-
"""Automatic XRD crystallite (grain) size calculator (Scherrer Equation)"""

from glob import glob
import sys
import numpy as np
import pandas as pd
from scipy.stats import t
import tabula

In [25]:
def read_table_from_pdf(pdf_name: str) -> pd.DataFrame:
    """Parse page 1 of a PDF file and return the first table found on it.

    :param pdf_name: path of the PDF file handed to tabula
    :return: the first DataFrame in the list tabula produces for page 1;
        the table is read without a header row, so columns are numbered
    """
    first_page_tables = tabula.io.read_pdf(
        pdf_name,
        pages='1',
        pandas_options={'header': None},
    )
    # The XRD data is expected to be the first parsed table.
    return first_page_tables[0]

In [26]:
def read_sample_name_from_pdf(pdf_name: str) -> tuple[str, str]:
    """Read page 2 of a PDF file to get the sample data and the sample name.

    Both values are taken from the second table tabula parses on page 2
    (lattice mode, no header row): the cell at row 2, column 0 holds the
    sample data and the cell at row 3, column 0 holds the sample name.
    Each cell is expected to look like ``'<label>: <value>'``.

    :param pdf_name: path of the PDF file handed to tabula
    :return: ``(sample_data, sample_name)``, each stripped of surrounding
        whitespace
    :raises IndexError: if fewer than two tables are parsed or a cell
        contains no colon
    """
    tables = tabula.io.read_pdf(pdf_name, pages='2', lattice=True, pandas_options={'header': None})
    info_table = tables[1]
    # Split on the FIRST colon only, so that values which themselves contain
    # a colon (for example a time stamp) are not cut short.
    sample_data = info_table.at[2, 0].split(':', 1)[1].strip()
    sample_name = info_table.at[3, 0].split(':', 1)[1].strip()
    return sample_data, sample_name

In [28]:
def clean_df(df: pd.DataFrame, min_intensity: float = 10) -> pd.DataFrame:
    """Return a cleaned copy of the raw XRD table; the input is left untouched.

    Steps:
      1. drop the rows labelled 0-5 and renumber the remaining rows from 0;
      2. drop the columns labelled 0 and 2;
      3. name the five remaining columns
         '2Theta', 'd', 'I/I1', 'FWHM', 'Integrated Int' and cast them to float;
      4. keep only rows whose 'I/I1' is strictly greater than ``min_intensity``.

    The row labels of the result are the ones assigned in step 1, i.e. they
    are not renumbered again after the intensity filter.

    :param df: raw table with integer row/column labels, as parsed from the PDF
    :param min_intensity: rows with 'I/I1' at or below this value are discarded
    :return: a new, independent DataFrame with the cleaned data
    :raises KeyError: if any of the row or column labels to drop is missing
    :raises ValueError: if a remaining value cannot be converted to float or
        the number of remaining columns is not five
    """
    # Non-mutating drops: the caller's frame stays intact, so the function can
    # safely be re-run on the same raw table (important in a notebook).
    cleaned = (
        df.drop(index=[0, 1, 2, 3, 4, 5])
        .reset_index(drop=True)
        .drop(columns=[0, 2])
    )
    cleaned.columns = ['2Theta', 'd', 'I/I1', 'FWHM', 'Integrated Int']
    cleaned = cleaned.astype(float)
    # Discard low intensity data. The explicit copy makes the result an
    # independent frame, so adding columns later raises no
    # SettingWithCopyWarning.
    return cleaned.loc[cleaned['I/I1'] > min_intensity].copy()

In [29]:
def get_user_input(k=0.94, l=1.54056) -> tuple:
    """Ask the user whether to keep the default k and λ or to enter new ones.

    The question is repeated until it is answered with y/yes or n/no
    (case-insensitive, surrounding whitespace ignored). After "no", both
    values are requested and the request is repeated until both parse as
    floats, so the returned pair is never a mix of a new k and a default λ.

    :param k: default Scherrer's constant. k varies from 0.68 to 2.08;
        k = 0.94 for spherical crystallites with cubic symmetry
    :param l: default X-ray wavelength (shown to the user in Å)
    :return: ``(k, l)`` - either the defaults or the values entered by the user
    """
    # The prompt shows the actual defaults passed in, including λ.
    question = (f'Do you want to use the default values of k={k} '
                f'(for spherical crystallites with cubic symmetry) and λ={l} Å? Enter y/n.')
    while True:
        answer = input(question).strip().lower()
        if answer in ('y', 'yes'):
            break
        if answer in ('n', 'no'):
            while True:
                try:
                    new_k = float(input("Enter K - Scherrer's  constant. K varies from 0.68 to 2.08. K = 0.94 for spherical crystallites with cubic symmetry"))
                    new_l = float(input('Enter λ - X-ray wavelength.'))
                except ValueError:
                    print("Please enter the values with dot only e.g. 0.94 or 1.54056.")
                else:
                    # Assign only once both numbers are valid.
                    k, l = new_k, new_l
                    break
            break
        print('Please enter y/n.')
    print(f'The program is using the following values: k={k} and λ={l} Å,')
    return k, l

In [30]:
def calc_size(df: pd.DataFrame, k: float, l: float) -> pd.DataFrame:
    """Calculate the particle size for every row with the Scherrer equation.

    size = k * l / (beta * cos(theta)), where theta is half of '2Theta' and
    beta is 'FWHM', both converted from degrees to radians. The extra factor
    10 in the denominator scales the result to the 'particle_size, nm' column
    (l is prompted for in Å elsewhere in this file).

    The input frame is not modified; a new frame with the extra column is
    returned.

    :param df: data with '2Theta' and 'FWHM' columns, in degrees
    :param k: Scherrer's constant
    :param l: X-ray wavelength
    :return: copy of ``df`` with an added 'particle_size, nm' column
    :raises KeyError: if '2Theta' or 'FWHM' is missing
    """
    # theta in radians: (2Theta / 2) * pi / 180
    radian = df['2Theta'] * np.pi / 360
    size_nm = (k * l) / (df['FWHM'] * np.pi * 10 * np.cos(radian) / 180)
    # assign() builds a new frame, so the caller's data stays unchanged and
    # no SettingWithCopyWarning is raised when df is a filtered slice.
    return df.assign(**{'particle_size, nm': size_nm})

In [31]:
def calc_mean_size(df: pd.DataFrame) -> float:
    """Return the average of the 'particle_size, nm' column, rounded to a whole number.

    :param df: data containing a 'particle_size, nm' column
    :return: the rounded mean particle size
    """
    sizes = df['particle_size, nm']
    average = sizes.mean()
    return average.round()

In [32]:
def calc_error(df: pd.DataFrame, alpha=0.05) -> float:
    """Return the rounded half-width of the confidence interval of the particle size.

    The value is ``std * t / sqrt(n)``, where ``std`` is the standard
    deviation of the 'particle_size, nm' column, ``n`` is the number of rows
    and ``t`` is the two-sided Student's t quantile for ``n - 1`` degrees of
    freedom.

    :param df: data containing a 'particle_size, nm' column
    :param alpha: significance level, 5% by default
    :return: the measurement error rounded to a whole number
    """
    sizes = df['particle_size, nm']
    # number of measurements i.e. points
    point_count = len(sizes)
    # two-sided Student's t value at the requested significance level
    t_value = t.ppf(1 - alpha/2, (point_count-1))
    half_width = sizes.std() * t_value / np.sqrt(point_count)
    return half_width.round()

In [33]:
def write_data_in_file(sample_name, sample_data, mean_size, error,
                       csv_path='sherrer_size_data.csv'):
    """Append one result row to a CSV file, creating the file when needed.

    The header row is written only when the file is empty (newly created or
    pre-existing with no content); otherwise just the data row is appended.

    :param sample_name: value for the 'sample_name' column
    :param sample_data: value for the 'sample_data' column
    :param mean_size: value for the 'mean_size, nm' column
    :param error: value for the 'error, nm' column
    :param csv_path: path of the CSV file to append to
    :raises OSError: if the file cannot be opened for writing
    """
    row = pd.DataFrame({'sample_name': [sample_name],
                        'sample_data': [sample_data],
                        'mean_size, nm': [mean_size],
                        'error, nm': [error],
                        })
    # newline='' is what pandas requires for text-mode handles passed to
    # to_csv; without it every row is followed by a blank line on Windows.
    with open(csv_path, mode='a', encoding='utf-8', newline='') as f:
        # In append mode the position right after opening is the current file
        # size, so 0 means "nothing written yet". Checking on the open handle
        # avoids a separate exists-then-open step.
        needs_header = f.tell() == 0
        row.to_csv(f, header=needs_header, index=False)

In [34]:
def main(file_name: str):
    """Process one PDF report: parse, clean, size, summarise, and export.

    Reads the XRD table and the sample details from ``file_name``, asks the
    user for k and λ, prints the cleaned data and the resulting mean size
    with its error, and writes the summary row to the CSV file.

    :param file_name: path of the PDF file to process
    """
    raw_table = read_table_from_pdf(file_name)
    sample_data, sample_name = read_sample_name_from_pdf(file_name)

    peaks = clean_df(raw_table)
    print(f'Data after cleaning: \n{peaks}')

    k, l = get_user_input()
    sized_peaks = calc_size(peaks, k, l)
    mean_size = calc_mean_size(sized_peaks)
    error = calc_error(sized_peaks)

    summary = (f'The average crystallite size of the sample {file_name} is {mean_size} nm, '
               f'the error confidence interval is {error} nm\n')
    print(summary)
    write_data_in_file(sample_name, sample_data, mean_size, error)

In [35]:
if __name__ == '__main__':
    # Process every PDF in the working directory (alphabetical order);
    # when there is none, fall back to the command-line arguments.
    file_names = sorted(glob('*.pdf')) or sys.argv[1:]
    for name in file_names:
        main(file_name=name)

Data after cleaning: 
     2Theta        d   I/I1    FWHM  Integrated Int
3   23.0149  3.86124   14.0  0.6249    1.802276e+08
6   29.3000  3.04570   12.0  0.4902    1.556215e+08
7   29.9970  2.97650  100.0  0.4265    1.256313e+10
8   32.6475  2.74065   40.0  0.3895    5.040527e+08
11  39.5013  2.27948   19.0  0.4795    2.395330e+08
12  46.8842  1.93629   17.0  0.5310    2.181268e+08
18  56.7009  1.62215   22.0  0.5621    2.776375e+08
The program is using the following values: k=0.89 and λ=1.54056 Å,
The average crystallite size of the sample OCR_calc/10010.pdf is 17.0 nm, the error confidence interval is 2.0 nm

Data after cleaning: 
     2Theta        d   I/I1    FWHM  Integrated Int
2   23.1266  3.84285   17.0  0.5721    2.520390e+08
4   29.9916  2.97703  100.0  0.5324    1.518622e+10
5   32.7655  2.73105   51.0  0.3748    7.675810e+08
7   39.6745  2.26993   23.0  0.4044    3.509330e+08
9   47.1061  1.92769   21.0  0.5140    3.257305e+08
14  56.9295  1.61618   22.0  0.6186    3.29955