# Regex

In [1]:
#load the comments_table.pickle
import pickle
from pathlib import Path
import pandas as pd

table_name = "full_comments_renamed.pickle"
data_dir = Path('../Data')
file_path = data_dir / table_name

with open(file=file_path, mode="rb") as f:
    data = pickle.load(f)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,ProductID,FullComment
0,10001,Warning: Unindexed pattern. Unit Cell Data Sou...
1,10002,Warning: Unindexed pattern. Unit Cell Data Sou...
2,10003,Warning: Unindexed pattern. Unit Cell Data Sou...
3,10004,Warning: Unindexed pattern. Unit Cell Data Sou...
4,10005,Warning: Unindexed pattern. Unit Cell Data Sou...


In [2]:
df.iloc[-1].to_dict()

{'ProductID': '050104886',
 'FullComment': 'Color: colourless. General Comments: Reported GOF=0.954. Habit: Prismatic. Structures: 10.1107/S2056989019007989. Article Title: Crystal structure and DFT study of benzyl 1-benzyl-2-oxo-1,2-dihydroquinoline-4-carboxylate. Temperature of Data Collection: Pattern taken at 296 K. Unit Cell Data Source: Single Crystal.'}

In [5]:
import re

def parse_full_comment(full_comment):
    # Matches key: value. where value can contain periods
    pattern = r'([A-Za-z\s]+):\s*(.*?)(?=\. (?=[A-Za-z\s]+:)|\.$)'
    
    matches = re.findall(pattern, full_comment)
    
    return {k.strip(): v.strip().rstrip('.') for k, v in matches}


# Example usage:
full_comment = "Color: colourless. General Comments: Reported GOF=0.954. Habit: Prismatic. Structures: 10.1107/S2056989019007989. Article Title: Crystal structure and DFT study of benzyl 1-benzyl-2-oxo-1,2-dihydroquinoline-4-carboxylate. Temperature of Data Collection: Pattern taken at 296 K. Unit Cell Data Source: Single Crystal."
parsed_data = parse_full_comment(full_comment)
print(parsed_data)


{'Color': 'colourless', 'General Comments': 'Reported GOF=0.954', 'Habit': 'Prismatic', 'Structures': '10.1107/S2056989019007989', 'Article Title': 'Crystal structure and DFT study of benzyl 1-benzyl-2-oxo-1,2-dihydroquinoline-4-carboxylate', 'Temperature of Data Collection': 'Pattern taken at 296 K', 'Unit Cell Data Source': 'Single Crystal'}


In [6]:
df_test = df.sample(n=10000, random_state=42)

# Apply the parser to each row in the FullComment column
parsed_df = df_test["FullComment"].apply(parse_full_comment)

# Turn the Series of dicts into a DataFrame
parsed_df = pd.DataFrame(parsed_df.tolist(), index=df_test.index)

# Combine original df with the new parsed columns
df_combined = pd.concat([df_test, parsed_df], axis=1)


In [15]:
print(len(df_combined.columns))
for col in df_combined.columns:
    if len(col)>40:
        print(col)

139
This is not a subgroup of the garnet structure
In each complex unit the two mixed metal sites are occupied by Pr
The solid separated was purified by column chromatography using hexane
Same calculation with PBE functional revised for solids
Phase III of Ti Ni is made up itself of three phases
Contains two phases with identical powder patterns


In [16]:
# improve the parsing function

import re

def parse_full_comment(full_comment):
    # Pattern for key-value pairs
    pattern = r'([A-Za-z\s]+):\s*(.*?)(?=\. (?=[A-Za-z\s]+:)|\.$)'

    matches = re.findall(pattern, full_comment)

    # Only keep matches where all words in key start with uppercase letters
    def is_valid_key(key):
        return all(word.istitle() for word in key.split())

    parsed = {k.strip(): v.strip().rstrip('.') for k, v in matches if is_valid_key(k.strip())}

    return parsed


In [17]:
df_test = df.sample(n=10000, random_state=42)

# Apply the parser to each row in the FullComment column
parsed_df = df_test["FullComment"].apply(parse_full_comment)

# Turn the Series of dicts into a DataFrame
parsed_df = pd.DataFrame(parsed_df.tolist(), index=df_test.index)

# Combine original df with the new parsed columns
df_combined = pd.concat([df_test, parsed_df], axis=1)

In [19]:
print(len(df_combined.columns))
for col in df_combined.columns:
    if len(col)>40:
        print(col)
print(df_combined.columns)

56
Index(['ProductID', 'FullComment', 'Color', 'Unit Cell Data Source',
       'Melting Point', 'Structures', 'Note',
       'Calculated Pattern Original Remarks', 'Analysis', 'Article Title',
       'Compound Preparation', 'Hypothetical Structure', 'Unit Cell',
       'Chemically Related', 'Optical Data', 'Bioactivity',
       'Additional Patterns', 'Sensitivity', 'Isomorphism',
       'In Situ Condition', 'Vickers Hardness Number',
       'Absolute Configuration', 'Processing Information', 'Refractive Index',
       'Powder Data', 'Temperature Factor', 'Delete', 'Raw Data Comment',
       'Reflectance', 'Data Collection', 'D', 'Other Cell', 'Functional',
       'Part I', 'Dy', 'Tm', 'Atomic Position', 'B', 'Water Loss',
       'Boiling Point', 'E', 'Atomic Coordinates', 'Search Algorithm'],
      dtype='object')


In [20]:
# from tqdm.notebook import tqdm
# tqdm.pandas()
# import time

# # Apply the parser to each row in the FullComment column
# parsed_dicts = df["FullComment"].progress_apply(parse_full_comment)
# print(f"Parsing completed!")

# with tqdm(total=2, desc="Combining DataFrames") as pbar:
#     parsed_df = pd.DataFrame(parsed_dicts.tolist(), index=df.index)
#     pbar.update(1)
#     df_combined = pd.concat([df, parsed_df], axis=1)
#     pbar.update(1)


In [21]:
from tqdm.notebook import tqdm
import time

tqdm.pandas(desc="Parsing")

start = time.time()
parsed_dicts = df_test["FullComment"].progress_apply(parse_full_comment)
parsed_dicts = parsed_dicts.copy()  # Force materialization
print(f"✅ Parsing completed in {time.time() - start:.2f} seconds")

start = time.time()
parsed_df = pd.DataFrame(parsed_dicts.tolist(), index=df_test.index)
print(f"✅ Dict-to-DataFrame took {time.time() - start:.2f} seconds")

start = time.time()
df_combined = pd.concat([df_test, parsed_df], axis=1)
print(f"✅ Concatenation took {time.time() - start:.2f} seconds")


Parsing:   0%|          | 0/10000 [00:00<?, ?it/s]

✅ Parsing completed in 0.15 seconds
✅ Dict-to-DataFrame took 0.02 seconds
✅ Concatenation took 0.01 seconds


In [7]:
len(df)*0.02/3600

6.134094444444445

## Parallelized version

In [22]:
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import time

start = time.time()
# Clean input
comments = df_test["FullComment"].tolist()

# Optional: redefine safe parser (just in case)
def parse_full_comment_safe(comment):
    try:
        return parse_full_comment(comment)
    except Exception:
        return {}

# Run in parallel using threads
n_workers = 8  # adjust based on your CPU

with ThreadPoolExecutor(max_workers=n_workers) as executor:
    parsed_rows = list(tqdm(executor.map(parse_full_comment_safe, comments), total=len(comments), desc="Threaded parsing"))

# Convert to DataFrame
parsed_df = pd.DataFrame(parsed_rows, index=df_test.index)

# Combine with original
df_combined = pd.concat([df_test, parsed_df], axis=1)

end = time.time()
print(f"Processing time: {end - start:.2f} sec / {(end - start)/3600}")



Threaded parsing:   0%|          | 0/10000 [00:00<?, ?it/s]

Processing time: 0.24 sec / 6.687084833780925e-05


In [23]:
estimated_time = len(df)/len(df_test)*(end - start)
print(f"Estimated processing time: {estimated_time} sec")

Estimated processing time: 26.58044803361893 sec


In [24]:
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import time

start = time.time()
# Clean input
comments = df["FullComment"].tolist()

# Optional: redefine safe parser (just in case)
def parse_full_comment_safe(comment):
    try:
        return parse_full_comment(comment)
    except Exception:
        return {}

# Run in parallel using threads
n_workers = 16  # adjust based on your CPU

with ThreadPoolExecutor(max_workers=n_workers) as executor:
    parsed_rows = list(tqdm(executor.map(parse_full_comment_safe, comments), total=len(comments), desc="Threaded parsing"))

# Convert to DataFrame
parsed_df = pd.DataFrame(parsed_rows, index=df.index)

# Combine with original
df_combined = pd.concat([df, parsed_df], axis=1)

end = time.time()
print(f"Processing time: {end - start:.2f} sec / {(end - start)/3600}")

Threaded parsing:   0%|          | 0/1104137 [00:00<?, ?it/s]

Processing time: 31.78 sec / 0.008828652501106262


In [26]:
df_combined.columns, len(df_combined.columns)

        'General Comments', 'Reason O Quality Was Assigned',
        'Deleted Or Rejected By', 'Color', 'Melting Point', 'Water Loss',
        ...
        'X', 'Phasons', 'Homologous Series', 'Calc', 'Preparation Reference',
        'Procedure', 'Solution', 'Slurry', 'Kneading', 'Solution B'],
       dtype='object', length=228),
 228)

In [29]:
print(len(df_combined.columns))
for col in df_combined.columns:
    if len(col)>20:
        print(col)
print(df_combined.columns)

228
Unit Cell Data Source
Reason O Quality Was Assigned
Deleted Or Rejected By
Additional Diffraction Lines
Vickers Hardness Number
Authors Phase Designation
Parabromochlorobenzene
Parachloroiodobenzene
Processing Information
Calculated Pattern Original Remarks
Hypothetical Structure
Absolute Configuration
Terbium Manganese Iron Antimony Tellurium
Drei Schichtstrukturen
Tungsten Tetrachloride
Monopotassium Pentachromium Octaselenide
Dissociation Pressure Measurements
Preparation Reference
       'General Comments', 'Reason O Quality Was Assigned',
       'Deleted Or Rejected By', 'Color', 'Melting Point', 'Water Loss',
       ...
       'X', 'Phasons', 'Homologous Series', 'Calc', 'Preparation Reference',
       'Procedure', 'Solution', 'Slurry', 'Kneading', 'Solution B'],
      dtype='object', length=228)


In [33]:
df[df["FullComment"].str.contains("Monopotassium Pentachromium Octaselenide", na=False)].to_dict()


{'ProductID': {850469: '040093825'},

In [34]:
df[df["FullComment"].str.contains("Terbium Manganese Iron Antimony Tellurium", na=False)].to_dict()

{'ProductID': {124949: '000730928'},
 'FullComment': {124949: 'Analysis: An INCA-Energy-350 X-ray EDS spectrometer (Oxford Instruments) on the Jeol JSM-6480LV scanning electron microscope (20 kV accelerating voltage, 0.7 nA beam current and 0.050 mm beam diameter) was employed for the quantitative microprobe analyses. Signals averaged over four points per phase had estimated standard deviations of 0.5 at.% for Tb (measured by L-series lines), 0.8 at.% for Mn and Fe and 0.6% for Sb and Te (measured by K-series lines). Tb 66.9, Mn 2.2, Fe 8.3, Sb 12.1, Te 10.1. Color: Metallic whitish gray. General Comments: Tb6FeTe2- and Tb6FeSb2-based solid solution. Terbium Manganese Iron Antimony Tellurium: Fe2P-type (K2UF6-type) Tb6Mn0.25Fe0.75SbTe rare earth intermetallic compound (space group P-62m, N 189, hP9) (Tb6FeSb2- and Tb6FeTe2-based solid solution). Processing Information: Rietveld refinement. Physical property: The alloy is brittle. The visible oxidation of surface was observed for a few 

In [35]:
from pathlib import Path

output_path = Path("../Data/full_comments_parsed.pickle")

# save using pickle

with open(output_path, 'wb') as f:
    pickle.dump(df_combined, f)

print(f"Pickle file saved in {output_path}")
    

Pickle file saved in ../Data/full_comments_parsed.pickle


In [42]:
# test loading

with open(output_path, 'rb') as f:
    data = pickle.load(f)

df_parsed = pd.DataFrame(data)

df_sampled = df_parsed.sample(n=10, random_state=42)

In [43]:
df_sampled.iloc[4].dropna().to_dict()

{'ProductID': '020708054',
 'FullComment': 'CSD-refcode: NUTPEX. Formula from original source: C18 H15 N1 O4. Color: colorless. Habit: block. Melting Point: 458K. Note: Atomic coordinates were obtained by IUCr CIF collection. Calculated Pattern Original Remarks: SOURCE-CHEM: acetone/petroleum ether. Temperature of Data Collection: at 180 K. Unit Cell Data Source: Single Crystal.',
 'Unit Cell Data Source': 'Single Crystal',
 'Color': 'colorless',
 'Melting Point': '458K',
 'Note': 'Atomic coordinates were obtained by IUCr CIF collection',
 'Habit': 'block',
 'Calculated Pattern Original Remarks': 'SOURCE-CHEM: acetone/petroleum ether'}

# Convert to SQlite database

In [47]:
import sqlite3
from pathlib import Path

database_name = "comments.db"
data_dir = Path("../Data")

database_path = data_dir / database_name

# connect to database
conn=sqlite3.connect(database_path)

# Write the DataFrame to a table named "my_table"
df_combined.to_sql("comments", conn, if_exists="replace", index=False)

# Close connection
conn.close()


In [50]:
# Read the database for test
conn = sqlite3.connect(database_path)
df_loaded = pd.read_sql("SELECT * FROM comments", conn)
conn.close()

print(df_loaded)


         ProductID                                        FullComment  \
...            ...                                                ...   
1104132  050104882  Color: colourless. General Comments: Reported ...   
1104133  050104883  Color: yellow. General Comments: Reported GOF=...   
1104134  050104884  Color: brown. General Comments: Reported GOF=1...   
1104135  050104885  Color: dark blue. General Comments: Reported G...   
1104136  050104886  Color: colourless. General Comments: Reported ...   

0        Unindexed pattern    Powder Diffraction                None   
1        Unindexed pattern    Powder Diffraction                None   
2        Unindexed pattern    Powder Diffraction                None   
3        Unindexed pattern    Powder Diffraction                None   
4        Unindexed pattern    Powder Diffraction                None   
...                    ...                   ...                 ...   
1104132               None        Single Crystal  Report

In [52]:
# 2nd method
from pyprojroot import here
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine
db_path = str(here("Data")) + "/comments_2.db"
db_path = f"sqlite:///{db_path}"

engine = create_engine(db_path)
df_combined.to_sql("comments_2", engine, index=False)



1104137

In [54]:
df_combined.to_csv("../Data/comments.csv")

In [57]:
df_combined[df_combined["General Comments"].notna()]

Unnamed: 0,ProductID,FullComment,Warning,Unit Cell Data Source,General Comments,Reason O Quality Was Assigned,Deleted Or Rejected By,Color,Melting Point,Water Loss,...,X,Phasons,Homologous Series,Calc,Preparation Reference,Procedure,Solution,Slurry,Kneading,Solution B
5,000010006,General Comments: A mixture. Reason O Quality ...,,Powder Diffraction,A mixture,O assigned because unindexed,,,,,...,,,,,,,,,,
11,000010012,Color: White. General Comments: Decomposes at ...,Unindexed pattern,Powder Diffraction,Decomposes at 200°,,,White,,Loses 3 H2 O at 383 K,...,,,,,,,,,,
32,000010033,General Comments: A mixture. Warning: Unindexe...,Unindexed pattern,Powder Diffraction,A mixture,,,,,,...,,,,,,,,,,
56,000010057,Additional Patterns: See PDF 00-059-1260. Colo...,Lines with abs(delta 2Theta)>0.06 DEG,Powder Diffraction,Decomposes,,,Colorless,,,...,,,,,,,,,,
70,000010071,General Comments: A mixture. Warning: Unindexe...,Unindexed pattern,Powder Diffraction,A mixture,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1104132,050104882,Color: colourless. General Comments: Reported ...,,Single Crystal,Reported GOF=1.029,,,colourless,,,...,,,,,,,,,,
1104133,050104883,Color: yellow. General Comments: Reported GOF=...,,Single Crystal,Reported GOF=1.047,,,yellow,,,...,,,,,,,,,,
1104134,050104884,Color: brown. General Comments: Reported GOF=1...,,Single Crystal,Reported GOF=1.035,,,brown,,,...,,,,,,,,,,
1104135,050104885,Color: dark blue. General Comments: Reported G...,,Single Crystal,Reported GOF=1.036,,,dark blue,,,...,,,,,,,,,,
