In [None]:
#Yue Liu

In [2]:
#%pip install textblob
import pandas as pd
import re
#from textblob import TextBlob
#from spellchecker import SpellChecker

# Read the CSV file
df = pd.read_csv('patient_notes.csv')

# Case Conversion
df['pn_history'] = df['pn_history'].str.lower()

print(df)

       pn_num  case_num                                         pn_history
0           0         0  17-year-old male, has come to the student heal...
1           1         0  17 yo male with recurrent palpitations for the...
2           2         0  dillon cleveland is a 17 y.o. male patient wit...
3           3         0  a 17 yo m c/o palpitation started 3 mos ago; \...
4           4         0  17yo male with no pmh here for evaluation of p...
...       ...       ...                                                ...
42141   95330         9  ms. madden is a 20 yo female presenting w/ the...
42142   95331         9  a 20 yo f came complain a dull 8/10 headache t...
42143   95332         9  ms. madden is a 20yo female who presents with ...
42144   95333         9  stephanie madden is a 20 year old woman compla...
42145   95334         9  patient is a 20 yo f who presents with a heada...

[42146 rows x 3 columns]


In [7]:
# Removing Punctuation and Special Characters
#df['pn_history'] = df['pn_history'].str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)

print(df)

       pn_num  case_num                                         pn_history
0           0         0  17yearold male has come to the student health ...
1           1         0  17 yo male with recurrent palpitations for the...
2           2         0  dillon cleveland is a 17 yo male patient with ...
3           3         0  a 17 yo m co palpitation started 3 mos ago not...
4           4         0  17yo male with no pmh here for evaluation of p...
...       ...       ...                                                ...
42141   95330         9  ms madden is a 20 yo female presenting w the w...
42142   95331         9  a 20 yo f came complain a dull 810 headache th...
42143   95332         9  ms madden is a 20yo female who presents with a...
42144   95333         9  stephanie madden is a 20 year old woman compla...
42145   95334         9  patient is a 20 yo f who presents with a heada...

[42146 rows x 3 columns]


We tried the TextBlob. But after taking 17 minutes that the first 100 rows have not been handled, we plan to try some other ways.

Then I tried the symspellpy. However, it cannot be installed by conda, only can be dealt with by pip. So I move to SpaCy.

Then I found the pyspellchecker is something I must use with SpaCy and also not able to installed by conda. So I return to symspellpy.

In [20]:
#pip install symspellpy
from multiprocessing import Pool
import pandas as pd
import numpy as np
from symspellpy import SymSpell, Verbosity
import pkg_resources

# Initialize and load SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# Function to correct spelling in a sentence
def correct_spelling(text):
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

# Function to apply correct_spelling to a Series
def apply_correct_spelling(series):
    return series.apply(correct_spelling)

# Function to parallelize
def parallelize_dataframe(df, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df_combined = pd.concat(pool.map(apply_correct_spelling, df_split))
    pool.close()
    pool.join()
    return df_combined

# Applying the function in parallel
# df_head = df.head(1000)
if __name__ == 'main':
	df['pn_history'] = parallelize_dataframe(df['pn_history'])
print(df)


       pn_num  case_num                                         pn_history
0           0         0  17-year-old male, has come to the student heal...
1           1         0  17 yo male with recurrent palpitations for the...
2           2         0  dillon cleveland is a 17 y.o. male patient wit...
3           3         0  a 17 yo m c/o palpitation started 3 mos ago; \...
4           4         0  17yo male with no pmh here for evaluation of p...
...       ...       ...                                                ...
42141   95330         9  ms. madden is a 20 yo female presenting w/ the...
42142   95331         9  a 20 yo f came complain a dull 8/10 headache t...
42143   95332         9  ms. madden is a 20yo female who presents with ...
42144   95333         9  stephanie madden is a 20 year old woman compla...
42145   95334         9  patient is a 20 yo f who presents with a heada...

[42146 rows x 3 columns]
