In [1]:
import pandas as pd
import os
import nltk
import re

In [2]:
# Mount Google Drive at /content/drive so the dataset CSV stored there can be
# read like a local file (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the dataset on the mounted Drive; print whether the file is
# present before trying to load it.
file_path = '/content/drive/My Drive/gpt-4.csv'
print(os.path.exists(file_path))

True


In [4]:
# Load at most the first 3000 rows; malformed lines are skipped rather than
# raising an error.
# NOTE(review): the output shows an "Unnamed: 0" column, which looks like an
# index saved into the CSV - consider index_col=0 if it is not needed.
df = pd.read_csv(file_path, on_bad_lines='skip', nrows=3000)
df

Unnamed: 0,data,conversation
0,This 60-year-old male was hospitalized due to ...,"Doctor: Good morning, how are you feeling toda..."
1,A 39-year-old man was hospitalized due to an i...,"Doctor: Hello, how are you feeling today?\nPat..."
2,One week after a positive COVID-19 result this...,"Doctor: Hello, how are you feeling today?\nPat..."
3,This 69-year-old male was admitted to the ICU ...,"Doctor: Hello, how are you feeling today?\nPat..."
4,This 57-year-old male was admitted to the ICU ...,"Doctor: Good morning, how are you feeling toda..."
...,...,...
2995,A 43-year-old female with dyspnoea was transfe...,Doctor: Good afternoon. You were transferred t...
2996,A 63-year-old female presented in 2016 with pa...,"Doctor: Hello, how are you feeling today?\nPat..."
2997,"A 72 year old female, with no significant past...","Doctor: Good morning, how are you feeling toda..."
2998,A 32-year-old female at pregnancy week 22 was ...,"Doctor: Hello, how are you feeling today?\nPat..."


In [5]:
# Randomly sample 1000 rows for analysis; the fixed random_state keeps the
# sample identical from run to run.
sampled_df = df.sample(n=1000, random_state=42)
sampled_df

Unnamed: 0,data,conversation
1801,The third case was a 64-year-old female who ad...,"Doctor: Good afternoon, how are you feeling to..."
1190,A 34-year-old woman presented with severe epis...,"Doctor: Good morning, how can I help you today..."
1817,"A 53-year-old man, who had non-exertional ches...","Doctor: Hello, Mr. Johnson. How are you feelin..."
251,"A 34 year old woman from the east of Morocco, ...","Doctor: Good afternoon, Mrs. X. I see here in ..."
2505,A 19-year-old male of Caucasian origin was adm...,"Doctor: Hello, how are you feeling today?\nPat..."
...,...,...
618,A ten-year-old neutered female Cavalier King C...,"Doctor: Hello, how are you today?\nPatient: I'..."
406,A 65-year-old man with continuous irritable co...,"Doctor: Good morning, Mr. Johnson. How are you..."
1157,A 55-year-old woman without relevant medical h...,"Doctor: Hi, how are you feeling today?\nPatien..."
1068,A 52-year-old man was first diagnosed with HIV...,"Doctor: Hi there, how are you feeling today?\n..."


In [6]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by the preprocessing below: the punkt tokenizer
# data (both 'punkt' and 'punkt_tab' are requested) for word_tokenize, and the
# WordNet corpus for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Shared WordNet lemmatizer instance; preprocess_text uses it for every token.
lemmatizer = WordNetLemmatizer()

In [8]:
# Define a preprocessing function
def preprocess_text(text):
    """Lowercase, tokenize and lemmatize ``text``.

    Args:
        text: Raw text of one cell. Non-string values (for example ``None`` or
            the float NaN that pandas uses for an empty CSV cell) are treated
            as missing.

    Returns:
        str: The lemmatized tokens joined by single spaces, or ``''`` when
        ``text`` is not a string.

    Note:
        ``lemmatizer.lemmatize`` is called without a part-of-speech tag, so
        some verbs are mangled (the notebook output shows "was" -> "wa").
    """
    # Guard against missing values: calling .lower() on NaN/None would raise
    # AttributeError and abort the whole DataFrame.apply call.
    if not isinstance(text, str):
        return ''

    # Tokenization
    tokens = word_tokenize(text.lower())
    # Lemmatization
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmas)

In [9]:
# Build a processed_* column for each raw text column ("data" and
# "conversation") by running preprocess_text over every row.
for source_col, target_col in (
    ('data', 'processed_data'),
    ('conversation', 'processed_conversation'),
):
    sampled_df[target_col] = sampled_df[source_col].apply(preprocess_text)

In [10]:
# Display the sample, now including the processed_data and
# processed_conversation columns added above.
sampled_df

Unnamed: 0,data,conversation,processed_data,processed_conversation
1801,The third case was a 64-year-old female who ad...,"Doctor: Good afternoon, how are you feeling to...",the third case wa a 64-year-old female who adm...,"doctor : good afternoon , how are you feeling ..."
1190,A 34-year-old woman presented with severe epis...,"Doctor: Good morning, how can I help you today...",a 34-year-old woman presented with severe epis...,"doctor : good morning , how can i help you tod..."
1817,"A 53-year-old man, who had non-exertional ches...","Doctor: Hello, Mr. Johnson. How are you feelin...","a 53-year-old man , who had non-exertional che...","doctor : hello , mr. johnson . how are you fee..."
251,"A 34 year old woman from the east of Morocco, ...","Doctor: Good afternoon, Mrs. X. I see here in ...","a 34 year old woman from the east of morocco ,...","doctor : good afternoon , mrs. x. i see here i..."
2505,A 19-year-old male of Caucasian origin was adm...,"Doctor: Hello, how are you feeling today?\nPat...",a 19-year-old male of caucasian origin wa admi...,"doctor : hello , how are you feeling today ? p..."
...,...,...,...,...
618,A ten-year-old neutered female Cavalier King C...,"Doctor: Hello, how are you today?\nPatient: I'...",a ten-year-old neutered female cavalier king c...,"doctor : hello , how are you today ? patient :..."
406,A 65-year-old man with continuous irritable co...,"Doctor: Good morning, Mr. Johnson. How are you...",a 65-year-old man with continuous irritable co...,"doctor : good morning , mr. johnson . how are ..."
1157,A 55-year-old woman without relevant medical h...,"Doctor: Hi, how are you feeling today?\nPatien...",a 55-year-old woman without relevant medical h...,"doctor : hi , how are you feeling today ? pati..."
1068,A 52-year-old man was first diagnosed with HIV...,"Doctor: Hi there, how are you feeling today?\n...",a 52-year-old man wa first diagnosed with hiv ...,"doctor : hi there , how are you feeling today ..."


In [11]:
# Problem Statement: Generate concise medical summaries from lengthy doctor-patient conversations/patient data for faster review.
# Why: Concise medical summaries streamline documentation, saving time and improving efficiency for healthcare providers. They allow quick review of key information, aiding faster decision-making and better patient care.

In [12]:
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the medical text summarization pipeline using the pre-trained
# "Falconsai/medical_summarization" model from Hugging Face. According to the
# original author's note, this model is fine-tuned for summarizing medical
# texts such as clinical notes, patient records, and research articles.

summarizer = pipeline("summarization", model="Falconsai/medical_summarization")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


In [13]:
def remove_phrases(text):
    """Strip greetings and polite filler phrases from ``text``.

    Matching is case-insensitive and restricted to whole words/phrases.
    Surrounding punctuation is left in place; only runs of whitespace are
    collapsed afterwards.

    Args:
        text: Text to clean, either raw or already tokenized.

    Returns:
        str: ``text`` without the listed phrases, with whitespace runs
        collapsed to single spaces and leading/trailing whitespace removed.
    """
    # List of common phrases.
    # The contractions allow optional whitespace before the apostrophe because
    # the tokenized text this notebook passes in has the form "what 's up" /
    # "you 're welcome" (tokens re-joined with spaces), which the plain
    # "what's up" / "you're welcome" patterns never matched.
    phrases = [
        r"\bhello\b", r"\bhi\b", r"\bhey\b", r"\bhow are you\b",
        r"\bgood morning\b", r"\bgood afternoon\b", r"\bgood evening\b",
        r"\bwhat\s*'s up\b", r"\bhowdy\b", r"\bgreetings\b",
        r"\bthank you\b", r"\bthanks\b", r"\byou\s*'re welcome\b",
        r"\bappreciate it\b", r"\bsorry\b", r"\bokay\b", r"\bum\b", r"\byeah\b", r"\bsalutations\b"
    ]

    # Remove greetings and polite phrases
    cleaned_text = re.sub('|'.join(phrases), '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra spaces

    return cleaned_text

In [14]:
# Keyword Extraction using TF-IDF
def extract_keywords(text, top_n=None):
    """Extract keyword terms from ``text`` with a TF-IDF vectorizer.

    The vectorizer is fitted on ``text`` alone, with English stop words
    removed. Because only one document is fitted, every vocabulary term gets a
    positive score, so the ``score > 0`` filter keeps the whole vocabulary.
    Pass ``top_n`` to keep only the highest-scoring terms.

    Args:
        text: Document to analyse.
        top_n: Optional maximum number of keywords. ``None`` (the default)
            returns every term in the order given by
            ``get_feature_names_out()``; an integer returns at most that many
            terms, highest score first (ties keep vocabulary order).

    Returns:
        list: Keyword strings; ``[]`` when ``text`` is not a string, is blank,
        or contains nothing but stop words.
    """
    # Nothing to vectorize for missing or blank input.
    if not isinstance(text, str) or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty
        # (e.g. the text holds only stop words); report "no keywords" instead.
        return []

    terms = vectorizer.get_feature_names_out()
    scores = X.sum(axis=0).tolist()[0]
    scored_terms = [(word, score) for word, score in zip(terms, scores) if score > 0]

    if top_n is None:
        return [word for word, _ in scored_terms]

    # sorted() is stable, so equally scored terms keep their vocabulary order.
    ranked = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:max(top_n, 0)]]

In [15]:
# Summary generator for processed_data text
def generate_summary_for_processed_data(text, summarizer):
    """Summarize ``text`` with ``summarizer`` using length bounds 50-120.

    Args:
        text: Text to summarize.
        summarizer: Callable invoked as ``summarizer(text, **options)`` whose
            result is indexable and holds a mapping with a ``'summary_text'``
            key at position 0 (in this notebook, the summarization pipeline).

    Returns:
        The ``'summary_text'`` value of the first result.
    """
    generation_options = {'max_length': 120, 'min_length': 50, 'do_sample': False}
    results = summarizer(text, **generation_options)
    first_result = results[0]
    return first_result['summary_text']

In [16]:
# Summary generator for processed_conversation text
def generate_summary_for_processed_conversation(text, summarizer):
    """Summarize ``text`` with ``summarizer`` using length bounds 150-350.

    Args:
        text: Text to summarize.
        summarizer: Callable invoked as ``summarizer(text, **options)`` whose
            result is indexable and holds a mapping with a ``'summary_text'``
            key at position 0 (in this notebook, the summarization pipeline).

    Returns:
        The ``'summary_text'`` value of the first result.
    """
    generation_options = {'max_length': 350, 'min_length': 150, 'do_sample': False}
    results = summarizer(text, **generation_options)
    first_result = results[0]
    return first_result['summary_text']

In [17]:
# Bundle keywords and summary for a processed_data text
def create_medical_summary_for_processed_data(text, summarizer):
    """Build the structured summary for one processed_data text.

    Args:
        text: Text to analyse.
        summarizer: Summarization callable forwarded to
            ``generate_summary_for_processed_data``.

    Returns:
        dict: ``{"Keywords": <result of extract_keywords>, "Summary": <summary text>}``.
    """
    # Keywords are computed first, then the summary.
    return {
        "Keywords": extract_keywords(text),
        "Summary": generate_summary_for_processed_data(text, summarizer),
    }

In [18]:
# Bundle keywords and summary for a processed_conversation text
def create_medical_summary_for_processed_conversation(text, summarizer):
    """Build the structured summary for one processed_conversation text.

    Args:
        text: Text to analyse.
        summarizer: Summarization callable forwarded to
            ``generate_summary_for_processed_conversation``.

    Returns:
        dict: ``{"Keywords": <result of extract_keywords>, "Summary": <summary text>}``.
    """
    # Keywords are computed first, then the summary.
    return {
        "Keywords": extract_keywords(text),
        "Summary": generate_summary_for_processed_conversation(text, summarizer),
    }

In [24]:
# Summarization
# Process the first 5 rows of processed_conversation
for index, row in sampled_df.head().iterrows():  # head() yields the first 5 rows
    conversation_text = row['processed_conversation']  # Extract the processed_conversation text
    conversation_text = remove_phrases(conversation_text)  # Remove phrases
    # Conversations go through their own builder, which summarizes with the
    # longer length bounds (150-350) instead of the processed_data bounds (50-120).
    summary = create_medical_summary_for_processed_conversation(conversation_text, summarizer)

    # Output the structured summary for the current row
    print(f"Summary for row {index}:")
    print(f"Keywords: {summary['Keywords']}")
    print(f"Summary: {summary['Summary']}")
    print("\n" + "-"*50 + "\n")

Summary for row 1801:
Keywords: ['10', '24', '36', 'able', 'actually', 'administered', 'administration', 'admission', 'admitted', 'alright', 'attack', 'based', 'bleeding', 'blockage', 'blood', 'brain', 'cause', 'cerebellar', 'chest', 'clot', 'complication', 'confirm', 'confirmed', 'consciousness', 'conservatively', 'control', 'coronary', 'day', 'determine', 'diagnosis', 'did', 'discharged', 'dissolve', 'doctor', 'doe', 'electrocardiogram', 'elevation', 'embolism', 'evaluation', 'explaining', 'extubate', 'feeling', 'going', 'great', 'having', 'heart', 'hemorrhage', 'hmm', 'hospital', 'hour', 'hypotension', 'imaging', 'infarction', 'intervention', 'intubate', 'issue', 'later', 'll', 'loss', 'massive', 'medical', 'myocardial', 'need', 'non', 'oh', 'open', 'pain', 'patient', 'pci', 'percutaneous', 'perform', 'persistent', 'possible', 'previous', 'procedure', 'pulmonary', 'rectal', 'remember', 'reteplase', 'run', 'showed', 'st', 'start', 'step', 'symptom', 'test', 'today', 'treated', 'treat

In [20]:
# Summarization
# Summarize the first 5 rows of processed_data
for index, row in sampled_df.head().iterrows():
    # Strip greetings/filler from the processed case description, then build
    # its keyword + summary dictionary.
    data_text = remove_phrases(row['processed_data'])
    summary = create_medical_summary_for_processed_data(data_text, summarizer)

    # Report the structured summary for this row, followed by a separator
    print(f"Summary for row {index}:")
    print(f"Keywords: {summary['Keywords']}")
    print(f"Summary: {summary['Summary']}")
    print("\n" + "-"*50 + "\n")

Summary for row 1801:
Keywords: ['10', '24', '36', '64', 'administered', 'administration', 'admission', 'admitted', 'brain', 'case', 'cerebellar', 'chest', 'complication', 'confirmed', 'consciousness', 'conservatively', 'control', 'coronary', 'day', 'discharged', 'elevation', 'extubate', 'female', 'following', 'hemorrhage', 'hour', 'hypotension', 'imaging', 'infarction', 'intervention', 'intubated', 'loss', 'massive', 'myocardial', 'non', 'old', 'pain', 'patient', 'pci', 'percutaneous', 'persistent', 'pte', 'rectorrhagia', 'reteplase', 'showed', 'st', 'treated', 'underwent', 'wa', 'year']
Summary: the third case wa a 64-year-old female who admitted with chest pain and non-st elevation myocardial infarction ; during admission she underwent percutaneous coronary intervention . two day following pci , the patient had persistent chest pain with hypotension and loss of consciousness , and wa intubated . imaging confirmed massive pte and reteplase wa administered .

-------------------------