In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the unlabeled posts; the path is resolved relative to the notebook's
# working directory.
data = pd.read_csv('unlabeled_healthcare_data.csv')

# Leave the preview as the cell's last expression so Jupyter renders it as a
# table instead of print()'s wrapped, column-split plain text.
data.head()

                                    aid          authorProfileId  \
0  b686214b-2f6e-4818-bc3f-2fe8a9597a79      annette-a-77577a248   
1  ba295f35-ff33-4f94-bd3c-1307c6f123ba      zhen-zhang-7715644b   
2  0ccf95a0-ebf5-4dfb-96c9-e87aa964efbc   d-sc-ms-bitsch-0464645   
3  fb88c3cc-62ab-408f-8ea9-0fdd3f8a9f9c  hesham-sherif-0706ab154   
4  fb88c3cc-62ab-408f-8ea9-0fdd3f8a9f9c  hesham-sherif-0706ab154   

             name                                        authorTitle  \
0       AnnetteA.        Access Consultant | Advocating AI Solutions   
1       ZhenZhang  Director, Statistician Lead, Medical & Real-Wo...   
2  D.Sc. MSBITSCH  Standard. ORCID: 0000-0003-2035-3471. Doctor o...   
3    HeshamSherif    Certified Psychiatrist in 3 different countries   
4    HeshamSherif    Certified Psychiatrist in 3 different countries   

                                    pid  \
0  6a55b3be-2b50-41c1-8413-f0dffc9670be   
1  2db7a951-672a-46aa-8223-a3b2fd2b915c   
2  b1ce6d34-fe99-43e5-a433-0c

In [None]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784 entries, 0 to 783
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   aid              784 non-null    object
 1   authorProfileId  784 non-null    object
 2   name             784 non-null    object
 3   authorTitle      784 non-null    object
 4   pid              784 non-null    object
 5   text             784 non-null    object
dtypes: object(6)
memory usage: 36.9+ KB


In [None]:
# Count the distinct values in each column.
data.nunique()

Unnamed: 0,0
aid,724
authorProfileId,693
name,693
authorTitle,687
pid,784
text,764


In [None]:
# Per-column tally of missing values.
data.isna().sum()

Unnamed: 0,0
aid,0
authorProfileId,0
name,0
authorTitle,0
pid,0
text,0


In [None]:
# Count rows that are exact repeats of an earlier row (all columns equal).
data.duplicated().sum()

0

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK corpora that the stop-word list and the lemmatizer rely on.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared by preprocess_text: a set gives O(1) stop-word membership checks.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Normalize a post into a space-separated string of lemmatized tokens.

    Steps: strip URLs and every non-letter character, lowercase, split on
    whitespace, drop tokens found in the module-level ``stop_words`` set, and
    lemmatize the remainder with the module-level ``lemmatizer``.

    Args:
        text: Raw post text. Non-string values (e.g. NaN or None from a
            missing cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives cleaning or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; a missing post has no tokens.
        return ''

    # Substitute a space (not the empty string) so that words separated only
    # by punctuation or a URL -- "drug/placebo", "well-known", "don't" -- are
    # split apart instead of being fused into a single bogus token.
    text = re.sub(r'http\S+|[^a-zA-Z\s]', ' ', text)
    tokens = text.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

In [None]:
# Add a cleaned copy of each post alongside the raw text.
data['processed_text'] = data['text'].apply(preprocess_text)

# Show raw vs. cleaned text side by side; as the cell's last expression the
# frame is rendered as a table rather than print()'s truncated plain text.
data[['text', 'processed_text']].head()

In [None]:
!pip install transformers
!pip install sentence-transformers



In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("tarasophia/Bio_ClinicalBERT_medical")
# model = AutoModelForSequenceClassification.from_pretrained("tarasophia/Bio_ClinicalBERT_medical")

In [None]:
import torch

# Stakeholder categories the zero-shot classifier chooses between.
candidate_labels = ["Healthcare Provider", "Patient", "Regulator", "Pharmaceutical Representative", "Payers and Insurers"]

# NOTE(review): when this checkpoint is loaded, transformers warns that it
# cannot determine the 'entailment' label id from the model config. The
# zero-shot pipeline is built around NLI-style models, so predictions from this
# checkpoint should be validated (or an NLI checkpoint substituted) before the
# labels are trusted.
classifier = pipeline(
    "zero-shot-classification",
    model="tarasophia/Bio_ClinicalBERT_medical",
    # Without an explicit device the pipeline stays on CPU even when a GPU is
    # present; use the first CUDA device if available, otherwise CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
)

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [None]:
def classify_text(text, author_title):
    """Return the top zero-shot label for a single post.

    Args:
        text: Post text handed to the module-level ``classifier`` together
            with ``candidate_labels``.
        author_title: Accepted so callers can pass the author's title, but
            currently not used in the classification.

    Returns:
        The first entry of the ``"labels"`` list in the classifier result.
        NOTE(review): the Hugging Face zero-shot pipeline documents this list
        as ordered by descending score -- confirm for the installed version.
    """
    ranked_labels = classifier(text, candidate_labels)["labels"]
    return ranked_labels[0]

In [None]:
# Label every post with its top category. This makes one classifier call per
# row; for a quick trial, run the same apply on data.sample(n=100) instead.
def _label_row(row):
    """Classify one DataFrame row from its text and author title."""
    return classify_text(row["text"], row["authorTitle"])


data["label"] = data.apply(_label_row, axis=1)

In [None]:
# Persist the labeled frame next to the notebook, omitting the row index.
labeled_csv_path = 'labeled_healthcare_data.csv'
data.to_csv(labeled_csv_path, index=False)

In [None]:
# Preview the first rows of the frame, which now carries the predicted `label` column.
data.head()

Unnamed: 0,aid,authorProfileId,name,authorTitle,pid,text,label
0,b686214b-2f6e-4818-bc3f-2fe8a9597a79,annette-a-77577a248,AnnetteA.,Access Consultant | Advocating AI Solutions,6a55b3be-2b50-41c1-8413-f0dffc9670be,"""Cannabidiol treatment is associated with broa...",Regulator
1,ba295f35-ff33-4f94-bd3c-1307c6f123ba,zhen-zhang-7715644b,ZhenZhang,"Director, Statistician Lead, Medical & Real-Wo...",2db7a951-672a-46aa-8223-a3b2fd2b915c,Data on early treatment for Bipolar-I disorder...,Regulator
2,0ccf95a0-ebf5-4dfb-96c9-e87aa964efbc,d-sc-ms-bitsch-0464645,D.Sc. MSBITSCH,Standard. ORCID: 0000-0003-2035-3471. Doctor o...,b1ce6d34-fe99-43e5-a433-0c0c28f429b5,#y24d067.2•Antipsychotic drugs list: publicati...,Regulator
3,fb88c3cc-62ab-408f-8ea9-0fdd3f8a9f9c,hesham-sherif-0706ab154,HeshamSherif,Certified Psychiatrist in 3 different countries,a25953cb-ffdf-4463-989f-d1a1d35b3403,"Clozapine, risperidone, aripiprazole, and olan...",Healthcare Provider
4,fb88c3cc-62ab-408f-8ea9-0fdd3f8a9f9c,hesham-sherif-0706ab154,HeshamSherif,Certified Psychiatrist in 3 different countries,482bb668-a919-4e50-a32d-10fb4a15b1d4,The suggested recommendations include the foll...,Regulator
