In [1]:
import pandas as pd
import numpy as np

In [5]:
import spacy
import spacy.lang.en.stop_words as stop_words

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
nlp = spacy.load('en_core_web_sm')

In [2]:
df = pd.read_csv('CLEANDATA/Clean_MT.csv', index_col=False)
df.head(10)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
5,Morbid obesity. Laparoscopic antecolic anteg...,Bariatrics,Laparoscopic Gastric Bypass,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r..."
6,"Liposuction of the supraumbilical abdomen, re...",Bariatrics,Liposuction,"PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma..."
7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram..."
8,Suction-assisted lipectomy - lipodystrophy of...,Bariatrics,Lipectomy - Abdomen/Thighs,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a..."
9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,..."


In [10]:
# preprocess the data

def preprocess(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        tokens.append(token.lemma_.lower())
    
    return ' '.join(tokens)

In [11]:
preprocess(df['transcription'].iloc[0])

'subjective   23 year old white female present complaint allergy   allergy live seattle think bad   past try claritin zyrtec   work short time lose effectiveness   allegra   summer begin week ago   appear work   counter spray prescription nasal spray   asthma do require daily medication think flare ,medications medication currently ortho tri cyclen allegra ,allergies know medicine allergy ,objective:,vitals   weight 130 pound blood pressure 124/78.,heent   throat mildly erythematous exudate   nasal mucosa erythematous swollen   clear drainage see   tm clear ,neck   supple adenopathy ,lungs   clear ,assessment   allergic rhinitis ,plan:,1   try zyrtec instead allegra   option use loratadine   think prescription coverage cheap ,2   sample nasonex spray nostril give week   prescription write'

In [12]:
corpus = df['transcription'].apply(preprocess).tolist()

In [13]:
corpus[:5]

['subjective   23 year old white female present complaint allergy   allergy live seattle think bad   past try claritin zyrtec   work short time lose effectiveness   allegra   summer begin week ago   appear work   counter spray prescription nasal spray   asthma do require daily medication think flare ,medications medication currently ortho tri cyclen allegra ,allergies know medicine allergy ,objective:,vitals   weight 130 pound blood pressure 124/78.,heent   throat mildly erythematous exudate   nasal mucosa erythematous swollen   clear drainage see   tm clear ,neck   supple adenopathy ,lungs   clear ,assessment   allergic rhinitis ,plan:,1   try zyrtec instead allegra   option use loratadine   think prescription coverage cheap ,2   sample nasonex spray nostril give week   prescription write',
 'past medical history difficulty climb stair difficulty airline seat tie shoe public seating lifting object floor   exercise time week home cardio   difficulty walk block flight stair   difficulty

In [15]:
# Create the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Convert the TF-IDF matrix to a DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the first 5 rows of the TF-IDF DataFrame
tfidf_df.head()

Unnamed: 0,00,000,0000000,000units,001,004,00am,00pm,01,013,...,zuba,zumi,zung,zygoma,zygomatic,zymar,zyprexa,zyrtec,zyvox,µiu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207816,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from textblob import TextBlob

# Example text
text = "I love this product! It's amazing and works perfectly."

# Create a TextBlob object
blob = TextBlob(text)

# Get the sentiment
sentiment = blob.sentiment

print(f"Sentiment: {sentiment}")