In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample medical transcript
medical_transcript = """
Patient presented with symptoms of cough and shortness of breath.
Physical examination revealed elevated temperature and wheezing.
Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.
"""

# Download stopwords list
nltk.download('stopwords')
nltk.download('punkt')

# Tokenize the medical transcript
tokens = word_tokenize(medical_transcript)

# Get list of stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from tokens
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Join the filtered tokens back into a string
filtered_transcript = ' '.join(filtered_tokens)

# Print the filtered transcript
print("Filtered Transcript:")
print(filtered_transcript)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siva7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siva7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Filtered Transcript:
Patient presented symptoms cough shortness breath . Physical examination revealed elevated temperature wheezing . Diagnosis confirmed bronchitis , prescribed antibiotics inhaler .


In [2]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Sample medical transcript
medical_transcript = """
Patient presented with symptoms of cough and shortness of breath.
Physical examination revealed elevated temperature and wheezing.
Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.
"""

# Tokenize the medical transcript
tokens = word_tokenize(medical_transcript.lower())  # Convert to lowercase for case-insensitive counting

# Calculate term frequency using Counter
term_frequency = Counter(tokens)

# Print the term frequency
print("Term Frequency:")
for term, frequency in term_frequency.items():
    print(f"{term}: {frequency}")


Term Frequency:
patient: 1
presented: 1
with: 1
symptoms: 1
of: 2
cough: 1
and: 3
shortness: 1
breath: 1
.: 3
physical: 1
examination: 1
revealed: 1
elevated: 1
temperature: 1
wheezing: 1
diagnosis: 1
confirmed: 1
as: 1
bronchitis: 1
,: 1
prescribed: 1
antibiotics: 1
inhaler: 1


In [3]:
import math
from collections import Counter
from nltk.tokenize import word_tokenize

# Sample medical transcript
medical_transcript = """
Patient presented with symptoms of cough and shortness of breath.
Physical examination revealed elevated temperature and wheezing.
Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler.
"""

# Tokenize the medical transcript
tokens = word_tokenize(medical_transcript.lower())  # Convert to lowercase for case-insensitive counting

# Count the number of documents containing each term
document_frequency = Counter(set(tokens))

# Calculate IDF for each term
num_documents = len(medical_transcript.splitlines())  # Assuming each line is a separate document
inverse_document_frequency = {term: math.log(num_documents / (document_frequency[term] + 1))
                               for term in document_frequency}

# Print the IDF for each term
print("Inverse Document Frequency (IDF):")
for term, idf in inverse_document_frequency.items():
    print(f"{term}: {idf}")


Inverse Document Frequency (IDF):
of: 0.6931471805599453
bronchitis: 0.6931471805599453
,: 0.6931471805599453
.: 0.6931471805599453
wheezing: 0.6931471805599453
shortness: 0.6931471805599453
symptoms: 0.6931471805599453
diagnosis: 0.6931471805599453
breath: 0.6931471805599453
prescribed: 0.6931471805599453
revealed: 0.6931471805599453
as: 0.6931471805599453
elevated: 0.6931471805599453
temperature: 0.6931471805599453
physical: 0.6931471805599453
with: 0.6931471805599453
examination: 0.6931471805599453
antibiotics: 0.6931471805599453
presented: 0.6931471805599453
and: 0.6931471805599453
patient: 0.6931471805599453
inhaler: 0.6931471805599453
confirmed: 0.6931471805599453
cough: 0.6931471805599453


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the medical transcripts to compute TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(medical_transcripts)

# Get the vocabulary and IDF values
vocabulary = tfidf_vectorizer.get_feature_names_out()
idf_values = tfidf_vectorizer.idf_

# Print the IDF values for each term
print("Inverse Document Frequency (IDF):")
for term, idf in zip(vocabulary, idf_values):
    print(f"{term}: {idf}")


Inverse Document Frequency (IDF):
and: 1.0
antibiotics: 1.6931471805599454
as: 1.6931471805599454
breath: 1.6931471805599454
bronchitis: 1.6931471805599454
confirmed: 1.6931471805599454
cough: 1.6931471805599454
diagnosis: 1.6931471805599454
elevated: 1.6931471805599454
examination: 1.6931471805599454
inhaler: 1.6931471805599454
of: 1.6931471805599454
patient: 1.6931471805599454
physical: 1.6931471805599454
prescribed: 1.6931471805599454
presented: 1.6931471805599454
revealed: 1.6931471805599454
shortness: 1.6931471805599454
symptoms: 1.6931471805599454
temperature: 1.6931471805599454
wheezing: 1.6931471805599454
with: 1.6931471805599454


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the medical transcripts to compute TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(medical_transcripts)

# Get the vocabulary and IDF values
vocabulary = tfidf_vectorizer.get_feature_names_out()
idf_values = tfidf_vectorizer.idf_

# Print the TF-IDF matrix
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Print the vocabulary and IDF values for each term
print("\nVocabulary and IDF values:")
for term, idf in zip(vocabulary, idf_values):
    print(f"{term}: {idf}")


TF-IDF Matrix:
[[0.17531933 0.         0.         0.29684142 0.         0.
  0.29684142 0.         0.         0.         0.         0.59368285
  0.29684142 0.         0.         0.29684142 0.         0.29684142
  0.29684142 0.         0.         0.29684142]
 [0.2344005  0.         0.         0.         0.         0.
  0.         0.         0.39687454 0.39687454 0.         0.
  0.         0.39687454 0.         0.         0.39687454 0.
  0.         0.39687454 0.39687454 0.        ]
 [0.21786941 0.36888498 0.36888498 0.         0.36888498 0.36888498
  0.         0.36888498 0.         0.         0.36888498 0.
  0.         0.         0.36888498 0.         0.         0.
  0.         0.         0.         0.        ]]

Vocabulary and IDF values:
and: 1.0
antibiotics: 1.6931471805599454
as: 1.6931471805599454
breath: 1.6931471805599454
bronchitis: 1.6931471805599454
confirmed: 1.6931471805599454
cough: 1.6931471805599454
diagnosis: 1.6931471805599454
elevated: 1.6931471805599454
examination: 1

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize CountVectorizer to convert text into a matrix of token counts
vectorizer = CountVectorizer()

# Fit and transform the medical transcripts
X = vectorizer.fit_transform(medical_transcripts)

# Calculate chi-square statistic and p-value for each feature
chi2_stat, p_values = chi2(X, [0, 1, 2])  # Assuming each transcript corresponds to a class label

# Print the chi-square statistic and p-value for each feature
print("Chi-Square Statistic and p-values:")
for feature, chi2_val, p_val in zip(vectorizer.get_feature_names_out(), chi2_stat, p_values):
    print(f"{feature}: chi2 = {chi2_val}, p-value = {p_val}")


Chi-Square Statistic and p-values:
and: chi2 = 0.0, p-value = 1.0
antibiotics: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
as: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
breath: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
bronchitis: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
confirmed: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
cough: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
diagnosis: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
elevated: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
examination: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
inhaler: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
of: chi2 = 4.000000000000001, p-value = 0.1353352832366126
patient: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
physical: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
prescribed: chi2 = 2.0000000000000004, p-value = 0.3678794411714423
presented: chi2 = 

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize CountVectorizer to convert text into a matrix of token counts
vectorizer = CountVectorizer()

# Fit and transform the medical transcripts
X = vectorizer.fit_transform(medical_transcripts)

# Dummy labels for demonstration purpose (replace with actual labels)
y = [1, 0, 1]  # Example binary classification labels

# Calculate information gain for each feature
information_gain = mutual_info_classif(X, y)

# Print the information gain for each feature
print("Information Gain for each feature:")
for feature, gain in zip(vectorizer.get_feature_names_out(), information_gain):
    print(f"{feature}: {gain}")


Information Gain for each feature:
and: 0.0
antibiotics: 0.1744160479215161
as: 0.1744160479215161
breath: 0.1744160479215161
bronchitis: 0.1744160479215161
confirmed: 0.1744160479215161
cough: 0.1744160479215161
diagnosis: 0.1744160479215161
elevated: 0.6365141682948129
examination: 0.6365141682948129
inhaler: 0.1744160479215161
of: 0.1744160479215161
patient: 0.1744160479215161
physical: 0.6365141682948129
prescribed: 0.1744160479215161
presented: 0.1744160479215161
revealed: 0.6365141682948129
shortness: 0.1744160479215161
symptoms: 0.1744160479215161
temperature: 0.6365141682948129
wheezing: 0.6365141682948129
with: 0.1744160479215161


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize CountVectorizer to convert text into a matrix of token counts
vectorizer = CountVectorizer()

# Fit and transform the medical transcripts
X = vectorizer.fit_transform(medical_transcripts)

# Dummy labels for demonstration purpose (replace with actual labels)
y = [1, 0, 1]  # Example binary classification labels

# Calculate mutual information for each feature
mutual_information = mutual_info_classif(X, y)

# Print the mutual information for each feature
print("Mutual Information for each feature:")
for feature, mi in zip(vectorizer.get_feature_names_out(), mutual_information):
    print(f"{feature}: {mi}")


Mutual Information for each feature:
and: 0.0
antibiotics: 0.1744160479215161
as: 0.1744160479215161
breath: 0.1744160479215161
bronchitis: 0.1744160479215161
confirmed: 0.1744160479215161
cough: 0.1744160479215161
diagnosis: 0.1744160479215161
elevated: 0.6365141682948129
examination: 0.6365141682948129
inhaler: 0.1744160479215161
of: 0.1744160479215161
patient: 0.1744160479215161
physical: 0.6365141682948129
prescribed: 0.1744160479215161
presented: 0.1744160479215161
revealed: 0.6365141682948129
shortness: 0.1744160479215161
symptoms: 0.1744160479215161
temperature: 0.6365141682948129
wheezing: 0.6365141682948129
with: 0.1744160479215161


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize CountVectorizer to convert text into a matrix of token counts
vectorizer = CountVectorizer()

# Fit and transform the medical transcripts
X = vectorizer.fit_transform(medical_transcripts)

# Convert the sparse matrix to a dense array
X_dense = X.toarray()

# Calculate variance of each feature
variances = X_dense.var(axis=0)

# Print the variances for each feature
print("Variance for each feature:")
for feature, variance in zip(vectorizer.get_feature_names_out(), variances):
    print(f"{feature}: {variance}")

# Apply variance thresholding
threshold = 0.01  # Example threshold value
selector = VarianceThreshold(threshold=threshold)
X_selected = selector.fit_transform(X_dense)

# Print the selected features
print("\nSelected features:")
selected_features = vectorizer.get_feature_names_out()[selector.get_support()]
print(selected_features)


Variance for each feature:
and: 0.0
antibiotics: 0.22222222222222224
as: 0.22222222222222224
breath: 0.22222222222222224
bronchitis: 0.22222222222222224
confirmed: 0.22222222222222224
cough: 0.22222222222222224
diagnosis: 0.22222222222222224
elevated: 0.22222222222222224
examination: 0.22222222222222224
inhaler: 0.22222222222222224
of: 0.888888888888889
patient: 0.22222222222222224
physical: 0.22222222222222224
prescribed: 0.22222222222222224
presented: 0.22222222222222224
revealed: 0.22222222222222224
shortness: 0.22222222222222224
symptoms: 0.22222222222222224
temperature: 0.22222222222222224
wheezing: 0.22222222222222224
with: 0.22222222222222224

Selected features:
['antibiotics' 'as' 'breath' 'bronchitis' 'confirmed' 'cough' 'diagnosis'
 'elevated' 'examination' 'inhaler' 'of' 'patient' 'physical' 'prescribed'
 'presented' 'revealed' 'shortness' 'symptoms' 'temperature' 'wheezing'
 'with']


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize TF-IDF vectorizer to convert text into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(medical_transcripts)

# Dummy labels for demonstration purpose (replace with actual labels)
y = [1, 0, 1]  # Example binary classification labels

# Apply correlation-based feature selection
num_features_to_select = 2  # Example number of features to select
selector = SelectKBest(score_func=f_classif, k=num_features_to_select)
X_selected = selector.fit_transform(X, y)

# Get the indices of selected features
selected_feature_indices = selector.get_support(indices=True)

# Get the names of selected features
selected_feature_names = [vectorizer.get_feature_names_out()[i] for i in selected_feature_indices]

# Print the selected feature names
print("Selected features:")
print(selected_feature_names)


Selected features:
['temperature', 'wheezing']


  f = msb / msw


In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# Example medical transcripts data
data = {
    'text': [
        "Patient presented with symptoms of fever and cough. Diagnosis revealed pneumonia.",
        "Patient reported chest pain and shortness of breath. Diagnosis confirmed myocardial infarction.",
        "Patient experienced abdominal pain and nausea. Diagnosis indicated appendicitis."
    ],
    'label': [1, 0, 1]  # Example labels (1 for positive diagnosis, 0 for negative diagnosis)
}

# Convert data to DataFrame
df = pd.DataFrame(data)

# Create a CountVectorizer to convert text data into numerical features
vectorizer = CountVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(df['text'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# Apply Lasso feature selection
lasso = Lasso(alpha=0.1)  # Set the regularization strength
lasso.fit(X_train, y_train)

# Get feature importance (coefficients) from the Lasso model
feature_importance = np.abs(lasso.coef_)

# Sort feature importance in descending order
sorted_indices = np.argsort(feature_importance)[::-1]

# Get the top 5 most important features
top_features = vectorizer.get_feature_names_out()[sorted_indices[:5]]

print("Top 5 most important features:")
for feature in top_features:
    print(feature)


Top 5 most important features:
abdominal
symptoms
and
appendicitis
breath


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Sample medical transcripts
medical_transcripts = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Dummy labels for demonstration purpose (replace with actual labels)
y = [1, 0, 1]  # Example binary classification labels

# Initialize TF-IDF vectorizer to convert text into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(medical_transcripts)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
random_forest.fit(X_train, y_train)

# Get feature importances
feature_importances = random_forest.feature_importances_

# Get the feature names from the TF-IDF vectorizer
feature_names = vectorizer.get_feature_names_out()

# Print the feature importances
print("Feature Importances:")
for feature, importance in zip(feature_names, feature_importances):
    print(f"{feature}: {importance}")


Feature Importances:
and: 0.038461538461538464
antibiotics: 0.057692307692307696
as: 0.15384615384615385
breath: 0.0
bronchitis: 0.057692307692307696
confirmed: 0.019230769230769232
cough: 0.0
diagnosis: 0.11538461538461539
elevated: 0.1346153846153846
examination: 0.07692307692307693
inhaler: 0.038461538461538464
of: 0.0
patient: 0.0
physical: 0.07692307692307693
prescribed: 0.11538461538461539
presented: 0.0
revealed: 0.057692307692307696
shortness: 0.0
symptoms: 0.0
temperature: 0.038461538461538464
wheezing: 0.019230769230769232
with: 0.0


In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector

# Example medical transcripts data
data = {
    'text': [
        "Patient presented with symptoms of fever and cough. Diagnosis revealed pneumonia.",
        "Patient reported chest pain and shortness of breath. Diagnosis confirmed myocardial infarction.",
        "Patient experienced abdominal pain and nausea. Diagnosis indicated appendicitis.",
        "Patient complained of headaches and dizziness. Diagnosis suggested migraine.",
        "Patient had difficulty breathing and chest tightness. Diagnosis suspected asthma.",
        "Patient complained of joint pain and stiffness. Diagnosis suggested rheumatoid arthritis.",
        "Patient had severe abdominal cramps and diarrhea. Diagnosis revealed gastroenteritis.",
        "Patient experienced rapid heartbeat and chest discomfort. Diagnosis indicated panic attack.",
        "Patient presented with skin rash and itching. Diagnosis revealed allergic reaction.",
        "Patient complained of fatigue and weakness. Diagnosis suggested anemia."
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # Example labels (1 for positive diagnosis, 0 for negative diagnosis)
}

# Convert data to DataFrame
df = pd.DataFrame(data)

# Create a CountVectorizer to convert text data into numerical features
vectorizer = CountVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(df['text'])
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a logistic regression model
lr = LogisticRegression()

# Initialize Sequential Feature Selector
sfs = SequentialFeatureSelector(lr,
                                k_features='best',  # Select the best subset of features
                                forward=True,       # Forward selection (start from no features and add one by one)
                                scoring='accuracy', # Use accuracy as the evaluation metric
                                cv=3)               # Reduce number of cross-validation folds

# Perform Sequential Feature Selection
sfs.fit(X_train, y_train)

# Get selected feature indices
selected_features_indices = sfs.k_feature_idx_

# Get selected feature names
selected_feature_names = [vectorizer.get_feature_names_out()[i] for i in selected_features_indices]

print("Selected Features:", selected_feature_names)


Selected Features: ['abdominal', 'complained', 'had']
