In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('mimic.csv')


In [3]:
# Basic information about the dataset
print("Shape of the dataset:", df.shape)
print("\nData Types:\n", df.dtypes)


Shape of the dataset: (167034, 10)

Data Types:
 patient_id            int64
patient_uid          object
PMID                  int64
file_path            object
title                object
patient              object
age                  object
gender               object
relevant_articles    object
similar_patients     object
dtype: object


In [4]:

# Checking for missing values
missing_values = df.isnull().sum()
print("\nMissing Values:\n", missing_values)



Missing Values:
 patient_id           0
patient_uid          0
PMID                 0
file_path            0
title                0
patient              0
age                  0
gender               0
relevant_articles    0
similar_patients     0
dtype: int64


In [5]:
# Calculate the length of each text entry without adding columns
title_lengths = df['title'].apply(len)
question_lengths = df['patient'].apply(len)

# Create a DataFrame for descriptive statistics
lengths_df = pd.DataFrame({
    'Title_Length': title_lengths,
    'patient_info_Length': question_lengths,
})

# Descriptive statistics
descriptive_stats = lengths_df.describe()
descriptive_stats

Unnamed: 0,Title_Length,patient_info_Length
count,167034.0,167034.0
mean,96.153071,2765.333022
std,34.584621,1659.310912
min,7.0,55.0
25%,71.0,1645.0
50%,92.0,2491.0
75%,117.0,3529.0
max,377.0,95400.0


In [6]:
import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to clean text
def data_preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to each relevant column
for column in ['title', 'patient']:
    df[column] = df[column].apply(data_preprocess)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrinivas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Display Pre-process data
print("Pre-process Data:")


Pre-process Data:


In [8]:
df[['title','patient','age','gender']].head(5)


Unnamed: 0,title,patient,age,gender
0,early physical therapist interventions for pat...,this yearold male was hospitalized due to mode...,"[[60.0, 'year']]",M
1,early physical therapist interventions for pat...,a yearold man was hospitalized due to an incre...,"[[39.0, 'year']]",M
2,early physical therapist interventions for pat...,one week after a positive covid result this ye...,"[[57.0, 'year']]",M
3,early physical therapist interventions for pat...,this yearold male was admitted to the icu afte...,"[[69.0, 'year']]",M
4,early physical therapist interventions for pat...,this yearold male was admitted to the icu with...,"[[57.0, 'year']]",M


In [9]:
df.to_csv('mimic_preprocess.csv', columns=['title','patient','age','gender'], index=False)