In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv("patients.csv")


In [7]:
import pandas as pd
import numpy as np

df = pd.read_csv("patients.csv")

# Clean column names
df.columns = df.columns.str.strip()

# Drop empty column
df.drop(columns=['Unnamed: 6'], inplace=True)

# Add dummy dates (for analysis)
df['admission_date'] = pd.to_datetime('2024-01-01')
df['discharge_date'] = df['admission_date'] + pd.to_timedelta(5, unit='D')

print(df.head())



       patient_id           patient_name              age           gender  \
0         disease         treatment_type   admission_date   discharge_date   
1  treatment_cost   waiting_time_minutes              NaN              NaN   

   department  doctor admission_date discharge_date  
0         NaN     NaN     2024-01-01     2024-01-06  
1         NaN     NaN     2024-01-01     2024-01-06  


In [9]:
# Analyze patient recovery time trends
df['stay_days'] = (df['discharge_date'] - df['admission_date']).dt.days
recovery_trend = (
    df.groupby(df['admission_date'].dt.to_period('M'))['stay_days']
    .mean()
    .reset_index()
)

print(recovery_trend)



  admission_date  stay_days
0        2024-01        5.0


In [13]:
# Calculate average stay duration per disease or treatment type
import pandas as pd
import numpy as np

# -------------------------------
# 1. Load data
# -------------------------------
df = pd.read_csv("patients.csv")

# -------------------------------
# 2. Clean column names
# -------------------------------
df.columns = df.columns.str.strip()

# Remove unwanted column if present
if 'Unnamed: 6' in df.columns:
    df.drop(columns=['Unnamed: 6'], inplace=True)

# -------------------------------
# 3. Create missing columns (for analysis)
# -------------------------------

# Admission & discharge dates
df['admission_date'] = pd.to_datetime('2024-01-01') + pd.to_timedelta(
    np.random.randint(0, 30, size=len(df)), unit='D'
)

df['discharge_date'] = df['admission_date'] + pd.to_timedelta(
    np.random.randint(1, 15, size=len(df)), unit='D'
)

# Length of stay
df['stay_days'] = (df['discharge_date'] - df['admission_date']).dt.days

# Disease column
df['disease'] = np.random.choice(
    ['Flu', 'Diabetes', 'Heart Disease', 'Infection', 'Covid'],
    size=len(df)
)

# Treatment type column
df['treatment_type'] = np.random.choice(
    ['Medication', 'Surgery', 'Therapy'],
    size=len(df)
)

# -------------------------------
# 4. Analysis
# -------------------------------

# Average stay per disease
avg_stay_disease = (
    df.groupby('disease')['stay_days']
    .mean()
    .sort_values(ascending=False)
)

# Average stay per treatment type
avg_stay_treatment = (
    df.groupby('treatment_type')['stay_days']
    .mean()
    .sort_values(ascending=False)
)

# -------------------------------
# 5. Output
# -------------------------------
print("\nAverage Stay (Days) per Disease:\n")
print(avg_stay_disease)

print("\nAverage Stay (Days) per Treatment Type:\n")
print(avg_stay_treatment)





Average Stay (Days) per Disease:

disease
Diabetes         4.0
Heart Disease    3.0
Name: stay_days, dtype: float64

Average Stay (Days) per Treatment Type:

treatment_type
Therapy       4.0
Medication    3.0
Name: stay_days, dtype: float64


In [14]:
# Detect seasonal spikes in admissions
df['month'] = df['admission_date'].dt.month

monthly_admissions = (
    df.groupby('month')['patient_id']
    .count()
    .reset_index(name='admissions')
)

print(monthly_admissions)


   month  admissions
0      1           2


In [17]:
# Classify patients by age group and recovery rate
# Convert age to numeric
df['age'] = pd.to_numeric(df['age'], errors='coerce')

# Create age groups
bins = [0, 18, 35, 50, 65, 100]
labels = ['Child', 'Young Adult', 'Adult', 'Middle Aged', 'Senior']

df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

# Average stay by age group
age_recovery = (
    df.groupby('age_group')['stay_days']
    .mean()
    .reset_index()
)

print(age_recovery)




     age_group  stay_days
0        Child        NaN
1  Young Adult        NaN
2        Adult        NaN
3  Middle Aged        NaN
4       Senior        NaN


  df.groupby('age_group')['stay_days']


In [19]:
# Perform department-wise treatment statistics and cost analysis
# Create treatment cost if missing
if 'treatment_cost' not in df.columns:
    df['treatment_cost'] = np.random.randint(500, 5000, size=len(df))

# Department-wise statistics
dept_stats = (
    df.groupby('department')
    .agg(
        total_patients=('patient_id', 'count'),
        avg_stay_days=('stay_days', 'mean'),
        avg_cost=('treatment_cost', 'mean'),
        total_cost=('treatment_cost', 'sum')
    )
    .reset_index()
)

print(dept_stats)



Empty DataFrame
Columns: [department, total_patients, avg_stay_days, avg_cost, total_cost]
Index: []


In [20]:
# Analyze gender-wise and age-wise recovery trends
gender_recovery = (
    df.groupby('gender')['stay_days']
    .mean()
    .reset_index()
)

print(gender_recovery)


            gender  stay_days
0   discharge_date        3.0


In [21]:
gender_age_recovery = (
    df.groupby(['gender', 'age_group'])['stay_days']
    .mean()
    .reset_index()
)

print(gender_age_recovery)


            gender    age_group  stay_days
0   discharge_date        Child        NaN
1   discharge_date  Young Adult        NaN
2   discharge_date        Adult        NaN
3   discharge_date  Middle Aged        NaN
4   discharge_date       Senior        NaN


  df.groupby(['gender', 'age_group'])['stay_days']
