In [1]:
import pandas as pd
import numpy as np

# Load the raw complaint/diagnosis records; the path is resolved relative to
# the notebook's working directory.
df = pd.read_csv('medico data.csv')

# First look: overall size, column names and the leading rows.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Dataset shape: (627, 4)
Columns: ['Gender', 'Age Group', 'Complaint', 'Diagnosis']
  Gender Age Group                Complaint  \
0      M     10•19          Headache, Fever   
1      F       20>                Toothache   
2      M     10•19  Painful Swelling, Fever   
3      M     10•19                    Fever   
4      F       20>      Headache, Body Pain   

                                    Diagnosis  
0                                     Malaria  
1                               Dental Cavity  
2  Malaria, Upper Respiratory Tract Infection  
3                                     Malaria  
4                                     Malaria  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627 entries, 0 to 626
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Gender     626 non-null    object
 1   Age Group  625 non-null    object
 2   Complaint  627 non-null    object
 3   Diagnosis  627 non-null    object
dtypes: 

In [2]:
# Headline statistics for the loaded data, emitted as one newline-separated
# block: record count, distinct complaint/diagnosis values and gender counts.
print(
    "\nDataset Overview:",
    f"Total records: {len(df)}",
    f"Unique complaints: {df['Complaint'].nunique()}",
    f"Unique diagnoses: {df['Diagnosis'].nunique()}",
    f"Gender distribution:\n{df['Gender'].value_counts()}",
    sep="\n",
)


Dataset Overview:
Total records: 627
Unique complaints: 89
Unique diagnoses: 67
Gender distribution:
Gender
M    373
F    253
Name: count, dtype: int64


## Data Cleaning

In [3]:
# Only rows missing the text fields the analysis is built on are unusable.
# A blanket dropna() would also discard rows whose sole gap is in 'Gender'
# (a later cell fills those with 'Unknown') or in 'Age Group' (a later cell
# removes that column altogether), losing records for no benefit.
df = df.dropna(subset=['Complaint', 'Diagnosis'])

In [4]:
# Remove the 'Age Group' column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing column.
df = df.drop(columns=['Age Group'], errors='ignore')

In [5]:
# A comma inside the Diagnosis text marks a row that lists several diagnoses.
multiple_diagnoses = df.loc[df['Diagnosis'].str.contains(',', na=False)]
print(f"Entries with multiple diagnoses: {len(multiple_diagnoses)}")

# The primary diagnosis is the first comma-separated entry, whitespace-trimmed.
df['Primary_Diagnosis'] = (
    df['Diagnosis']
    .str.split(',')
    .str.get(0)
    .str.strip()
)

# Give any missing gender an explicit 'Unknown' label instead of NaN.
df['Gender'] = df['Gender'].where(df['Gender'].notna(), 'Unknown')

Entries with multiple diagnoses: 62


## Text Preprocessing

In [6]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Symptom words that must never be filtered out as stopwords.
_PROTECTED_SYMPTOM_WORDS = frozenset({
    'pain', 'ache', 'swelling', 'fever', 'headache', 'weakness'
})

# Lazily populated cache holding the stopword set and the stemmer so they are
# built once instead of once per symptom of every row.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return ``(stopword_set, stemmer)``, constructing them on first use only."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stopwords'] = (
            set(stopwords.words('english')) - _PROTECTED_SYMPTOM_WORDS
        )
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stopwords'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise a free-text complaint into a space-separated string of stems.

    Steps: lowercase, replace every character that is not a letter, whitespace
    or comma with a space, split on commas into individual symptoms, tokenize
    each symptom, drop English stopwords (except the protected symptom words)
    and stem the remaining tokens.

    Parameters
    ----------
    text : object
        Raw complaint value. Anything ``pd.isna`` reports as missing yields
        an empty string; other values are converted with ``str()``.

    Returns
    -------
    str
        Stemmed tokens of all symptoms joined by single spaces, or ``""`` when
        the input is missing or contains no usable token.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # Substitute a space (rather than deleting) so that words separated only by
    # punctuation or digits, e.g. "cough/catarrh", are not glued into one token.
    text = re.sub(r'[^a-zA-Z\s,]', ' ', text)

    # Commas separate individual symptoms; ignore empty fragments.
    symptoms = [fragment.strip() for fragment in text.split(',')]
    symptoms = [fragment for fragment in symptoms if fragment]
    if not symptoms:
        return ""

    stop_words, stemmer = _get_text_tools()

    processed_symptoms = []
    for symptom in symptoms:
        stems = [
            stemmer.stem(token)
            for token in word_tokenize(symptom)
            if token not in stop_words
        ]
        # A symptom made up solely of stopwords contributes nothing; skipping it
        # avoids stray double/leading spaces in the joined result.
        if stems:
            processed_symptoms.append(' '.join(stems))

    return ' '.join(processed_symptoms)

# Derive the normalised symptom text from each row's Complaint value.
df['cleaned_symptoms'] = df['Complaint'].map(preprocess_text)

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the 20 most frequent primary diagnoses, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(15, 8))
top_diagnoses = df['Primary_Diagnosis'].value_counts().head(20)
top_diagnoses.plot(kind='bar', ax=ax)
ax.set_title('Top 20 Most Common Primary Diagnoses')
ax.set_xlabel('Diagnosis')
ax.set_ylabel('Frequency')
# Tilt the diagnosis names and right-align them under their bars.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Report the single most frequent primary diagnosis and its number of cases.
print(f"Most common diagnosis: {df['Primary_Diagnosis'].value_counts().index[0]}")
print(f"Occurs in: {df['Primary_Diagnosis'].value_counts().iloc[0]} cases")

# Side-by-side pie charts: age group (left) and gender (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Age group distribution. An earlier cleaning cell drops 'Age Group' from df,
# so indexing it unconditionally raises KeyError and aborts the whole cell;
# draw the pie only when the column is actually present.
ax1.set_title('Distribution by Age Group')
if 'Age Group' in df.columns:
    df['Age Group'].value_counts().plot(kind='pie', ax=ax1, autopct='%1.1f%%')
else:
    ax1.text(0.5, 0.5, "'Age Group' column not available",
             ha='center', va='center')
    ax1.set_axis_off()

# Gender distribution
df['Gender'].value_counts().plot(kind='pie', ax=ax2, autopct='%1.1f%%')
ax2.set_title('Distribution by Gender')

plt.tight_layout()
plt.show()

# Word cloud over the raw complaint text: every non-missing complaint is
# converted to a string and concatenated with single spaces.
from wordcloud import WordCloud
all_symptoms = df['Complaint'].dropna().astype(str).str.cat(sep=' ')
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_symptoms)

# Render the cloud as an image on its own figure, without axis decorations.
cloud_ax = plt.figure(figsize=(12, 6)).add_subplot()
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Common Symptoms Word Cloud')
plt.show()

# Count how often each (complaint, primary diagnosis) pair occurs and show the
# ten most frequent pairs.
print("\nTop symptom-diagnosis combinations:")
symptom_diagnosis = (
    df.groupby(['Complaint', 'Primary_Diagnosis'])
    .size()
    .rename('count')
    .reset_index()
)
print(symptom_diagnosis.sort_values(by='count', ascending=False).head(10))

ImportError: cannot import name 'int' from 'numpy' (C:\Users\Oluwafolabomi Zion\anaconda3\lib\site-packages\numpy\__init__.py)

In [22]:
!pip install seaborn

