<a href="https://colab.research.google.com/github/aako-aakash/Machine-Learning-Project-/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the dataset
file_path = 'patient_data.csv'  # Update the path as needed
df = pd.read_csv(file_path)

# Display basic info and first few rows to understand the data
print("Dataset Information:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Data preprocessing
# Clean column names (remove leading/trailing spaces)
df.columns = df.columns.str.strip()

# Convert categorical variables to a consistent format (strip spaces, upper-case)
for col in ['C', 'Age', 'History', 'Patient', 'TakeMedication', 'Severity',
            'BreathShortness', 'VisualChanges', 'NoseBleeding', 'Whendiagnoused',
            'ControlledDiet', 'Stages']:
    if col in df.columns:
        normalised = df[col].astype(str).str.strip().str.upper()
        # astype(str) turns missing values into the literal text "NAN"; put the
        # original missing markers back so a later isnull() check still sees them.
        df[col] = normalised.where(df[col].notna())

# Map blood pressure ranges to numeric averages or categories if needed
# e.g. Systolic "111 - 120" to average 115.5 or keep categories for modeling
def bp_to_avg(bp_str):
    """Convert a blood-pressure label into a single number.

    Args:
        bp_str: Text such as "111 - 120" (a low-high range) or "130+"
            (an open-ended bound).

    Returns:
        The midpoint of the two bounds when the label splits into exactly
        two parts on '-', otherwise the integer left after removing '+'.
        None when the value cannot be parsed, including non-string input
        such as None or NaN.
    """
    try:
        parts = bp_str.split('-')
        if len(parts) == 2:
            low, high = (int(part.strip()) for part in parts)
            return (low + high) / 2
        # For ranges like "130+" or "100+", return the lower bound
        return int(bp_str.replace('+', '').strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a text string;
        # ValueError: a part is not an integer. Anything else (e.g.
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return None

# Derive one numeric value per blood-pressure label; labels that cannot be
# parsed come back as None from bp_to_avg.
df['Systolic_Avg'] = df['Systolic'].apply(bp_to_avg)
df['Diastolic_Avg'] = df['Diastolic'].apply(bp_to_avg)

# Example: Convert Severity and Stages to categorical codes for ML
# Values that are not a key of a mapping become NaN and are therefore counted
# in the missing-value report below.
# 'SEVER' is kept as a key (presumably the spelling used in the raw data --
# TODO confirm); the correctly spelt 'SEVERE' is accepted as an alias so it is
# not silently turned into NaN.
severity_mapping = {'MILD': 1, 'MODERATE': 2, 'SEVER': 3, 'SEVERE': 3}
df['Severity_Code'] = df['Severity'].map(severity_mapping)
stage_mapping = {'NORMAL': 0, 'HYPERTENSION (STAGE-1)': 1, 'HYPERTENSION (STAGE-2)': 2, 'HYPERTENSIVE CRISIS': 3}
df['Stages_Code'] = df['Stages'].map(stage_mapping)

# Basic checks for missing data
print("\nMissing values in each column:")
print(df.isnull().sum())

# Save processed data for ML modeling
processed_file_path = 'processed_patient_data.csv'
df.to_csv(processed_file_path, index=False)

print(f"\nProcessed data saved to {processed_file_path}")

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1825 entries, 0 to 1824
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   C                1825 non-null   object
 1   Age              1825 non-null   object
 2   History          1825 non-null   object
 3   Patient          1825 non-null   object
 4   TakeMedication   1825 non-null   object
 5   Severity         1825 non-null   object
 6   BreathShortness  1825 non-null   object
 7   VisualChanges    1825 non-null   object
 8   NoseBleeding     1825 non-null   object
 9   Whendiagnoused   1825 non-null   object
 10  Systolic         1825 non-null   object
 11  Diastolic        1825 non-null   object
 12  ControlledDiet   1825 non-null   object
 13  Stages           1825 non-null   object
dtypes: object(14)
memory usage: 199.7+ KB
None

First 5 rows of the dataset:
        C    Age History Patient TakeMedication Severity BreathShortness  