In [1]:
# =========================================
# Exploratory Data Analysis (EDA) بالعربي
# =========================================

# ===== 1️⃣ استيراد المكتبات =====
import pandas as pd
import numpy as np

# ===== 2️⃣ قراءة البيانات =====
file_path = "processed_asthma.csv"  # الملف المعالج من كودك السابق
df = pd.read_csv(file_path)

# عرض أول 5 صفوف للتعرف على البيانات
df.head()


Unnamed: 0,Patient_ID,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma,Asthma_Control_Level
0,ASTH100000,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0,
1,ASTH100001,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.6,2,297.6,22.9,0,
2,ASTH100002,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0,
3,ASTH100003,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.6,1,438.0,40.1,1,Poorly Controlled
4,ASTH100004,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0,


In [2]:
# معلومات عن البيانات
print(df.info())

# الإحصاءات الأساسية للبيانات العددية
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Patient_ID               10000 non-null  object 
 1   Age                      10000 non-null  int64  
 2   Gender                   10000 non-null  object 
 3   BMI                      10000 non-null  float64
 4   Smoking_Status           10000 non-null  object 
 5   Family_History           10000 non-null  int64  
 6   Allergies                7064 non-null   object 
 7   Air_Pollution_Level      10000 non-null  object 
 8   Physical_Activity_Level  10000 non-null  object 
 9   Occupation_Type          10000 non-null  object 
 10  Comorbidities            5033 non-null   object 
 11  Medication_Adherence     10000 non-null  float64
 12  Number_of_ER_Visits      10000 non-null  int64  
 13  Peak_Expiratory_Flow     10000 non-null  float64
 14  FeNO_Level             

Unnamed: 0,Age,BMI,Family_History,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,44.9307,25.05332,0.3034,0.497998,1.0159,400.88409,25.10142,0.2433
std,25.653559,4.874466,0.459749,0.224809,1.020564,97.531113,9.840184,0.429096
min,1.0,15.0,0.0,0.0,0.0,150.0,5.0,0.0
25%,23.0,21.6,0.0,0.32,0.0,334.8,18.2,0.0
50%,45.0,25.0,0.0,0.5,1.0,402.5,25.0,0.0
75%,67.0,28.4,1.0,0.67,2.0,468.7,31.7,0.0
max,89.0,45.0,1.0,0.99,6.0,600.0,63.9,1.0


In [3]:
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'القيم المفقودة': missing,
    'النسبة %': missing_percent
})
missing_df


Unnamed: 0,القيم المفقودة,النسبة %
Patient_ID,0,0.0
Age,0,0.0
Gender,0,0.0
BMI,0,0.0
Smoking_Status,0,0.0
Family_History,0,0.0
Allergies,2936,29.36
Air_Pollution_Level,0,0.0
Physical_Activity_Level,0,0.0
Occupation_Type,0,0.0


In [4]:
# تحديد الأعمدة العددية
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# حساب IQR والكشف عن القيم الشاذة
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

outliers = ((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).sum()
outliers_percent = (outliers / len(df)) * 100

outliers_df = pd.DataFrame({
    'القيم الشاذة': outliers,
    'النسبة %': outliers_percent
})
outliers_df


Unnamed: 0,القيم الشاذة,النسبة %
Age,0,0.0
BMI,24,0.24
Family_History,0,0.0
Medication_Adherence,0,0.0
Number_of_ER_Visits,3,0.03
Peak_Expiratory_Flow,0,0.0
FeNO_Level,41,0.41
Has_Asthma,2433,24.33


In [5]:
metrics = pd.DataFrame({
    'العمود': df.columns,
    'نسبة القيم المفقودة %': df.isnull().sum() / len(df) * 100,
    'نسبة القيم الشاذة %': [outliers_percent[col] if col in outliers_percent else 0 for col in df.columns]
})
metrics


Unnamed: 0,العمود,نسبة القيم المفقودة %,نسبة القيم الشاذة %
Patient_ID,Patient_ID,0.0,0.0
Age,Age,0.0,0.0
Gender,Gender,0.0,0.0
BMI,BMI,0.0,0.24
Smoking_Status,Smoking_Status,0.0,0.0
Family_History,Family_History,0.0,0.0
Allergies,Allergies,29.36,0.0
Air_Pollution_Level,Air_Pollution_Level,0.0,0.0
Physical_Activity_Level,Physical_Activity_Level,0.0,0.0
Occupation_Type,Occupation_Type,0.0,0.0
