In [1]:
# Question 1
import pandas as pd
import json


file_path = '/content/DataEngineeringQ2.json'
with open(file_path, 'r') as file:
    data = json.load(file)


df = pd.json_normalize(data)


patient_details = pd.json_normalize(data, sep='_').filter(like='patientDetails')


def calculate_missing_percentage(column):
    missing_count = patient_details[column].replace(['', ' '], pd.NA).isna().sum()
    total_count = len(patient_details)
    return round((missing_count / total_count) * 100, 2)


missing_first_name = calculate_missing_percentage('patientDetails_firstName')
missing_last_name = calculate_missing_percentage('patientDetails_lastName')
missing_dob = calculate_missing_percentage('patientDetails_birthDate')


print(f"{missing_first_name}, {missing_last_name}, {missing_dob}")


0.0, 70.97, 32.26


In [2]:
#Question 2
mode_gender = patient_details['patientDetails_gender'].mode()[0]
patient_details['patientDetails_gender'].replace('', mode_gender, inplace=True)


female_percentage = round((patient_details['patientDetails_gender'].value_counts().get('F', 0) / len(patient_details)) * 100, 2)


print(f"{female_percentage}")


32.26


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  patient_details['patientDetails_gender'].replace('', mode_gender, inplace=True)


In [3]:
#Question 3
from datetime import datetime

current_year = datetime.now().year


patient_details['patientDetails_birthDate'] = pd.to_datetime(patient_details['patientDetails_birthDate'], errors='coerce')


patient_details['age'] = patient_details['patientDetails_birthDate'].apply(lambda dob: current_year - dob.year if pd.notna(dob) else None)


def categorize_age(age):
    if age is None:
        return None
    elif age <= 12:
        return 'Child'
    elif 13 <= age <= 19:
        return 'Teen'
    elif 20 <= age <= 59:
        return 'Adult'
    else:
        return 'Senior'

patient_details['ageGroup'] = patient_details['age'].apply(categorize_age)


adult_count = patient_details['ageGroup'].value_counts().get('Adult', 0)


print(adult_count)


21


In [4]:
#Question 4
average_medicines = round(df['consultationData.medicines'].apply(len).mean(), 2)


print(average_medicines)


2.13


In [5]:
#Question 5
from collections import Counter


all_medicines = [med['medicineName'] for medicines in df['consultationData.medicines'] for med in medicines]


medicine_counts = Counter(all_medicines)
third_most_frequent = [med for med, count in medicine_counts.most_common(3)][2]


print(third_most_frequent)


C


In [6]:
# Question 6
active_count = sum(1 for medicines in df['consultationData.medicines'] for med in medicines if med['isActive'])
inactive_count = sum(1 for medicines in df['consultationData.medicines'] for med in medicines if not med['isActive'])
total_medicines = active_count + inactive_count


active_percentage = round((active_count / total_medicines) * 100, 2)
inactive_percentage = round((inactive_count / total_medicines) * 100, 2)


print(f"{active_percentage}, {inactive_percentage}")


69.7, 30.3


In [7]:
# Question 7
def is_valid_indian_phone_number(number):
    number = str(number).strip()
    if number.startswith('+91'):
        number = number[3:]
    elif number.startswith('91'):
        number = number[2:]
    return len(number) == 10 and number.isdigit() and number[0] in '6789'


df['isValidMobile'] = df['phoneNumber'].apply(is_valid_indian_phone_number)


valid_phone_count = df['isValidMobile'].sum()


print(valid_phone_count)


18


In [8]:
# Question 8
df['numMedicines'] = df['consultationData.medicines'].apply(len)


df['age'] = patient_details['age']


valid_rows = df[['numMedicines', 'age']].dropna()


pearson_correlation = round(valid_rows['numMedicines'].corr(valid_rows['age']), 2)


print(pearson_correlation)


-0.21
