In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet so that every chart
# drawn later in the notebook shares the same look.
plt.style.use('seaborn-v0_8-darkgrid')

%matplotlib inline

In [None]:
# Load the raw CSV files into dataframes.
# low_memory=False makes pandas parse each file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
# NOTE(review): px.csv is read with the default encoding while the other two
# files use 'unicode_escape' — confirm this difference is intentional.
px_df = pd.read_csv('px.csv', low_memory=False)
doctors_df = pd.read_csv('doctors.csv', low_memory=False, encoding='unicode_escape')
clinics_df = pd.read_csv('clinics.csv', low_memory=False, encoding='unicode_escape')
# appointments.csv is currently not loaded; the call is kept here for reference:
# appointments_df = pd.read_csv('appointments.csv', low_memory=False, encoding='unicode_escape')

In [None]:
# print a structural summary (columns, non-null counts, dtypes) of each
# dataframe that was loaded, in the same order they were read
for loaded_df in (px_df, doctors_df, clinics_df):
    loaded_df.info()
# appointments_df.info()

# px Dataset

1. `pxid`: unique identifier assigned to each patient.
2. `age`: age of each patient at the time of record collection.
3. `gender`: gender of each patient.

In [None]:
# preview the px dataframe as loaded, before any cleaning is applied
px_df

### `pxid` data cleaning

In [None]:
# Count pxid values that repeat an earlier pxid (the first occurrence of each
# id is not counted). This is a per-column check, not a whole-row check, so the
# label names the column to keep it distinct from the full-row duplicate count.
print("duplicate pxid values:", px_df['pxid'].duplicated().sum())

In [None]:
# collect every row that is an exact repeat of an earlier row (all columns equal)
is_repeat_row = px_df.duplicated()
duplicated_rows = px_df[is_repeat_row]
print(duplicated_rows)

In [None]:
# remove exact repeats of a row, keeping the first occurrence of each
px_df = px_df.drop_duplicates(keep='first')

In [None]:
# inspect the pxid column after the duplicate rows were dropped
px_df['pxid']

In [None]:
# confirm that no fully-duplicated rows remain
print("duplicate rows:", px_df.duplicated().sum())

pxid_col = px_df['pxid']

# count missing pxid entries
print("no. of missing values: ", pxid_col.isna().sum())

# count pxid entries whose Python type is not str
print("no. of non-string values: ", pxid_col.apply(type).ne(str).sum())

### `age` data cleaning

In [None]:
# inspect the age column before cleaning
px_df['age']

In [None]:
age_col = px_df['age']

# how many ages are missing
print("no. of missing values: ", age_col.isna().sum())

# how many ages are not Python str objects
print("no. of non-string values: ", age_col.apply(type).ne(str).sum())

# how many ages contain at least one character other than the digits 0-9
print("no. of values with special characters: ", age_col.str.contains(r'[^0-9]').sum())

In [None]:
# Rows whose age is present; kept as an explicit guard alongside na=False below.
mask = px_df['age'].notna()

# Flag ages containing any character other than 0-9 (for example a minus sign,
# a decimal point or letters). na=False makes str.contains return False for
# missing ages instead of NaN, so the combined mask is strictly boolean and
# does not rely on implicit NaN -> False coercion by the `&` operator.
has_special_chars = px_df['age'].str.contains(r'[^0-9]', na=False)
rows_with_special_chars = px_df[mask & has_special_chars]

print("Values with special characters in 'age' column:")
print(rows_with_special_chars['age'])

In [None]:
# Set to NaN every age listed in rows_with_special_chars, i.e. every value that
# contains a non-digit character. That covers negatives such as "-5".
# NOTE(review): decimals such as "12.5" would be nulled as well — confirm that
# this is intended.
flagged_index = rows_with_special_chars.index
px_df.loc[flagged_index, 'age'] = np.nan
print("Number of null values:", px_df['age'].isna().sum())

In [None]:
# Parse the age values into numbers (anything unparseable becomes NaN) and
# force the column to float64 in a single chained step.
px_df['age'] = pd.to_numeric(px_df['age'], errors='coerce').astype(float)

# report the dtype the column ended up with
print("Data type after conversion:", px_df['age'].dtype)

In [None]:
# inspect the age column after the numeric conversion
px_df['age']

In [None]:
# count entries whose Python type is not float
age_types = px_df['age'].apply(type)
print("no. of non-float values: ", age_types.ne(float).sum())

In [None]:
# confirm that no negative ages remain in the column after cleaning
print("Number of negative values after conversion:", px_df['age'].lt(0).sum())

### `gender` data cleaning

In [None]:
# inspect the gender column before cleaning
px_df['gender']

In [None]:
gender_col = px_df['gender']

# how many gender entries are missing
print("no. of missing values: ", gender_col.isna().sum())

# how many gender entries are not Python str objects
print("no. of non-string values: ", gender_col.apply(type).ne(str).sum())

In [None]:
# list the distinct values present in the gender column
gender_values = px_df['gender'].unique()
print(gender_values)

In [None]:
# show the rows whose gender cell holds the literal text 'gender'
is_header_row = px_df['gender'].eq('gender')
print(px_df[is_header_row])

In [None]:
# Remove the duplicate title row, i.e. rows whose gender cell is the literal
# text 'gender'. .copy() makes px_df an independent dataframe rather than a
# filtered selection of the previous one, so later column assignments on px_df
# do not trigger SettingWithCopyWarning.
px_df = px_df[px_df['gender'] != 'gender'].copy()

# check for unique values
print(px_df['gender'].unique())

In [None]:
# summarize columns, non-null counts and dtypes of px_df after cleaning
px_df.info()

In [None]:
# display the px dataframe after the cleaning steps above
px_df