In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Show every column when a DataFrame is rendered (no truncation of wide frames)
pd.options.display.max_columns = None

In [3]:
# Load the Excel workbook (not a CSV) into the raw DataFrame
df = pd.read_excel(io='Model Database for Share (2).xlsx')

In [4]:
# Build the cleaned visit-level frame `bp` from the raw sheet `df`:
#   1. Name the unlabeled first column 'Subject' and store it as a category.
#   2. Forward-fill 'Subject' so every row carries the ID of the most recent
#      non-null 'Subject' above it in the sheet.
#      This must happen BEFORE rows are filtered: if the row holding a
#      subject's ID has no 'Date of Visit', filtering first would discard the
#      ID and the subject's later visits would inherit the previous subject's ID.
#   3. Keep only rows that have a 'Date of Visit'.
#   4. Drop the 'Last 4' and 'DOB' columns.
#   5. Drop any column whose header itself is NaN.
# The whole pipeline is a single chain without in-place mutation, so the cell
# is idempotent on re-run.
bp = (
    df
    .rename(columns={'Unnamed: 0': 'Subject'})
    .astype({'Subject': 'category'})
    .assign(Subject=lambda x: x['Subject'].ffill())
    .dropna(subset=['Date of Visit'])
    .drop(columns=['Last 4', 'DOB'])
    # Boolean column mask: keep only columns whose label is not NaN
    .loc[:, lambda x: ~x.columns.isna()]
)

In [5]:
# Normalize every header: cast to str, trim surrounding whitespace,
# lowercase, and replace spaces with underscores
bp.columns = bp.columns.map(
    lambda header: str(header).strip().lower().replace(' ', '_')
)

In [6]:
# Replace the verbose medications header with a short name
bp.rename(
    {'medications_(gluc,_bp,_lipid,_statin)': 'medications'},
    axis='columns',
    inplace=True,
)

In [7]:
# Checking initial dtypes, totals, and NaNs:
# prints the index range, each column's non-null count and dtype
bp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 688 entries, 0 to 1168
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   subject           688 non-null    category      
 1   date_of_visit     688 non-null    datetime64[ns]
 2   age               653 non-null    float64       
 3   sex               459 non-null    object        
 4   wt                682 non-null    float64       
 5   sbp               675 non-null    float64       
 6   dbp               675 non-null    object        
 7   bmi               640 non-null    object        
 8   obese             471 non-null    object        
 9   pre-dm            464 non-null    object        
 10  dm_t2             464 non-null    object        
 11  a1c               525 non-null    object        
 12  trig              483 non-null    float64       
 13  hdl               483 non-null    object        
 14  total_chol        483 non-null

In [8]:
# Counting unique subjects: number of distinct values in the 'subject' column
# (Series.nunique excludes NaN by default)
bp['subject'].nunique()

470

In [9]:
# Preview the first rows of the cleaned frame
bp.head()

Unnamed: 0,subject,date_of_visit,age,sex,wt,sbp,dbp,bmi,obese,pre-dm,dm_t2,a1c,trig,hdl,total_chol,ldl,medications,cgm,able_to_adhere,improved_energy?,hungry?,waist
0,1.0,2023-03-27,123.0,Male,264.0,129.0,78.0,39.07,YES,NO,YES,6.4,187.0,54.0,113.0,22.1,"Atorvastatin, Losartan, Metformin,",,,,,
1,1.0,2023-06-08,123.0,,258.0,116.0,62.0,,,,,6.3,,,,,No med changes,,,,,
2,1.0,2023-10-17,,,,,,,,,,6.2,,,,,No med changes,,,,,
3,1.0,2023-11-14,123.0,,265.0,130.0,78.0,39.2,,,,,,,,,,,,,,
4,2.0,2023-03-28,123.0,Male,233.0,130.0,89.0,31.67,YES,NO,YES,6.7,63.0,49.0,146.0,84.6,"Lisinopril, alogliptin, empagliflozin, hctz/tr...",,,,,


In [10]:
# Handle columns considered irrelevant because of missing data or sample size:
# remove them and keep the result as a new frame (`bp` itself is not modified)
bpp = bp.drop(
    ['age', 'sex', 'able_to_adhere', 'improved_energy?', 'hungry?', 'waist'],
    axis='columns',
)


In [11]:
# Write the reduced frame to 'bp.csv' with a header row and without the index
bpp.to_csv(path_or_buf='bp.csv', header=True, index=False)

In [12]:
# Preview the first rows of the reduced frame
bpp.head()

Unnamed: 0,subject,date_of_visit,wt,sbp,dbp,bmi,obese,pre-dm,dm_t2,a1c,trig,hdl,total_chol,ldl,medications,cgm
0,1.0,2023-03-27,264.0,129.0,78.0,39.07,YES,NO,YES,6.4,187.0,54.0,113.0,22.1,"Atorvastatin, Losartan, Metformin,",
1,1.0,2023-06-08,258.0,116.0,62.0,,,,,6.3,,,,,No med changes,
2,1.0,2023-10-17,,,,,,,,6.2,,,,,No med changes,
3,1.0,2023-11-14,265.0,130.0,78.0,39.2,,,,,,,,,,
4,2.0,2023-03-28,233.0,130.0,89.0,31.67,YES,NO,YES,6.7,63.0,49.0,146.0,84.6,"Lisinopril, alogliptin, empagliflozin, hctz/tr...",
