In [1]:
import pandas as pd 
import numpy as np 
import os 

In [2]:
# Reading in CSV for cPRA values according to Medical Center and Organ
cPRA = pd.read_csv('../Resources/CPRA.csv')
cPRA.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,All Organs,Kidney,Liver,Pancreas,Kidney / Pancreas,Heart,Lung,Heart / Lung,Intestine,Abdominal Wall,Head & Neck: Craniofacial,GU: Uterus,Upper Limb: Bilateral,Upper Limb: Unilateral,"Face, Scalp"
0,All Centers,All CPRA,,107888,91242,11896,897.0,1702,3515,983.0,48.0,212.0,3.0,2.0,6.0,3.0,3.0,1.0
1,,0,,56345,55446,0,550.0,993,0,0.0,0.0,0.0,2.0,2.0,3.0,2.0,0.0,1.0
2,,1-19,,11339,11181,0,68.0,197,0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
3,,20-79,,15992,15737,0,134.0,309,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
4,,80-97,,5627,5522,0,70.0,107,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Dropping columns of organs we are not including
dropped_columns= cPRA.drop(columns=['Unnamed: 2','Lung', 'Heart / Lung',
                   'Intestine', 'Abdominal Wall', 'Head & Neck: Craniofacial', 
                   'GU: Uterus', 'Upper Limb: Bilateral', 'Upper Limb: Unilateral', 'Face, Scalp'])
dropped_columns.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,All Organs,Kidney,Liver,Pancreas,Kidney / Pancreas,Heart
0,All Centers,All CPRA,107888,91242,11896,897.0,1702,3515
1,,0,56345,55446,0,550.0,993,0
2,,1-19,11339,11181,0,68.0,197,0
3,,20-79,15992,15737,0,134.0,309,0
4,,80-97,5627,5522,0,70.0,107,0


In [4]:
# Renaming Unnamed columns
renamed_df = dropped_columns.rename(columns={'Unnamed: 0':'Med. Center','Unnamed: 1':'cPRA Range'})

renamed_df

Unnamed: 0,Med. Center,cPRA Range,All Organs,Kidney,Liver,Pancreas,Kidney / Pancreas,Heart
0,All Centers,All CPRA,107888,91242,11896,897.0,1702,3515
1,,0,56345,55446,0,550.0,993,0
2,,1-19,11339,11181,0,68.0,197,0
3,,20-79,15992,15737,0,134.0,309,0
4,,80-97,5627,5522,0,70.0,107,0
...,...,...,...,...,...,...,...,...
1642,,1-19,10,10,0,0.0,0,0
1643,,20-79,10,10,0,0.0,0,0
1644,,80-97,1,1,0,0.0,0,0
1645,,98-100,1,1,0,0.0,0,0


In [5]:
# Filling in NaN values in Medical Center column and removing All Organs column
df = renamed_df.copy()
df = pd.DataFrame(df)
df['Med. Center'] = df['Med. Center'].fillna(method='ffill')

not_all_organs= df.drop(columns='All Organs', axis=1)
not_all_organs.head()

Unnamed: 0,Med. Center,cPRA Range,Kidney,Liver,Pancreas,Kidney / Pancreas,Heart
0,All Centers,All CPRA,91242,11896,897.0,1702,3515
1,All Centers,0,55446,0,550.0,993,0
2,All Centers,1-19,11181,0,68.0,197,0
3,All Centers,20-79,15737,0,134.0,309,0
4,All Centers,80-97,5522,0,70.0,107,0


In [6]:
# Filtering out the All Center index 
not_all_centers = ['All Centers']

#everthing but all center data 
df = not_all_organs[not_all_organs['Med. Center'].isin(not_all_centers) == False]

df.head()

Unnamed: 0,Med. Center,cPRA Range,Kidney,Liver,Pancreas,Kidney / Pancreas,Heart
7,ALCH-TX1 Children's of Alabama,All CPRA,15,2,0.0,0,6
8,ALCH-TX1 Children's of Alabama,0,9,0,0.0,0,0
9,ALCH-TX1 Children's of Alabama,20-79,1,0,0.0,0,0
10,ALCH-TX1 Children's of Alabama,98-100,5,0,0.0,0,0
11,ALCH-TX1 Children's of Alabama,,0,2,0.0,0,6


In [7]:
# Replacing 'All CPRA' string in cPRA Range column with 'Total' string.
df['cPRA Range']= df['cPRA Range'].str.replace('All CPRA','Total')
# Dropping null values in cPRA Range column
df = df[df['cPRA Range'].notna()]

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cPRA Range']= df['cPRA Range'].str.replace('All CPRA','Total')


Unnamed: 0,Med. Center,cPRA Range,Kidney,Liver,Pancreas,Kidney / Pancreas,Heart
7,ALCH-TX1 Children's of Alabama,Total,15,2,0.0,0,6
8,ALCH-TX1 Children's of Alabama,0,9,0,0.0,0,0
9,ALCH-TX1 Children's of Alabama,20-79,1,0,0.0,0,0
10,ALCH-TX1 Children's of Alabama,98-100,5,0,0.0,0,0
12,ALUA-TX1 University of Alabama Hospital,Total,1031,102,5.0,9,15


In [8]:
df.dtypes

Med. Center           object
cPRA Range            object
Kidney                object
Liver                 object
Pancreas             float64
Kidney / Pancreas     object
Heart                 object
dtype: object

In [9]:
# Changing floats to integers for easier parsing in downstream applications
df = df.astype({"cPRA Range": str, "Kidney": str,
                    'Med. Center': str, 'Liver':int,
                    'Pancreas': int, 'Kidney / Pancreas': int, 'Heart':int})
# df.dtypes


In [139]:
# Exporting DataFrame to a CSV
df.to_csv(r'clean_cPRA.csv')