- Version: v0.1
- Short description: This notebook is used to prepare and clean the group weights data.

## Install and Import

In [23]:
# Install missing modules (uncomment on a fresh environment, e.g. Colab).
# `%pip` installs into the environment of the running kernel, which `!pip` does not guarantee.
# NOTE(review): icecream is not imported anywhere in the visible cells — confirm it is still needed.
# %pip install watermark
# %pip install icecream
# %pip install loguru

In [24]:
# Import modules
import time
import warnings
from datetime import date

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from loguru import logger


In [25]:
# Google Colab handling
# Only the import is guarded: a missing `google.colab` package means the notebook
# is not running on Colab. Mounting happens in the `else` branch so that an error
# raised by `drive.mount` itself is not mistaken for "not on Colab" (which would
# silently switch ROOT_PATH to the local fallback).
try:
    from google.colab import drive
except ModuleNotFoundError:
    google_env = False
    print("Not a Google Drive Environment.")
else:
    drive.mount("/content/drive")
    google_env = True

Not a Google Drive Environment.


## Notebook Details

In [26]:
# Parameters for logging and notebook exports.
# The value is used to build the log file name and the HTML snapshot name, and
# as the `.ipynb` file name handed to nbconvert, so it must match this notebook's file name.
notebook_name = "01.02_clean_group_weights"  # only file name without extension

## Configuration

In [27]:
# Plotting: one default figure size for both matplotlib and seaborn
plt.rcParams["figure.figsize"] = (12, 8)
sns.set(font_scale=0.8, rc={"figure.figsize": (12, 8)})

# Pandas: lift every display limit so frames are shown in full
for display_option in ("max_rows", "max_columns", "width", "max_colwidth"):
    pd.set_option(f"display.{display_option}", None)

# Warnings: ignore every warning category
warnings.filterwarnings(action="ignore")

In [28]:
# Constants
# Project root: the mounted Drive folder on Colab, otherwise the parent directory.
if google_env:
    ROOT_PATH = "/content/drive/MyDrive/MADS/SIADS699_Capstone_Project/00_Environment/"
else:
    ROOT_PATH = "../"

# Sub-folders, relative to ROOT_PATH; each ends with "/" so paths can be concatenated.
PATH_DATA = "data/"
PATH_DATA_RAW = "data/raw/"
PATH_DATA_INT = "data/interim/"
PATH_DATA_PRO = "data/processed/"
PATH_LOG = "logs/"
PATH_MOD = "models/"
PATH_REP = "reports/"
PATH_FIG = "reports/figures/"
PATH_HTML = "reports/html/"

# cpu_count = os.cpu_count()
# cpu_count = cpu_count - 2  # to keep machine responsive when fitting the models

# Random seed shared by the helper functions.
seed = 42

In [29]:
# Logging
# Register the notebook's log file as a sink for messages of level INFO and above.
# `logger.add` returns the id of the new handler. Keeping that id lets a re-run of
# this cell remove the previous file sink first; otherwise every re-run adds one
# more sink for the same file and each message is written to it several times.
try:
    logger.remove(log_handler_id)
except (NameError, ValueError):
    # NameError: first run in this kernel, nothing registered yet.
    # ValueError: the handler id is no longer active (already removed elsewhere).
    pass
log_handler_id = logger.add(f"{ROOT_PATH}{PATH_LOG}{notebook_name}.log", level="INFO")

# Examples
# logger.debug("This is a debug message")
# logger.info("This is an info message")
# logger.warning("This is a warning message")
# logger.error("This is an error message")

2

## Helper Functions

In [30]:
def sample_dataframe(df, fraction, random_state=None):
    """Return a random sample of the rows of a dataframe.

    Args:
        df: DataFrame to sample from; it is not modified.
        fraction: Share of rows to draw, passed to ``DataFrame.sample`` as ``frac``.
        random_state: Seed for the sampling. When omitted, the notebook-level
            ``seed`` constant is used, which is what callers got before this
            parameter existed.

    Returns:
        The sampled rows as a new DataFrame with a fresh 0..n-1 index.
    """
    if random_state is None:
        # Fall back to the notebook-wide seed; only looked up when needed.
        random_state = seed
    return df.sample(frac=fraction, random_state=random_state).reset_index(drop=True)


def calculate_duration(start_time, end_time):
    """Return the time elapsed between two timestamps, in minutes.

    Both arguments are expected in seconds (e.g. values from ``time.time()``);
    the result is rounded to two decimals.
    """
    elapsed_seconds = end_time - start_time
    elapsed_minutes = elapsed_seconds / 60
    return round(elapsed_minutes, 2)


def load_data(path):
    """Read the parquet file at ``path`` (relative to ``ROOT_PATH``) into a dataframe."""
    full_path = ROOT_PATH + path
    return pd.read_parquet(full_path)

## Load Data

In [31]:
logger.info("Start.")  # marks the beginning of the run in the log
t_start = time.time()  # start timestamp in seconds; not read again in the visible cells

[32m2023-11-04 19:50:49.767[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStart.[0m


In [32]:
# Load the raw group weights from the raw-data folder
file = "group_weights.csv"
raw_csv_path = ROOT_PATH + PATH_DATA_RAW + file
df = pd.read_csv(raw_csv_path)

## Main Part

### Checking df basics

In [33]:
# Preview the first rows to check that the CSV was parsed into the expected columns
df.head()

Unnamed: 0,Donor ID,Weight
0,H14.09.001,7.84
1,H14.09.002,20.6
2,H14.09.003,7.84
3,H14.09.004,35.33
4,H14.09.005,8.81


In [34]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Donor ID  107 non-null    object 
 1   Weight    107 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.8+ KB


In [35]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,Weight
count,107.0
mean,37.108692
std,38.707393
min,3.87
25%,10.31
50%,23.21
75%,51.35
max,232.77


In [36]:
# Summary of the string columns (count, number of unique values, most frequent value)
df.describe(include="object")

Unnamed: 0,Donor ID
count,107
unique,107
top,H14.09.001
freq,1


### Missing Values

In [37]:
# Count missing values per column
df.isna().sum()

Donor ID    0
Weight      0
dtype: int64

### Rename columns

In [38]:

# Rename to be consistent with donor information dataset.
# `rename` returns a new frame; reassigning it avoids mutating the frame in place.
df = df.rename(columns={"Donor ID": "name", "Weight": "group_weight"})

# `rename` silently ignores labels it cannot find, so verify the result explicitly.
# The check also passes when this cell is re-run on the already renamed frame.
missing_columns = {"name", "group_weight"} - set(df.columns)
assert not missing_columns, f"columns missing after rename: {missing_columns}"

df.head()

Unnamed: 0,name,group_weight
0,H14.09.001,7.84
1,H14.09.002,20.6
2,H14.09.003,7.84
3,H14.09.004,35.33
4,H14.09.005,8.81


## Exports

In [39]:
# Final check of the schema that is about to be exported
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          107 non-null    object 
 1   group_weight  107 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.8+ KB


In [40]:
# Export the cleaned frame to the interim data folder as CSV and parquet.
file = "group_weights_cleaned"
export_path = ROOT_PATH + PATH_DATA_INT + file

# index=False: the RangeIndex carries no information, and writing it would add an
# unnamed first column to the CSV that reads back as "Unnamed: 0".
# NOTE(review): confirm that no downstream reader relies on `index_col=0` for this CSV.
df.to_csv(f"{export_path}.csv", index=False)
df.to_parquet(f"{export_path}.parquet")

## Watermark

In [41]:
# Load the watermark extension, which provides the %watermark magic used below
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [42]:
# Record the timestamp, Python/IPython versions and machine details of this run
%watermark

Last updated: 2023-11-04T19:50:49.817676+01:00

Python implementation: CPython
Python version       : 3.9.16
IPython version      : 8.16.1

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 23.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit



In [43]:
# Record the versions of the modules imported in this notebook
# NOTE(review): loguru does not appear in the output — presumably because only `logger` is imported from it; confirm
%watermark --iversions

seaborn   : 0.13.0
matplotlib: 3.8.0
pandas    : 2.1.1



-----


## Snapshot

In [44]:
today = date.today()
output_file = f"{ROOT_PATH}{PATH_HTML}{today}_{notebook_name}.html"
input_file = f"{notebook_name}.ipynb"
print(input_file)
!jupyter nbconvert --to html {input_file} --output {output_file}

01.02_clean_group_weights.ipynb


[NbConvertApp] Converting notebook 01.02_clean_group_weights.ipynb to html
[NbConvertApp] ERROR | Notebook JSON is invalid: Additional properties are not allowed ('id' was unexpected)

Failed validating 'additionalProperties' in code_cell:

On instance['cells'][3]:
{'cell_type': 'code',
 'execution_count': 2,
 'id': '0c142c95',
 'metadata': {},
 'outputs': ['...0 outputs...'],
 'source': '# Import modules\n'
           'import time\n'
           'from datetime impor...'}
[NbConvertApp] Writing 309202 bytes to ../reports/html/2023-11-04_01.02_clean_group_weights.html
