In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-wide display settings.
# Show every float in pandas output with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)
# Default font size for all matplotlib text.
plt.rcParams.update({'font.size': 25})

# Table of Contents

- [Summarize data](#Summarize-data)
- [Handle missing values](#Handle-missing-values)
- [Check for duplicates](#Check-for-duplicates)
- [Plot box plot to detect outliers](#Plot-box-plot-to-detect-outliers)
- [One-hot encode](#One-hot-encode)

# Summarize data

In [None]:
# Load the data set, skipping the column named 'Unnamed: 0' if the CSV has one.
df = pd.read_csv(
    'NCDataChallenge_2021_v1.csv',
    usecols=lambda name: name != 'Unnamed: 0',
)

# Every column whose dtype is not `object` is treated as numeric here.
numeric_columns = df.select_dtypes(exclude=['object']).columns
numeric_part = df[numeric_columns]

# Summary table: one row per numeric column, one column per statistic
# (min, max, mean, median, mode).
# `mode()` may return several rows when values tie; only its first row is kept.
summary_df = pd.DataFrame({
    'Min': numeric_part.min(),
    'Max': numeric_part.max(),
    'Mean': numeric_part.mean(),
    'Median': numeric_part.median(),
    'Mode': numeric_part.mode().iloc[0],
})

print(summary_df)

# Handle missing values

In [None]:
# Missing-value count per column before any cleaning.
print('Missing values')
print(df.isnull().sum())
print()

# Drop every row that has no value in at least one of these three columns.
df = df.dropna(subset=['average_cycle_length', 'cycle_length_std', 'regular_cycle'])

# Keep rows whose object-typed entries are missing, but mark the gap with an
# explicit label. Only object columns are filled: writing the string into a
# numeric column that still contained NaN would silently turn that column into
# object dtype. `fillna` with a dict fills just the listed columns.
text_columns = df.select_dtypes(include=['object']).columns
df = df.fillna({name: 'Missing entry' for name in text_columns})

# Missing-value count per column after cleaning; any non-zero entry left here
# belongs to a non-object column that was not handled above.
print('Missing values')
print(df.isnull().sum())
print()

# Check for duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row. The duplicates
# are only reported, not removed; `df.drop_duplicates()` would drop them.
print('Duplicates', df.duplicated().sum(), '', sep='\n')

# dtype of every column in the current frame.
print('Types', df.dtypes, sep='\n')

# Plot box plot to detect outliers

In [None]:
numeric_columns = df.select_dtypes(exclude=['object']).columns

# Only 'n_cycles_trying' is plotted; iterate over `numeric_columns` instead to
# get one box plot per non-object column.
columns_to_plot = ['n_cycles_trying']

for plotted_column in columns_to_plot:
    fig, ax = plt.subplots(figsize=(6, 10))
    ax.boxplot(df[plotted_column])
    ax.set_ylabel(plotted_column)
    # Rotate the y tick labels of the current axes by 90 degrees.
    plt.yticks(rotation=90, ha='right')
    plt.show()

    # Outliers are shown but not removed. A possible threshold-based filter:
    #   df = df[df[plotted_column] < df[plotted_column].quantile(0.99)]

In [None]:
# Bin several numeric columns into ranges. For each (column, edges) pair a new
# '<column>_group' column holds the interval each value falls into.
#
# Intervals are closed on the right. `include_lowest=True` additionally puts
# values equal to the first edge (e.g. a dedication or intercourse_frequency of
# exactly 0) into the first interval instead of turning them into NaN; pandas
# then labels that interval with a slightly lowered left edge, e.g.
# (-0.001, 0.25]. Values below the first or above the last edge still become NaN.
#
# NOTE(review): np.arange excludes its stop value, so the last edges are 40
# (age), 55 (average_cycle_length) and 25 (cycle_length_std); larger values are
# left unbinned (NaN). Confirm that these upper limits are intended.
for column_name, edges in [
    ('bmi', [5, 18.5, 24.9, 29.9, 39.9, 50]),
    ('age', np.arange(20, 45, 5)),
    ('dedication', np.linspace(0, 1, 5)),
    ('average_cycle_length', np.arange(20, 60, 5)),
    ('cycle_length_std', np.arange(0, 30, 5)),
    ('intercourse_frequency', [0, 0.1, 0.2, 0.4, 1]),
]:

    df[column_name + '_group'] = pd.cut(
        df[column_name],
        bins=edges,
        include_lowest=True,
    )

In [None]:
# Write the current frame to disk as a pickle file.
cleaned_pickle_path = "NC_cleaned.pkl"
df.to_pickle(cleaned_pickle_path)

# One-hot encode

In [None]:
# Convert the categorical columns into a numeric format.
# Each listed column is replaced by the integer codes of its categories
# (`cat.codes`). This is integer label encoding, not one-hot encoding: no
# indicator columns are created and the column count stays the same.
object_columns = [
    'been_pregnant_before',
    'regular_cycle',
    'education',
    'sleeping_pattern',
    'country',
    'outcome',
]

# `cat.codes` numbers the categories from 0 and uses -1 for missing values, so
# no rows are dropped before encoding. Dropping rows here would make the
# index-aligned assignment leave NaN in all of these columns for the dropped rows.
df[object_columns] = df[object_columns].apply(
    lambda column: column.astype('category').cat.codes
)

In [None]:
# Write the frame with integer-coded categorical columns to its own pickle file.
encoded_pickle_path = "NC_cleaned_encoded.pkl"
df.to_pickle(encoded_pickle_path)