In [None]:
import numpy as np
import pandas as pd
# Format floats in pandas output with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

import matplotlib.pyplot as plt
# Set the default font size used in matplotlib figures to 20.
plt.rcParams['font.size'] = 20

# Table of Contents

- [Q1: What is the chance of getting pregnant within 13 cycles?](#Compute-'Chance-of-pregnancy-within-13-cycles')
- [Q2: How long does it usually take to get pregnant?](#Compute-'How-long-does-it-usually-take-to-get-pregnant?')
- [Q3: What factors impact the time it takes to get pregnant?](#Compute-'What-factors-impact-the-time-it-takes-to-get-pregnant?')

# Compute 'Chance of pregnancy within 13 cycles'

In [None]:
# Load the cleaned dataset; `column_trying` names the column that is
# compared against the 13-cycle threshold below.
df = pd.read_pickle('NC_cleaned.pkl')

column_trying = 'n_cycles_trying'

# rows whose outcome is 'pregnant' and whose cycle count is at most 13
is_pregnant = df['outcome'] == 'pregnant'
within_13_cycles = df[column_trying] <= 13
filtered_df = df[within_13_cycles & is_pregnant]

# chance = share of all rows that satisfy both conditions, in percent
total_count = len(df)
chance = 100 * len(filtered_df) / total_count

print(f"Chance is {chance:.2f} %")

In [None]:
# Subgroup under study: dedication group (0.75, 1.0] combined with
# intercourse frequency group (0.4, 1.0].
high_dedication = pd.Interval(0.75, 1.0, closed='right')
high_frequency = pd.Interval(0.4, 1.0, closed='right')
in_subgroup = (
    (df['dedication_group'] == high_dedication)
    & (df['intercourse_frequency_group'] == high_frequency)
)

# every row of the subgroup (denominator)
total_df = df[in_subgroup]

# rows of the subgroup with outcome 'pregnant' within 13 cycles (numerator)
filtered_df = df[
    (df[column_trying] <= 13)
    & (df['outcome'] == 'pregnant')
    & in_subgroup
]

# compute chance as a percentage of the subgroup
total_count = len(total_df)
chance = 100 * len(filtered_df) / total_count

print(f"Chance when app dedication: logged >75% and intercourse frequency: logged >40% is {chance:.2f} %")

In [None]:
# Subgroup under study: dedication group (0.0, 0.25] combined with
# intercourse frequency group (0.0, 0.1].
low_dedication = pd.Interval(0.0, 0.25, closed='right')
low_frequency = pd.Interval(0.0, 0.1, closed='right')
in_subgroup = (
    (df['dedication_group'] == low_dedication)
    & (df['intercourse_frequency_group'] == low_frequency)
)

# every row of the subgroup (denominator)
total_df = df[in_subgroup]

# rows of the subgroup with outcome 'pregnant' within 13 cycles (numerator)
filtered_df = df[
    (df[column_trying] <= 13)
    & (df['outcome'] == 'pregnant')
    & in_subgroup
]

# compute chance as a percentage of the subgroup
total_count = len(total_df)
chance = 100 * len(filtered_df) / total_count

# message typo fixed: "ntercourse" -> "intercourse"
print(f"Chance when app dedication: logged <25% and intercourse frequency: logged <10% is {chance:.2f} %")

In [None]:
# Reload the cleaned data and keep the rows with outcome 'pregnant'
# within 13 cycles.
df = pd.read_pickle('NC_cleaned.pkl')

pregnant_within_13 = (df[column_trying] <= 13) & (df['outcome'] == 'pregnant')
filtered_df = df[pregnant_within_13]

# columns whose groups are compared, one figure per column
group_columns = [
    'bmi_group',
    'age_group',
    'been_pregnant_before',
    'average_cycle_length_group',
    'dedication_group',
    'cycle_length_std_group',
    'intercourse_frequency_group',
    'sleeping_pattern',
    'education',
    'regular_cycle',
]

for column_name in group_columns:
    fig, ax = plt.subplots(figsize=(8, 8))

    # per-group row counts: pregnant within 13 cycles vs. everyone in the group
    filtered_count = filtered_df.groupby(column_name)[column_trying].size()
    total_count = df.groupby(column_name)[column_trying].size()
    chance_by_group = 100 * filtered_count / total_count

    chance_by_group.plot(
        kind='bar',
        ax=ax,
        color='blue',
        xlabel=column_name,
        ylabel='Chance of pregnancy\nwithin 13 cycles (%)'
    )
    plt.xticks(rotation=45)
    plt.show()

# Compute 'How long does it usually take to get pregnant?'

In [None]:
from scipy.stats import f_oneway

def anova_test(df, category_column, target_column, *categories):
    """
    Run a one-way ANOVA on `target_column` across the given categories.

    For every value in `categories`, the rows of `df` whose
    `category_column` equals that value form one sample; all samples are
    passed to `scipy.stats.f_oneway` to check whether the group means
    differ significantly.

    Returns the tuple (f_statistic, p_value).
    """
    samples = [
        df.loc[df[category_column] == label, target_column]
        for label in categories
    ]
    f_statistic, p_value = f_oneway(*samples)

    return f_statistic, p_value

In [None]:
df = pd.read_pickle("NC_cleaned.pkl")

# create a dataframe of women that became pregnant after trying
filtered_df = df[df['outcome'] == 'pregnant']

# compute overall mean and median of the number of cycles tried
mean = filtered_df[column_trying].mean()
median = filtered_df[column_trying].median()
print(f"Mean is {mean:.2f}\nMedian is {median:.2f}\n")

# loop over columns and compute the mean and median of each column group
for column_name in [
    'bmi_group',
    'age_group',
    'been_pregnant_before',
    'average_cycle_length_group',
    'dedication_group',
    'cycle_length_std_group',
    'intercourse_frequency_group',
    'sleeping_pattern',
    'education',
    'regular_cycle',
]:

    # ANOVA test for statistically significant differences in means across
    # groups. It runs on filtered_df (not df) so that the significance tag in
    # the legend refers to exactly the group means plotted below.
    unique_groups = filtered_df[column_name].dropna().unique()
    f_statistic, p_value = anova_test(filtered_df, column_name, column_trying, *unique_groups)
    is_significant = p_value < 0.05

    print(column_name)
    print(f'F-statistic: {f_statistic:.5f}')
    print_suffix = ' (Significant!)' if is_significant else ''
    print(f'P-value: {p_value:.5f}{print_suffix}\n')
    legend_suffix = ' (Significant difference!)' if is_significant else ''

    # per-group statistics; each name now matches the aggregation it holds
    # (the two were previously swapped, so the legend labels were inverted)
    cycles_by_group = filtered_df.groupby(column_name)[column_trying]
    median_values = cycles_by_group.median()
    mean_values = cycles_by_group.mean()

    # plot median (blue) and mean (orange) side by side for every group
    fig, ax = plt.subplots(figsize=(8, 8))

    median_values.plot(
        kind='bar',
        ax=ax,
        color='blue',
        position=1,
        align='center',
        width=0.2,
        xlabel=column_name,
        ylabel='Cycles of trying',
        label='Median',
    )
    mean_values.plot(
        kind='bar',
        ax=ax,
        color='orange',
        position=0,
        align='center',
        width=0.2,
        label=f'Mean{legend_suffix}',
    )
    plt.xticks(rotation=45)
    plt.legend(loc='best', frameon=False)
    plt.show()

# Compute 'What factors impact the time it takes to get pregnant?'

In [None]:
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [None]:
df = pd.read_pickle('NC_cleaned_encoded.pkl')

# keep only rows whose encoded outcome equals 1
# NOTE(review): presumably 1 encodes 'pregnant' — confirm against the encoding step
filtered_df = df[df['outcome'] == 1]

feature_columns = [
    'bmi',
    'age',
    'been_pregnant_before',
    'education',
    'sleeping_pattern',
    'dedication',
    'average_cycle_length',
    'cycle_length_std',
    'regular_cycle',
    'intercourse_frequency',
]
X = filtered_df[feature_columns]
y = filtered_df['n_cycles_trying']

# Standardise the features. StandardScaler returns a bare ndarray, so wrap the
# result back into a DataFrame: the regression summary then shows the feature
# names instead of x1..x10, and the rows stay index-aligned with y.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# add the intercept column (first column) and fit ordinary least squares
X2 = sm.add_constant(X_scaled)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

In [None]:
# Tabulate each feature's fitted coefficient and p-value; the first entry
# of params/pvalues is skipped so the remaining values line up with X.columns.
coefficients = pd.DataFrame(
    {
        'coefficient': list(est2.params)[1:],
        'pvalue': list(est2.pvalues)[1:],
    },
    index=X.columns,
)

print(coefficients)