In [1]:

from pymongo import MongoClient
from pymongo.collection import Collection
# NOTE(review): `Reset` is not imported in any visible cell, so on a fresh
# kernel this would raise NameError unless it is provided elsewhere — confirm
# which module supplies it and import it alongside the other libraries.
r = Reset()
# Presumably restores the database to its initial state — verify against `Reset`.
r.reset_database()

# Import your libraries here
import math
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy
from country_converter import CountryConverter
from statsmodels.stats.contingency_tables import Table2x2
from statsmodels.stats.power import GofChisquarePower


ModuleNotFoundError: No module named 'pymongo'

In [None]:
# --- Connect to the local MongoDB server ---
client = MongoClient("localhost", 27017)

# Database that holds the A/B-test data
db = client["wqu-abtest"]

# Collection of MScFE applicants, used by the exploratory cells below
mscfe_app = db["mscfe-applicants"]

In [None]:
# Display a single document from the collection to inspect its fields.
mscfe_app.find_one()

In [None]:
# Count applicants per ISO-2 country code
nationality_pipeline = [
    {"$group": {"_id": "$countryISO2", "count": {"$count": {}}}},
]
result = mscfe_app.aggregate(nationality_pipeline)

# Materialise the cursor as a DataFrame; give Mongo's `_id` key a readable name
df_nationality = pd.DataFrame(result).rename(columns={"_id": "country_iso2"})

print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
# Converter used to translate the ISO-2 codes into other identifiers
cc = CountryConverter()

# Both new columns are derived from the same ISO-2 source column
iso2_codes = df_nationality["country_iso2"]
df_nationality["country_name"] = cc.convert(iso2_codes, to="short_name")
df_nationality["country_iso3"] = cc.convert(iso2_codes, to="ISO3")

print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
# Create `build_nat_choropleth` function
def build_nat_choropleth():
    """Build a choropleth map of applicant counts per country.

    Reads the module-level `df_nationality` frame, locating countries by
    its `"country_iso3"` column and colouring them by its `"count"` column.

    Returns
    -------
    The figure object produced by `px.choropleth`.
    """
    choropleth_options = dict(
        data_frame=df_nationality,
        locations="country_iso3",
        color="count",
        projection="natural earth",
        color_continuous_scale=px.colors.sequential.Oranges,
        title="MScFE Applicants: Nationalities",
    )
    return px.choropleth(**choropleth_options)
# Don't delete the code below 👇
# Build the figure, export it as a static 700x500 PNG, then render it inline.
nat_fig = build_nat_choropleth()
# NOTE(review): assumes an `images/` directory already exists relative to the
# working directory — confirm before running on a fresh checkout.
nat_fig.write_image("images/7-5-4.png", scale=1, height=500, width=700)

nat_fig.show()

# ETL

In [None]:
class MongoRepository:
    """Repository class for interacting with MongoDB database.

    Parameters
    ----------
    client : `pymongo.MongoClient`, optional
        By default, `MongoClient(host='localhost', port=27017)`. The default
        client is only created when no client is supplied, so any object
        supporting `client[db][collection]` can be injected instead.
    db : str
        By default, `'wqu-abtest'`.
    collection : str
        By default, `'mscfe-applicants'`.

    Attributes
    ----------
    collection : pymongo.collection.Collection
        All data will be extracted from and loaded to this collection.
    """

    # Task 7.5.5: `__init__` method
    def __init__(self, client=None, db="wqu-abtest", collection="mscfe-applicants"):
        # Create the default connection lazily rather than as a default
        # argument, so that no connection is opened at class-definition time.
        if client is None:
            client = MongoClient(host="localhost", port=27017)
        self.collection = client[db][collection]

    # Task 7.5.6: `find_by_date` method
    def find_by_date(self, date_string):
        """Find no-quiz applicants created on a given day.

        Parameters
        ----------
        date_string : str
            Day to query, in `YYYY-MM-DD` format.

        Returns
        -------
        list
            Documents whose `createdAt` falls in `[date, date + 1 day)` and
            whose `admissionsQuiz` is `"incomplete"`.
        """
        # Convert `date_string` to datetime object
        start = pd.to_datetime(date_string, format="%Y-%m-%d")
        # Offset `start` by 1 day
        end = start + pd.DateOffset(days=1)
        # Create PyMongo query for no-quiz applicants b/t `start` and `end`
        query = {
            "createdAt": {"$gte": start, "$lt": end},
            "admissionsQuiz": "incomplete",
        }
        # Query collection and materialise the cursor as a list
        return list(self.collection.find(query))

    # Task 7.5.7: `update_applicants` method
    def update_applicants(self, documents):
        """Write each document back to the collection, matched on `_id`.

        Parameters
        ----------
        documents : iterable of dict
            Documents to update; each must contain an `"_id"` key.

        Returns
        -------
        dict
            `{"n": <total matched>, "nModified": <total modified>}`.
        """
        # Initialise counters
        n = 0
        n_modified = 0
        # Update individual records iteratively
        for doc in documents:
            result = self.collection.update_one(
                filter={"_id": doc["_id"]},
                update={"$set": doc},
            )
            n += result.matched_count
            n_modified += result.modified_count
        return {"n": n, "nModified": n_modified}

    # Task 7.5.7: `assign_to_groups` method
    def assign_to_groups(self, date_string):
        """Split a day's no-quiz applicants into control and treatment groups.

        Parameters
        ----------
        date_string : str
            Day whose applicants are assigned, in `YYYY-MM-DD` format.

        Returns
        -------
        dict
            The `{"n": ..., "nModified": ...}` summary returned by
            `update_applicants`.
        """
        # Get observations
        observations = self.find_by_date(date_string)

        # Shuffle with a private, fixed-seed generator: this yields the same
        # permutation as `random.seed(42); random.shuffle(...)` but leaves the
        # global random state untouched.
        random.Random(42).shuffle(observations)

        # Get index position of item at observations halfway point
        idx = len(observations) // 2

        # Assign first half of observations to control group
        for doc in observations[:idx]:
            doc["inExperiment"] = True
            doc["group"] = "no email (control)"
        # Assign second half of observations to treatment group
        for doc in observations[idx:]:
            doc["inExperiment"] = True
            doc["group"] = "email (treatment)"

        # Update the collection. Return the summary dict itself: wrapping it
        # in `list(...)` would keep only its keys and discard the counts.
        return self.update_applicants(observations)

    # Task 7.5.14: `find_exp_observations` method
    def find_exp_observations(self):
        """Return every document flagged `inExperiment: True` as a list."""
        return list(self.collection.find({"inExperiment": True}))

In [None]:
# Instantiate the repository and display it to confirm it was created.
repo = MongoRepository()
print("repo type:", type(repo))
repo

In [None]:
# Sanity check: fetch the no-quiz applicants for one day and show the first three.
check = repo.find_by_date("2022-06-01")
check[:3] 

# Prepare Experiment

In [None]:
# Solve for the number of observations needed per group, then round up
# because a group cannot contain a fractional applicant.
chi_square_power = GofChisquarePower()
required_observations = chi_square_power.solve_power(
    effect_size=0.5,
    alpha=0.05,
    power=0.8,
)
group_size = math.ceil(required_observations)

print("Group size:", group_size)
print("Total # of applicants needed:", group_size * 2)

In [None]:
# Count no-quiz applicants per sign-up day
match_no_quiz = {"$match": {"admissionsQuiz": "incomplete"}}
group_by_day = {
    "$group": {
        "_id": {"$dateTrunc": {"date": "$createdAt", "unit": "day"}},
        "count": {"$sum": 1},
    }
}
result = mscfe_app.aggregate([match_no_quiz, group_by_day])

# Build a date-indexed, chronologically sorted Series of daily sign-ups
daily_counts = pd.DataFrame(result).rename(
    columns={"_id": "date", "count": "new_users"}
)
no_quiz_mscfe = daily_counts.set_index("date").sort_index().squeeze()

print("no_quiz type:", type(no_quiz_mscfe))
print("no_quiz shape:", no_quiz_mscfe.shape)
no_quiz_mscfe.head()

In [None]:
# Compute the summary once and read both statistics from it
summary_stats = no_quiz_mscfe.describe()
mean, std = summary_stats["mean"], summary_stats["std"]
print("no_quiz mean:", mean)
print("no_quiz std:", std)

In [None]:
# Candidate effect sizes, and observation counts from 0 up to the total needed
effect_sizes = np.array([0.2, 0.5, 0.8])
n_observations = np.arange(group_size * 2 + 1)

# Draw one power curve per effect size; the trailing `;` hides the figure repr
power_curve_options = dict(
    dep_var="nobs",
    nobs=n_observations,
    effect_size=effect_sizes,
    alpha=0.05,
    n_bins=5,
)
chi_square_power.plot_power(**power_curve_options);

In [None]:
# Scale the daily statistics up to the length of the experiment: the mean of
# a sum grows with the number of days, its standard deviation with the square root.
exp_days = 7
sum_mean = exp_days * mean
sum_std = np.sqrt(exp_days) * std
print("Mean of sum:", sum_mean)
print("Std of sum:", sum_std)

In [None]:
# Normal approximation of the total number of sign-ups over the experiment
applicants_needed = group_size * 2
signup_total_dist = scipy.stats.norm(loc=sum_mean, scale=sum_std)

# Probability of reaching at most / more than the required number of applicants
prob_65_or_fewer = signup_total_dist.cdf(applicants_needed)
prob_65_or_greater = 1 - prob_65_or_fewer

# NOTE(review): the label below hard-codes "65" while the threshold actually
# used is `group_size * 2` — confirm the two agree.
print(
    f"Probability of getting 65+ no_quiz in {exp_days} days:",
    round(prob_65_or_greater, 3),
)

In [None]:
# NOTE(review): `Experiment` is not imported in any visible cell — confirm
# which module provides it and import it with the other libraries.
# NOTE(review): the `repo` keyword receives the raw `client`, not the
# `MongoRepository` instance named `repo` — confirm this is what `Experiment` expects.
exp = Experiment(repo=client, db="wqu-abtest", collection="mscfe-applicants")
exp.reset_experiment()
# Run the experiment for `exp_days` days and display what it returns.
result = exp.run_experiment(days=exp_days, assignment=True)
print("result type:", type(result))
result

# Analyse Result

In [None]:
# Pull every document flagged as part of the experiment
result = repo.find_exp_observations()

# Tabulate the documents and discard rows with any missing value
experiment_frame = pd.DataFrame(result)
df = experiment_frame.dropna()

print("df type:", type(df))
print("df shape:", df.shape)
df.head()

In [None]:
# Raw (non-normalised) counts of quiz status within each experimental group
group_labels = df["group"]
quiz_status = df["admissionsQuiz"]
data = pd.crosstab(group_labels, quiz_status, normalize=False)

print("data type:", type(data))
print("data shape:", data.shape)
data

In [None]:
# Create `build_contingency_bar` function
def build_contingency_bar():
    """Build a grouped bar chart from the module-level `data` table.

    Returns
    -------
    The figure object produced by `px.bar`, with its axis titles set.
    """
    bar_fig = px.bar(
        data,
        barmode="group",
        title="MScFE: Admissions Quiz Completion by Group",
    )
    axis_titles = {"xaxis_title": "Group", "yaxis_title": "Frequency [count]"}
    bar_fig.update_layout(**axis_titles)
    return bar_fig
# Don't delete the code below 👇
# Build the figure, export it as a static 700x500 PNG, then render it inline.
cb_fig = build_contingency_bar()
# NOTE(review): assumes an `images/` directory already exists relative to the
# working directory — confirm before running on a fresh checkout.
cb_fig.write_image("images/7-5-16.png", scale=1, height=500, width=700)

cb_fig.show()

In [None]:
# Wrap the crosstab counts in a statsmodels 2x2 contingency table
observed_counts = data.to_numpy()
contingency_table = Table2x2(observed_counts)

print("contingency_table type:", type(contingency_table))
contingency_table.table_orig

In [None]:
# Test for association between group and quiz completion in the 2x2 table.
chi_square_test = contingency_table.test_nominal_association()

print("chi_square_test type:", type(chi_square_test))
print(chi_square_test)

In [None]:
# Odds ratio of the 2x2 table, rounded to one decimal place for reporting.
odds_ratio = contingency_table.oddsratio.round(1)
print("Odds ratio:", odds_ratio)