<h1>Reinforcement and device-based questionnaire adaptivity results</h1>

In [1]:
import pandas as pd
import requests
import io
import numpy as np
from scipy import stats

from IPython.display import display
pd.set_option('display.max_columns', None)

<h2>General results (BEFORE reinforcement)</h2>

<p>General results <b>before</b> the rules were applied.</p>

In [2]:
# Students' table as it stood before the reinforcement rules were applied (local CSV).
data_before_reinf = pd.read_csv(
    "dataBeforeReinforcement.csv",
    low_memory=False,
)

<h3>Overall results taking into account only the users that have started the questionnaire</h3>

In [3]:
# Row masks over the pre-reinforcement table. The comparisons against True/False are
# kept explicit so the selection is exactly the one the counts below were built on.
_started = data_before_reinf["cuestionarioIniciado"] == True
_finished = data_before_reinf["cuestionarioFinalizado"] == True
_unfinished = data_before_reinf["cuestionarioFinalizado"] == False

total_started_questionnaires_before_reinf = len(data_before_reinf[_started])
print("Number of students that have STARTED the questionnaire: " + str(total_started_questionnaires_before_reinf))

uncompleted_questionnaires_before_reinf = len(data_before_reinf[_started & _unfinished])
print("Number of students that have NOT finished the questionnaire: " + str(uncompleted_questionnaires_before_reinf))

completed_questionnaires_before_reinf = len(data_before_reinf[_started & _finished])
print("Number of students that have finished the questionnaire: " + str(completed_questionnaires_before_reinf))

# Percentage of started questionnaires that were finished.
completion_rate_before_reinf = 100 * float(completed_questionnaires_before_reinf) / total_started_questionnaires_before_reinf
print("\nCompletion rate BEFORE reinforcement: " + '% .2f' % completion_rate_before_reinf + "%")

Number of students that have STARTED the questionnaire: 5768
Number of students that have NOT finished the questionnaire: 1358
Number of students that have finished the questionnaire: 4410

Completion rate BEFORE reinforcement:  76.46%


<h3>Overall results taking into account all the users that have clicked the questionnaire link (started or not)</h3>

In [4]:
# "Entered" is identified here by a non-zero assigned vertical, whether or not the
# questionnaire was actually started.
_entered = data_before_reinf["verticalAsignado"] != 0
_finished = data_before_reinf["cuestionarioFinalizado"] == True
_unfinished = data_before_reinf["cuestionarioFinalizado"] == False

total_started_questionnaires_before_reinf_2 = len(data_before_reinf[_entered])
print("Number of students that have ENTERED the questionnaire: " + str(total_started_questionnaires_before_reinf_2))

uncompleted_questionnaires_before_reinf_2 = len(data_before_reinf[_entered & _unfinished])
print("Number of students that have NOT finished the questionnaire: " + str(uncompleted_questionnaires_before_reinf_2))

completed_questionnaires_before_reinf_2 = len(data_before_reinf[_entered & _finished])
print("Number of students that have finished the questionnaire: " + str(completed_questionnaires_before_reinf_2))

# Percentage of entered questionnaires that were finished.
completion_rate_before_reinf_2 = 100 * float(completed_questionnaires_before_reinf_2) / total_started_questionnaires_before_reinf_2
print("\nCompletion rate BEFORE reinforcement: " + '% .2f' % completion_rate_before_reinf_2 + "%")

Number of students that have ENTERED the questionnaire: 6360
Number of students that have NOT finished the questionnaire: 1950
Number of students that have finished the questionnaire: 4410

Completion rate BEFORE reinforcement:  69.34%


<h2>General results (AFTER reinforcement)</h2>

<p>General results <b>after</b> the rules were applied.</p>

In [5]:
# Read the API token from a local file so the credential never appears in the notebook.
# Surrounding whitespace is stripped: a trailing newline (as most editors add) would
# otherwise end up inside the Authorization header value.
# The handle is not named `file`, which would shadow the Python 2 builtin.
with open("tokenOEEU.txt") as token_file:
    api_token = token_file.read().strip()

In [6]:
# Download the current students' table from the API (semicolon-separated CSV).
students_response = requests.get('https://datos.oeeu.org/api/master/estudiantes/', headers={'Authorization': api_token})
# Stop here on an HTTP error (e.g. rejected token) instead of parsing the error body as CSV.
students_response.raise_for_status()
data_after_reinf = pd.read_csv(io.StringIO(students_response.content.decode('utf-8')), low_memory=False, sep=";")

<h3>Overall results taking into account only the users that have started the questionnaire</h3>

In [7]:
# Row masks over the post-reinforcement table, with explicit True/False comparisons.
_started = data_after_reinf["cuestionarioIniciado"] == True
_finished = data_after_reinf["cuestionarioFinalizado"] == True
_unfinished = data_after_reinf["cuestionarioFinalizado"] == False

total_started_questionnaires_after_reinf = len(data_after_reinf[_started])
print("Number of students that have STARTED the questionnaire: " + str(total_started_questionnaires_after_reinf))

uncompleted_questionnaires_after_reinf = len(data_after_reinf[_started & _unfinished])
print("Number of students that have NOT finished the questionnaire: " + str(uncompleted_questionnaires_after_reinf))

completed_questionnaires_after_reinf = len(data_after_reinf[_started & _finished])
print("Number of students that have finished the questionnaire: " + str(completed_questionnaires_after_reinf))

# Percentage of started questionnaires that were finished.
completion_rate_after_reinf = 100 * float(completed_questionnaires_after_reinf) / total_started_questionnaires_after_reinf
print("\nCompletion rate AFTER reinforcement: " + '% .2f' % completion_rate_after_reinf + "%")

Number of students that have STARTED the questionnaire: 6738
Number of students that have NOT finished the questionnaire: 1524
Number of students that have finished the questionnaire: 5214

Completion rate AFTER reinforcement:  77.38%


<h3>Overall results taking into account all the users that have clicked the questionnaire link (started or not)</h3>

In [8]:
# "Entered" is identified here by a non-zero assigned vertical, whether or not the
# questionnaire was actually started.
_entered = data_after_reinf["verticalAsignado"] != 0
_finished = data_after_reinf["cuestionarioFinalizado"] == True
_unfinished = data_after_reinf["cuestionarioFinalizado"] == False

total_started_questionnaires_after_reinf_2 = len(data_after_reinf[_entered])
print("Number of students that have ENTERED the questionnaire: " + str(total_started_questionnaires_after_reinf_2))

uncompleted_questionnaires_after_reinf_2 = len(data_after_reinf[_entered & _unfinished])
print("Number of students that have NOT finished the questionnaire: " + str(uncompleted_questionnaires_after_reinf_2))

completed_questionnaires_after_reinf_2 = len(data_after_reinf[_entered & _finished])
print("Number of students that have finished the questionnaire: " + str(completed_questionnaires_after_reinf_2))

# Percentage of entered questionnaires that were finished.
completion_rate_after_reinf_2 = 100 * float(completed_questionnaires_after_reinf_2) / total_started_questionnaires_after_reinf_2
print("\nCompletion rate AFTER reinforcement: " + '% .2f' % completion_rate_after_reinf_2 + "%")

Number of students that have ENTERED the questionnaire: 7349
Number of students that have NOT finished the questionnaire: 2135
Number of students that have finished the questionnaire: 5214

Completion rate AFTER reinforcement:  70.95%


<h2>Device-based questionnaire variant selection</h2>

<h3>Device data gathering (after and before the reinforcement)</h3>

In [9]:
# We take the device data from the most recent access recorded in the paradata BEFORE
# the reinforcement. If the user used several devices, the one that counts is the one
# with which the user abandoned the questionnaire (i.e. the device of the last access).
# NOTE(review): keep="last" relies on the CSV rows being in chronological order - confirm.
env_paradata_before_reinf = (
    pd.read_csv("deviceDataBeforeReinforcement.csv", low_memory=False)
    .drop_duplicates(subset=["estudiante_id"], keep="last")
)
# Keep only the student key and the device variables used further on.
env_paradata_before_reinf = env_paradata_before_reinf[[
    "estudiante_id",
    "device_pixel_ratio",
    "device_screen_height",
    "device_screen_width",
    "tablet_or_mobile",
    "viewport_width",
    "viewport_height",
    "os",
]]

In [10]:
# Download the environment (device) paradata from the API (semicolon-separated CSV).
env_paradata_response = requests.get('https://datos.oeeu.org/api/master/paradata_entorno/', headers={'Authorization': api_token})
# Stop here on an HTTP error (e.g. rejected token) instead of parsing the error body as CSV.
env_paradata_response.raise_for_status()
env_paradata_after_reinf = pd.read_csv(io.StringIO(env_paradata_response.content.decode('utf-8')), low_memory=False, sep=";")

# We take the device data from the first access in the paradata dataset AFTER the
# reinforcement date (2017-07-16), because the redirection rules are applied based on
# the device with which the user returned to the questionnaire.
# The date filter is a plain string comparison against the timestamp column.
# NOTE(review): keep="first" relies on the API returning rows in chronological order - confirm.
env_paradata_after_reinf = env_paradata_after_reinf[env_paradata_after_reinf["sesion__tiempoInicio_sesion"] > "2017-07-16"]
env_paradata_after_reinf = env_paradata_after_reinf.drop_duplicates(subset=["estudiante_id"], keep="first")
env_paradata_after_reinf = env_paradata_after_reinf[["estudiante_id", "device_pixel_ratio", "device_screen_height", "device_screen_width", "tablet_or_mobile", "viewport_width", "viewport_height", "os"]]

Before merging the paradata and students' datasets, we first clean the students' datasets in order to keep only the variables that are
necessary to obtain the completion rate results:
- The vertical assigned (verticalAsignado)
- The boolean variables about the questionnaires' state: started (cuestionarioIniciado) and finalized (cuestionarioFinalizado)

In [11]:
# Student key plus the three questionnaire-state variables needed for completion rates.
_completion_columns = ['estudiante_id', 'verticalAsignado', 'cuestionarioFinalizado', 'cuestionarioIniciado']
data_before_reinf = data_before_reinf[_completion_columns]
data_after_reinf = data_after_reinf[_completion_columns]

Finally, we merge the datasets to obtain a single dataset containing the device and questionnaires' variables
before and after the reinforcement took place, in order to facilitate the comparison of these variables

In [12]:
# Left joins: every student row is kept, and the device columns stay empty when no
# paradata row exists for that student.
results_before_reinf = pd.merge(
    data_before_reinf, env_paradata_before_reinf,
    how='left', on='estudiante_id',
)
results_after_reinf = pd.merge(
    data_after_reinf, env_paradata_after_reinf,
    how='left', on='estudiante_id',
)

In [13]:
# One row per pre-reinforcement student; the columns shared by both tables get the
# "_before_reinf" / "_after_reinf" suffixes.
full_dataset = pd.merge(
    results_before_reinf,
    results_after_reinf,
    how='left',
    on='estudiante_id',
    suffixes=("_before_reinf", "_after_reinf"),
)

<h3>Identification of target users (users that entered the questionnaire after the reinforcement date, 07/16/2017)</h3>

We keep only the sessions started after the reinforcement date in order to identify all the users to which the
experiment was applied

In [14]:
# Download the students' session paradata from the API (semicolon-separated CSV).
session_response = requests.get('https://datos.oeeu.org/api/master/sesiones-estudiantes/', headers={'Authorization': api_token})
# Stop here on an HTTP error (e.g. rejected token) instead of parsing the error body as CSV.
session_response.raise_for_status()
session_paradata = pd.read_csv(io.StringIO(session_response.content.decode('utf-8')), low_memory=False, sep=";")

# One row per student: the first listed session whose start timestamp sorts after the
# reinforcement date (plain string comparison against "2017-07-16").
new_sessions_after_reinf = session_paradata[(session_paradata["tiempoInicio_sesion"] > "2017-07-16")].drop_duplicates(subset=["estudiante_id"], keep="first").copy()

We keep only the users that had not finished the questionnaire before the reinforcement took place (target users)

In [15]:
# Inner join: only students with a session after the reinforcement date remain.
# NOTE(review): this cell rebinds full_dataset from itself, so running it twice merges
# the session columns in a second time - re-run from the cell that builds full_dataset.
full_dataset = pd.merge(full_dataset, new_sessions_after_reinf, how="inner", on="estudiante_id")

# Target users: questionnaire not finished before the reinforcement.
_unfinished_before = full_dataset["cuestionarioFinalizado_before_reinf"] == False
full_dataset = full_dataset[_unfinished_before]

<h3>Identification of the different types of users affected by the reinforcement and redirection rules</h3>

<b>Users that had started the questionnaire after the reinforcement message.</b>
We identify them through the following characteristics:
- They had not started the questionnaire before reinforcement (cuestionarioIniciado_before_reinf == False)
- They didn't have a questionnaire vertical assigned (verticalAsignado_before_reinf == 0)
- They started the questionnaire after reinforcement (cuestionarioIniciado_after_reinf == True)

In [16]:
# New users: not started and no vertical assigned before the reinforcement.
# NOTE(review): the description of this group also lists
# cuestionarioIniciado_after_reinf == True, which is not applied here - confirm
# which of the two is intended.
_not_started_before = full_dataset["cuestionarioIniciado_before_reinf"] == False
_no_vertical_before = full_dataset["verticalAsignado_before_reinf"] == 0
new_users = full_dataset[_not_started_before & _no_vertical_before].copy()

<b>Users that resumed the questionnaire and had been redirected to a different questionnaire vertical.</b>
We identify them through the following characteristics:
- They did have a questionnaire vertical assigned before reinforcement (verticalAsignado_before_reinf != 0)
- Their questionnaire vertical assigned after reinforcement differs from the previous one, and therefore, they
have been redirected to another questionnaire vertical (verticalAsignado_before_reinf != verticalAsignado_after_reinf)

In [17]:
# Redirected users: had a vertical before, and the vertical after is different.
# NOTE(review): a missing verticalAsignado_after_reinf also compares as "different",
# so such rows would land in this group - confirm none exist.
_had_vertical_before = full_dataset["verticalAsignado_before_reinf"] != 0
_vertical_changed = full_dataset["verticalAsignado_before_reinf"] != full_dataset["verticalAsignado_after_reinf"]
redirected_users = full_dataset[_had_vertical_before & _vertical_changed].copy()

<b>Users that resumed the questionnaire and had NOT been redirected to a different questionnaire vertical.</b>
We identify them through the following characteristics:
- They did have a questionnaire vertical assigned before reinforcement (verticalAsignado_before_reinf != 0)
- Their questionnaire vertical assigned after reinforcement is the same as the previous one, and therefore, they
have NOT been redirected to another questionnaire vertical (verticalAsignado_before_reinf == verticalAsignado_after_reinf)

In [18]:
# Not redirected users: had a vertical before, and the vertical after is the same one.
_had_vertical_before = full_dataset["verticalAsignado_before_reinf"] != 0
_vertical_unchanged = full_dataset["verticalAsignado_before_reinf"] == full_dataset["verticalAsignado_after_reinf"]
not_redirected_users = full_dataset[_had_vertical_before & _vertical_unchanged].copy()

<h3>General numbers after reinforcement and rules' application</h3>

In [19]:
# Size of each of the three user groups (number of rows).
new_users_number = len(new_users)
print("Number of users that started the questionnaire after reinforcement: " + str(new_users_number))

redirected_users_number = len(redirected_users)
print("Number of users that resumed the questionnaire after reinforcement and had been redirected: " + str(redirected_users_number))

not_redirected_users_number = len(not_redirected_users)
print("Number of users that resumed the questionnaire after reinforcement and had NOT been redirected: " + str(not_redirected_users_number))

Number of users that started the questionnaire after reinforcement: 1003
Number of users that resumed the questionnaire after reinforcement and had been redirected: 110
Number of users that resumed the questionnaire after reinforcement and had NOT been redirected: 52


<h3>General results after reinforcement and rules' application</h3>

<h4>New users' completion rate</h4>

In [20]:
print("Number of new users that have started the questionnaire: " + str(new_users_number))

# Final questionnaire state of the new users after the reinforcement.
_unfinished_after = new_users["cuestionarioFinalizado_after_reinf"] == False
new_users_uncompleted_questionnaires = len(new_users[_unfinished_after])
print("Number of new users that have NOT finished the questionnaire: " + str(new_users_uncompleted_questionnaires))

_finished_after = new_users["cuestionarioFinalizado_after_reinf"] == True
new_users_completed_questionnaires = len(new_users[_finished_after])
print("Number of new users that have finished the questionnaire: " + str(new_users_completed_questionnaires))

# Percentage of the group that finished.
new_users_completion_rate = 100 * float(new_users_completed_questionnaires) / new_users_number
print("\nCompletion rate of new users: " + '% .2f' % new_users_completion_rate + "%")

Number of new users that have started the questionnaire: 1003
Number of new users that have NOT finished the questionnaire: 285
Number of new users that have finished the questionnaire: 718

Completion rate of new users:  71.59%


<h4>Redirected users' completion rate</h4>

In [21]:
print("Number of redirected users that have started the questionnaire: " + str(redirected_users_number))

# Final questionnaire state of the redirected users after the reinforcement.
_unfinished_after = redirected_users["cuestionarioFinalizado_after_reinf"] == False
redirected_users_uncompleted_questionnaires = len(redirected_users[_unfinished_after])
print("Number of redirected users that have NOT finished the questionnaire: " + str(redirected_users_uncompleted_questionnaires))

_finished_after = redirected_users["cuestionarioFinalizado_after_reinf"] == True
redirected_users_completed_questionnaires = len(redirected_users[_finished_after])
print("Number of redirected users that have finished the questionnaire: " + str(redirected_users_completed_questionnaires))

# Percentage of the group that finished.
redirected_users_completion_rate = 100 * float(redirected_users_completed_questionnaires) / redirected_users_number
print("\nCompletion rate of redirected users: " + '% .2f' % redirected_users_completion_rate + "%")

Number of redirected users that have started the questionnaire: 110
Number of redirected users that have NOT finished the questionnaire: 49
Number of redirected users that have finished the questionnaire: 61

Completion rate of redirected users:  55.45%


<h4>Not redirected users' completion rate</h4>

In [22]:
print("Number of not redirected users that have started the questionnaire: " + str(not_redirected_users_number))

# Final questionnaire state of the not-redirected users after the reinforcement.
_unfinished_after = not_redirected_users["cuestionarioFinalizado_after_reinf"] == False
not_redirected_users_uncompleted_questionnaires = len(not_redirected_users[_unfinished_after])
print("Number of not redirected users that have NOT finished the questionnaire: " + str(not_redirected_users_uncompleted_questionnaires))

_finished_after = not_redirected_users["cuestionarioFinalizado_after_reinf"] == True
not_redirected_users_completed_questionnaires = len(not_redirected_users[_finished_after])
print("Number of not redirected users that have finished the questionnaire: " + str(not_redirected_users_completed_questionnaires))

# Percentage of the group that finished.
not_redirected_users_completion_rate = 100 * float(not_redirected_users_completed_questionnaires) / not_redirected_users_number
print("\nCompletion rate of not redirected users: " + '% .2f' % not_redirected_users_completion_rate + "%")

Number of not redirected users that have started the questionnaire: 52
Number of not redirected users that have NOT finished the questionnaire: 27
Number of not redirected users that have finished the questionnaire: 25

Completion rate of not redirected users:  48.08%


<h3>Analysis of the rules application impact</h3>

<h4>Identification of the rule that would have been applied to pre-reinforcement users</h4>

In [23]:
# Pre-reinforcement students with a non-zero assigned vertical, as an independent copy
# so new columns can be added without touching results_before_reinf.
_entered_before = results_before_reinf["verticalAsignado"] != 0
pre_reinforcement_users = results_before_reinf[_entered_before].copy()

In [24]:
def rule_identification_pre(x):
    """Return the code (1-9) of the device rule that matches one row of device data.

    Parameters
    ----------
    x : row-like object exposing the attributes ``os``, ``tablet_or_mobile``,
        ``device_pixel_ratio``, ``device_screen_height`` and ``device_screen_width``
        (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        1-8 for the matching rule, or 9 when any of the four device measurements is
        missing or no rule matches.
    """
    device_fields = (
        x.tablet_or_mobile,
        x.device_pixel_ratio,
        x.device_screen_height,
        x.device_screen_width,
    )
    # pd.isnull accepts None and non-numeric values; np.isnan raises TypeError on them.
    if any(pd.isnull(value) for value in device_fields):
        return 9
    elif x.os == "Mac OS":
        return 5
    elif x.os not in ("iOS", "Android"):
        # Any other operating system ("Mac OS" was already handled above).
        return 4
    elif x.os == "Android" and x.tablet_or_mobile == False:
        return 8
    elif x.os == "Android" and x.device_pixel_ratio == 2:
        return 1
    elif x.os == "Android" and (x.device_pixel_ratio == 3 or x.device_pixel_ratio == 4):
        return 2
    elif x.os == "iOS" and x.device_pixel_ratio == 3:
        return 3
    # NOTE(review): a threshold of 1000 looks unusual for a pixel ratio - confirm.
    elif x.os == "Android" and x.device_pixel_ratio > 1000:
        return 6
    elif x.os == "iOS" and x.device_screen_height == 1024 and x.device_screen_width == 768 and (x.device_pixel_ratio == 1 or x.device_pixel_ratio == 2):
        return 7
    else:
        return 9

In [25]:
# Row-wise: tag every pre-reinforcement user with the device rule matching their data.
pre_reinforcement_users["applied_rule"] = pre_reinforcement_users.apply(
    rule_identification_pre,
    axis="columns",
)

<h4>After identifying the rule, we compare it to the vertical that was randomly assigned to obtain the G1 and G2 users' groups</h4>

In [26]:
def assigned_vertical_eval(x):
    """Tell whether the vertical assigned to a row matches the one its rule selects.

    Parameters
    ----------
    x : row-like object exposing ``applied_rule`` (rule code 1-9) and
        ``verticalAsignado`` (the vertical that was assigned).

    Returns
    -------
    int
        1 when the assigned vertical equals the vertical of the applied rule,
        0 when it differs (also returned for rule 9 with vertical 3),
        -99 for rule 9 with any other vertical (lost values).

    Raises
    ------
    ValueError
        If ``applied_rule`` is not one of the codes 1-9. The previous if/elif chain
        left ``right_vertical`` unbound in that case and failed with an
        UnboundLocalError.
    """
    # Vertical selected by each of the rules 1-8.
    vertical_for_rule = {1: 1, 2: 2, 3: 2, 4: 1, 5: 1, 6: 2, 7: 1, 8: 1}

    if x.applied_rule == 9:
        if x.verticalAsignado == 3:
            return 0
        # lost values
        return -99

    if x.applied_rule not in vertical_for_rule:
        raise ValueError("Unknown applied_rule code: %r" % (x.applied_rule,))

    if vertical_for_rule[x.applied_rule] == x.verticalAsignado:
        return 1
    return 0

In [27]:
# Row-wise: 1 / 0 / -99 flag comparing the assigned vertical with the rule's vertical.
pre_reinforcement_users["rightVertical"] = pre_reinforcement_users.apply(
    assigned_vertical_eval,
    axis="columns",
)

<h4>Group 1 users finalization rate</h4>

In [28]:
# G1: pre-reinforcement users flagged with rightVertical == 0.
_is_g1 = pre_reinforcement_users["rightVertical"] == 0
g1 = pre_reinforcement_users[_is_g1].copy()

In [29]:
total_started_questionnaires_g1 = len(g1)
print("Number of G1 users that ENTERED the questionnaire: " + str(total_started_questionnaires_g1))

# Questionnaire state of the G1 users (explicit True/False comparisons).
_unfinished = g1["cuestionarioFinalizado"] == False
uncompleted_questionnaires_g1 = len(g1[_unfinished])
print("Number of G1 users that have NOT finished the questionnaire: " + str(uncompleted_questionnaires_g1))

_finished = g1["cuestionarioFinalizado"] == True
completed_questionnaires_g1 = len(g1[_finished])
print("Number of G1 users that have finished the questionnaire: " + str(completed_questionnaires_g1))

# Percentage of the group that finished.
completion_rate_g1 = 100 * float(completed_questionnaires_g1) / total_started_questionnaires_g1
print("\nCompletion rate of G1 users: " + '% .2f' % completion_rate_g1 + "%")

Number of G1 users that ENTERED the questionnaire: 3833
Number of G1 users that have NOT finished the questionnaire: 1232
Number of G1 users that have finished the questionnaire: 2601

Completion rate of G1 users:  67.86%


<h4>Group 2 users finalization rate</h4>

In [30]:
# G2: pre-reinforcement users flagged with rightVertical == 1.
_is_g2 = pre_reinforcement_users["rightVertical"] == 1
g2 = pre_reinforcement_users[_is_g2].copy()

In [31]:
total_started_questionnaires_g2 = len(g2)
print("Number of G2 users that ENTERED the questionnaire: " + str(total_started_questionnaires_g2))

# Questionnaire state of the G2 users (explicit True/False comparisons).
_unfinished = g2["cuestionarioFinalizado"] == False
uncompleted_questionnaires_g2 = len(g2[_unfinished])
print("Number of G2 users that have NOT finished the questionnaire: " + str(uncompleted_questionnaires_g2))

_finished = g2["cuestionarioFinalizado"] == True
completed_questionnaires_g2 = len(g2[_finished])
print("Number of G2 users that have finished the questionnaire: " + str(completed_questionnaires_g2))

# Percentage of the group that finished.
completion_rate_g2 = 100 * float(completed_questionnaires_g2) / total_started_questionnaires_g2
print("\nCompletion rate of G2 users: " + '% .2f' % completion_rate_g2 + "%")

Number of G2 users that ENTERED the questionnaire: 1542
Number of G2 users that have NOT finished the questionnaire: 387
Number of G2 users that have finished the questionnaire: 1155

Completion rate of G2 users:  74.90%


<h4>Group 3 users finalization rate</h4>

In [32]:
# G3 is the new-users group; work on an independent copy so columns added to g3
# do not appear on new_users.
g3 = new_users.copy(deep=True)

In [33]:
def rule_identification_post(x):
    """Return the code (1-9) of the device rule that matches one post-reinforcement row.

    Same rule table as ``rule_identification_pre``, read from the ``*_after_reinf``
    columns.

    Parameters
    ----------
    x : row-like object exposing the attributes ``os_after_reinf``,
        ``tablet_or_mobile_after_reinf``, ``device_pixel_ratio_after_reinf``,
        ``device_screen_height_after_reinf`` and ``device_screen_width_after_reinf``.

    Returns
    -------
    int
        1-8 for the matching rule, or 9 when any of the four device measurements is
        missing or no rule matches.
    """
    os_name = x.os_after_reinf
    is_mobile = x.tablet_or_mobile_after_reinf
    pixel_ratio = x.device_pixel_ratio_after_reinf
    screen_height = x.device_screen_height_after_reinf
    screen_width = x.device_screen_width_after_reinf

    # pd.isnull accepts None and non-numeric values; np.isnan raises TypeError on them.
    if any(pd.isnull(value) for value in (is_mobile, pixel_ratio, screen_height, screen_width)):
        return 9
    elif os_name == "Mac OS":
        return 5
    elif os_name not in ("iOS", "Android"):
        # Any other operating system ("Mac OS" was already handled above).
        return 4
    elif os_name == "Android" and is_mobile == False:
        return 8
    elif os_name == "Android" and pixel_ratio == 2:
        return 1
    elif os_name == "Android" and (pixel_ratio == 3 or pixel_ratio == 4):
        return 2
    elif os_name == "iOS" and pixel_ratio == 3:
        return 3
    # NOTE(review): a threshold of 1000 looks unusual for a pixel ratio - confirm.
    elif os_name == "Android" and pixel_ratio > 1000:
        return 6
    elif os_name == "iOS" and screen_height == 1024 and screen_width == 768 and (pixel_ratio == 1 or pixel_ratio == 2):
        return 7
    else:
        return 9

In [34]:
# Row-wise: tag every G3 user with the device rule matching their post-reinforcement data.
g3["applied_rule"] = g3.apply(
    rule_identification_post,
    axis="columns",
)

In [35]:
total_started_questionnaires_g3 = len(g3)
print("Number of G3 users that ENTERED the questionnaire: " + str(total_started_questionnaires_g3))

# Questionnaire state of the G3 users after the reinforcement.
_unfinished_after = g3["cuestionarioFinalizado_after_reinf"] == False
uncompleted_questionnaires_g3 = len(g3[_unfinished_after])
print("Number of G3 users that have NOT finished the questionnaire: " + str(uncompleted_questionnaires_g3))

_finished_after = g3["cuestionarioFinalizado_after_reinf"] == True
completed_questionnaires_g3 = len(g3[_finished_after])
print("Number of G3 users that have finished the questionnaire: " + str(completed_questionnaires_g3))

# Percentage of the group that finished.
completion_rate_g3 = 100 * float(completed_questionnaires_g3) / total_started_questionnaires_g3
print("\nCompletion rate of G3 users: " + '% .2f' % completion_rate_g3 + "%")

Number of G3 users that ENTERED the questionnaire: 1003
Number of G3 users that have NOT finished the questionnaire: 285
Number of G3 users that have finished the questionnaire: 718

Completion rate of G3 users:  71.59%


<h3>Correlation between the questionnaire vertical assignation and the finalization rate</h3>

<h4>G1-G2 correlation</h4>

In [36]:
# 2x2 contingency table: rows = questionnaire outcome, columns = user group.
nFinalized = pd.Series(
    [completed_questionnaires_g1, completed_questionnaires_g2],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1, uncompleted_questionnaires_g2],
    name='Not finished',
)
confMatrix = pd.concat([nFinalized, nNotFinalized], axis=1).transpose()
confMatrix.columns = ['G1', 'G2']
confMatrix

Unnamed: 0,G1,G2
Finished,2601,1155
Not finished,1232,387


In [37]:
# Chi-square test of independence on the G1/G2 table, with correction=False.
chi2, p_value, dof, ex = stats.chi2_contingency(observed=confMatrix, correction=False)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 25.9267429864
Significance 3.54622117446e-07


<h4>G2-G3 correlation</h4>

In [38]:
# 2x2 contingency table: rows = questionnaire outcome, columns = user group.
nFinalized = pd.Series(
    [completed_questionnaires_g2, completed_questionnaires_g3],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g2, uncompleted_questionnaires_g3],
    name='Not finished',
)
confMatrix2 = pd.concat([nFinalized, nNotFinalized], axis=1).transpose()
confMatrix2.columns = ['G2', 'G3']
confMatrix2

Unnamed: 0,G2,G3
Finished,1155,718
Not finished,387,285


In [39]:
# Chi-square test of independence on the G2/G3 table, with correction=False.
chi2, p_value, dof, ex = stats.chi2_contingency(observed=confMatrix2, correction=False)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 3.44177543362
Significance 0.0635673487709


<h4>G1-G3 correlation</h4>

In [40]:
# 2x2 contingency table: rows = questionnaire outcome, columns = user group.
nFinalized = pd.Series(
    [completed_questionnaires_g1, completed_questionnaires_g3],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1, uncompleted_questionnaires_g3],
    name='Not finished',
)
confMatrix3 = pd.concat([nFinalized, nNotFinalized], axis=1).transpose()
confMatrix3.columns = ['G1', 'G3']
confMatrix3

Unnamed: 0,G1,G3
Finished,2601,718
Not finished,1232,285


In [41]:
# Chi-square test of independence on the G1/G3 table, with correction=False.
chi2, p_value, dof, ex = stats.chi2_contingency(observed=confMatrix3, correction=False)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 5.12969488294
Significance 0.023519861893


<h3>Correlation between the rules application and the finalization rate</h3>

<h4>Rule number 1 (G1-G3)</h4>

In [42]:
# G1 users whose device data matches rule 1, split by questionnaire outcome.
_rule1 = g1["applied_rule"] == 1
_finished = g1["cuestionarioFinalizado"] == True
_unfinished = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r1 = len(g1[_rule1])
print("Number of G1 users (with rule 1 applied): " + str(total_started_questionnaires_g1_r1))

uncompleted_questionnaires_g1_r1 = len(g1[_rule1 & _unfinished])
print("Number of G1 users that have NOT finished the questionnaire (with rule 1 applied): " + str(uncompleted_questionnaires_g1_r1))

completed_questionnaires_g1_r1 = len(g1[_rule1 & _finished])
print("Number of G1 users that have finished the questionnaire (with rule 1 applied): " + str(completed_questionnaires_g1_r1))

# Percentage of the rule-1 subgroup that finished.
completion_rate_g1_r1 = 100 * float(completed_questionnaires_g1_r1) / total_started_questionnaires_g1_r1
print("\nCompletion rate of G1 users (with rule 1 applied): " + '% .2f' % completion_rate_g1_r1 + "%")

Number of G1 users (with rule 1 applied): 374
Number of G1 users that have NOT finished the questionnaire (with rule 1 applied): 123
Number of G1 users that have finished the questionnaire (with rule 1 applied): 251

Completion rate of G1 users (with rule 1 applied):  67.11%


In [43]:
# G3 users whose device data matches rule 1, split by questionnaire outcome.
_rule1 = g3["applied_rule"] == 1
_finished_after = g3["cuestionarioFinalizado_after_reinf"] == True
_unfinished_after = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r1 = len(g3[_rule1])
print("Number of G3 users (with rule 1 applied): " + str(total_started_questionnaires_g3_r1))

uncompleted_questionnaires_g3_r1 = len(g3[_rule1 & _unfinished_after])
print("Number of G3 users that have NOT finished the questionnaire (with rule 1 applied): " + str(uncompleted_questionnaires_g3_r1))

completed_questionnaires_g3_r1 = len(g3[_rule1 & _finished_after])
print("Number of G3 users that have finished the questionnaire (with rule 1 applied): " + str(completed_questionnaires_g3_r1))

# Percentage of the rule-1 subgroup that finished.
completion_rate_g3_r1 = 100 * float(completed_questionnaires_g3_r1) / total_started_questionnaires_g3_r1
print("\nCompletion rate of G3 users (with rule 1 applied): " + '% .2f' % completion_rate_g3_r1 + "%")

Number of G3 users (with rule 1 applied): 106
Number of G3 users that have NOT finished the questionnaire (with rule 1 applied): 29
Number of G3 users that have finished the questionnaire (with rule 1 applied): 77

Completion rate of G3 users (with rule 1 applied):  72.64%


In [44]:
# 2x2 contingency table for rule 1: rows = questionnaire outcome, columns = user group.
nFinalized = pd.Series(
    [completed_questionnaires_g1_r1, completed_questionnaires_g3_r1],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r1, uncompleted_questionnaires_g3_r1],
    name='Not finished',
)
confMatrixR1 = pd.concat([nFinalized, nNotFinalized], axis=1).transpose()
confMatrixR1.columns = ['G1_Rule1', 'G3_Rule1']
confMatrixR1

Unnamed: 0,G1_Rule1,G3_Rule1
Finished,251,77
Not finished,123,29


In [45]:
# Chi-square test of independence on the rule-1 table, with correction=False.
chi2, p_value, dof, ex = stats.chi2_contingency(observed=confMatrixR1, correction=False)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 1.16688333615
Significance 0.280042557643


<h4>Rule number 2 (G1-G3)</h4>

In [46]:
# G1 users whose device data matches rule 2, split by questionnaire outcome.
_rule2 = g1["applied_rule"] == 2
_finished = g1["cuestionarioFinalizado"] == True
_unfinished = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r2 = len(g1[_rule2])
print("Number of G1 users (with rule 2 applied): " + str(total_started_questionnaires_g1_r2))

uncompleted_questionnaires_g1_r2 = len(g1[_rule2 & _unfinished])
print("Number of G1 users that have NOT finished the questionnaire (with rule 2 applied): " + str(uncompleted_questionnaires_g1_r2))

completed_questionnaires_g1_r2 = len(g1[_rule2 & _finished])
print("Number of G1 users that have finished the questionnaire (with rule 2 applied): " + str(completed_questionnaires_g1_r2))

# Percentage of the rule-2 subgroup that finished.
completion_rate_g1_r2 = 100 * float(completed_questionnaires_g1_r2) / total_started_questionnaires_g1_r2
print("\nCompletion rate of G1 users (with rule 2 applied): " + '% .2f' % completion_rate_g1_r2 + "%")

Number of G1 users (with rule 2 applied): 362
Number of G1 users that have NOT finished the questionnaire (with rule 2 applied): 105
Number of G1 users that have finished the questionnaire (with rule 2 applied): 257

Completion rate of G1 users (with rule 2 applied):  70.99%


In [47]:
# G3 users whose device data matches rule 2, split by questionnaire outcome.
_rule2 = g3["applied_rule"] == 2
_finished_after = g3["cuestionarioFinalizado_after_reinf"] == True
_unfinished_after = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r2 = len(g3[_rule2])
print("Number of G3 users (with rule 2 applied): " + str(total_started_questionnaires_g3_r2))

uncompleted_questionnaires_g3_r2 = len(g3[_rule2 & _unfinished_after])
print("Number of G3 users that have NOT finished the questionnaire (with rule 2 applied): " + str(uncompleted_questionnaires_g3_r2))

completed_questionnaires_g3_r2 = len(g3[_rule2 & _finished_after])
print("Number of G3 users that have finished the questionnaire (with rule 2 applied): " + str(completed_questionnaires_g3_r2))

# Percentage of the rule-2 subgroup that finished.
completion_rate_g3_r2 = 100 * float(completed_questionnaires_g3_r2) / total_started_questionnaires_g3_r2
print("\nCompletion rate of G3 users (with rule 2 applied): " + '% .2f' % completion_rate_g3_r2 + "%")

Number of G3 users (with rule 2 applied): 126
Number of G3 users that have NOT finished the questionnaire (with rule 2 applied): 35
Number of G3 users that have finished the questionnaire (with rule 2 applied): 91

Completion rate of G3 users (with rule 2 applied):  72.22%


In [48]:
# 2x2 contingency table for rule 2: rows = questionnaire outcome, columns = user group.
nFinalized = pd.Series(
    [completed_questionnaires_g1_r2, completed_questionnaires_g3_r2],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r2, uncompleted_questionnaires_g3_r2],
    name='Not finished',
)
confMatrixR2 = pd.concat([nFinalized, nNotFinalized], axis=1).transpose()
confMatrixR2.columns = ['G1_Rule2', 'G3_Rule2']
confMatrixR2

Unnamed: 0,G1_Rule2,G3_Rule2
Finished,257,91
Not finished,105,35


In [49]:
# Chi-square test of independence on the rule-2 table, with correction=False.
chi2, p_value, dof, ex = stats.chi2_contingency(observed=confMatrixR2, correction=False)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 0.0688667329932
Significance 0.792994017496


<h4>Rule number 3 (G1-G3)</h4>

In [50]:
# G1 users whose device data matches rule 3, split by questionnaire outcome.
_rule3 = g1["applied_rule"] == 3
_finished = g1["cuestionarioFinalizado"] == True
_unfinished = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r3 = len(g1[_rule3])
print("Number of G1 users (with rule 3 applied): " + str(total_started_questionnaires_g1_r3))

uncompleted_questionnaires_g1_r3 = len(g1[_rule3 & _unfinished])
print("Number of G1 users that have NOT finished the questionnaire (with rule 3 applied): " + str(uncompleted_questionnaires_g1_r3))

completed_questionnaires_g1_r3 = len(g1[_rule3 & _finished])
print("Number of G1 users that have finished the questionnaire (with rule 3 applied): " + str(completed_questionnaires_g1_r3))

# Percentage of the rule-3 subgroup that finished.
completion_rate_g1_r3 = 100 * float(completed_questionnaires_g1_r3) / total_started_questionnaires_g1_r3
print("\nCompletion rate of G1 users (with rule 3 applied): " + '% .2f' % completion_rate_g1_r3 + "%")

Number of G1 users (with rule 3 applied): 51
Number of G1 users that have NOT finished the questionnaire (with rule 3 applied): 19
Number of G1 users that have finished the questionnaire (with rule 3 applied): 32

Completion rate of G1 users (with rule 3 applied):  62.75%


In [51]:
# G3, rule 3: count the rows with the rule applied, split by the
# "cuestionarioFinalizado_after_reinf" flag (True = finished, False = not finished).
rule3_in_g3 = g3["applied_rule"] == 3
finished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == True
unfinished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r3 = len(g3[rule3_in_g3])
uncompleted_questionnaires_g3_r3 = len(g3[rule3_in_g3 & unfinished_in_g3])
completed_questionnaires_g3_r3 = len(g3[rule3_in_g3 & finished_in_g3])

print("Number of G3 users (with rule 3 applied): " + str(total_started_questionnaires_g3_r3))
print("Number of G3 users that have NOT finished the questionnaire (with rule 3 applied): " + str(uncompleted_questionnaires_g3_r3))
print("Number of G3 users that have finished the questionnaire (with rule 3 applied): " + str(completed_questionnaires_g3_r3))

# Share of finished questionnaires among the rule 3 rows, as a percentage.
completion_rate_g3_r3 = 100 * float(completed_questionnaires_g3_r3) / total_started_questionnaires_g3_r3
print("\nCompletion rate of G3 users (with rule 3 applied): " + '% .2f' % completion_rate_g3_r3 + "%")

Number of G3 users (with rule 3 applied): 23
Number of G3 users that have NOT finished the questionnaire (with rule 3 applied): 10
Number of G3 users that have finished the questionnaire (with rule 3 applied): 13

Completion rate of G3 users (with rule 3 applied):  56.52%


In [52]:
# 2x2 contingency table for rule 3: rows are the outcome (Finished / Not finished),
# columns are the two groups being compared (G1, G3).
nFinalized = pd.Series(
    [completed_questionnaires_g1_r3, completed_questionnaires_g3_r3],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r3, uncompleted_questionnaires_g3_r3],
    name='Not finished',
)
confMatrixR3 = pd.DataFrame(
    [nFinalized.tolist(), nNotFinalized.tolist()],
    index=[nFinalized.name, nNotFinalized.name],
    columns=['G1_Rule3', 'G3_Rule3'],
)
confMatrixR3

Unnamed: 0,G1_Rule3,G3_Rule3
Finished,32,13
Not finished,19,10


In [53]:
# Pearson chi-square test on the rule 3 contingency table (G1 vs. G3).
# correction=False turns off Yates' continuity correction.
chi2, p_value, dof, ex = stats.chi2_contingency(
    observed=confMatrixR3,
    correction=False,
)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 0.257613676822
Significance 0.611764350995


<h4>Rule number 4 (G1-G3)</h4>

In [54]:
# G1, rule 4: count the rows with the rule applied, split by the
# "cuestionarioFinalizado" flag (True = finished, False = not finished).
rule4_in_g1 = g1["applied_rule"] == 4
finished_in_g1 = g1["cuestionarioFinalizado"] == True
unfinished_in_g1 = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r4 = len(g1[rule4_in_g1])
uncompleted_questionnaires_g1_r4 = len(g1[rule4_in_g1 & unfinished_in_g1])
completed_questionnaires_g1_r4 = len(g1[rule4_in_g1 & finished_in_g1])

print("Number of G1 users (with rule 4 applied): " + str(total_started_questionnaires_g1_r4))
print("Number of G1 users that have NOT finished the questionnaire (with rule 4 applied): " + str(uncompleted_questionnaires_g1_r4))
print("Number of G1 users that have finished the questionnaire (with rule 4 applied): " + str(completed_questionnaires_g1_r4))

# Share of finished questionnaires among the rule 4 rows, as a percentage.
completion_rate_g1_r4 = 100 * float(completed_questionnaires_g1_r4) / total_started_questionnaires_g1_r4
print("\nCompletion rate of G1 users (with rule 4 applied): " + '% .2f' % completion_rate_g1_r4 + "%")

Number of G1 users (with rule 4 applied): 2196
Number of G1 users that have NOT finished the questionnaire (with rule 4 applied): 702
Number of G1 users that have finished the questionnaire (with rule 4 applied): 1494

Completion rate of G1 users (with rule 4 applied):  68.03%


In [55]:
# G3, rule 4: count the rows with the rule applied, split by the
# "cuestionarioFinalizado_after_reinf" flag (True = finished, False = not finished).
rule4_in_g3 = g3["applied_rule"] == 4
finished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == True
unfinished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r4 = len(g3[rule4_in_g3])
uncompleted_questionnaires_g3_r4 = len(g3[rule4_in_g3 & unfinished_in_g3])
completed_questionnaires_g3_r4 = len(g3[rule4_in_g3 & finished_in_g3])

print("Number of G3 users (with rule 4 applied): " + str(total_started_questionnaires_g3_r4))
print("Number of G3 users that have NOT finished the questionnaire (with rule 4 applied): " + str(uncompleted_questionnaires_g3_r4))
# The label previously said "rule 3" (copy-paste slip); this cell reports rule 4.
print("Number of G3 users that have finished the questionnaire (with rule 4 applied): " + str(completed_questionnaires_g3_r4))

# Share of finished questionnaires among the rule 4 rows, as a percentage.
completion_rate_g3_r4 = 100 * float(completed_questionnaires_g3_r4) / total_started_questionnaires_g3_r4
print("\nCompletion rate of G3 users (with rule 4 applied): " + '% .2f' % completion_rate_g3_r4 + "%")

Number of G3 users (with rule 4 applied): 421
Number of G3 users that have NOT finished the questionnaire (with rule 4 applied): 100
Number of G3 users that have finished the questionnaire (with rule 3 applied): 321

Completion rate of G3 users (with rule 4 applied):  76.25%


In [56]:
# 2x2 contingency table for rule 4: rows are the outcome (Finished / Not finished),
# columns are the two groups being compared (G1, G3).
nFinalized = pd.Series(
    [completed_questionnaires_g1_r4, completed_questionnaires_g3_r4],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r4, uncompleted_questionnaires_g3_r4],
    name='Not finished',
)
confMatrixR4 = pd.DataFrame(
    [nFinalized.tolist(), nNotFinalized.tolist()],
    index=[nFinalized.name, nNotFinalized.name],
    columns=['G1_Rule4', 'G3_Rule4'],
)
confMatrixR4

Unnamed: 0,G1_Rule4,G3_Rule4
Finished,1494,321
Not finished,702,100


In [57]:
# Pearson chi-square test on the rule 4 contingency table (G1 vs. G3).
# correction=False turns off Yates' continuity correction.
chi2, p_value, dof, ex = stats.chi2_contingency(
    observed=confMatrixR4,
    correction=False,
)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 11.2150794442
Significance 0.000811353363034


<h4>Rule number 5 (G1-G3)</h4>

In [58]:
# G1, rule 5: count the rows with the rule applied, split by the
# "cuestionarioFinalizado" flag (True = finished, False = not finished).
rule5_in_g1 = g1["applied_rule"] == 5
finished_in_g1 = g1["cuestionarioFinalizado"] == True
unfinished_in_g1 = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r5 = len(g1[rule5_in_g1])
uncompleted_questionnaires_g1_r5 = len(g1[rule5_in_g1 & unfinished_in_g1])
completed_questionnaires_g1_r5 = len(g1[rule5_in_g1 & finished_in_g1])

print("Number of G1 users (with rule 5 applied): " + str(total_started_questionnaires_g1_r5))
print("Number of G1 users that have NOT finished the questionnaire (with rule 5 applied): " + str(uncompleted_questionnaires_g1_r5))
print("Number of G1 users that have finished the questionnaire (with rule 5 applied): " + str(completed_questionnaires_g1_r5))

# Share of finished questionnaires among the rule 5 rows, as a percentage.
completion_rate_g1_r5 = 100 * float(completed_questionnaires_g1_r5) / total_started_questionnaires_g1_r5
print("\nCompletion rate of G1 users (with rule 5 applied): " + '% .2f' % completion_rate_g1_r5 + "%")

Number of G1 users (with rule 5 applied): 325
Number of G1 users that have NOT finished the questionnaire (with rule 5 applied): 92
Number of G1 users that have finished the questionnaire (with rule 5 applied): 233

Completion rate of G1 users (with rule 5 applied):  71.69%


In [59]:
# G3, rule 5: count the rows with the rule applied, split by the
# "cuestionarioFinalizado_after_reinf" flag (True = finished, False = not finished).
rule5_in_g3 = g3["applied_rule"] == 5
finished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == True
unfinished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r5 = len(g3[rule5_in_g3])
uncompleted_questionnaires_g3_r5 = len(g3[rule5_in_g3 & unfinished_in_g3])
completed_questionnaires_g3_r5 = len(g3[rule5_in_g3 & finished_in_g3])

print("Number of G3 users (with rule 5 applied): " + str(total_started_questionnaires_g3_r5))
print("Number of G3 users that have NOT finished the questionnaire (with rule 5 applied): " + str(uncompleted_questionnaires_g3_r5))
print("Number of G3 users that have finished the questionnaire (with rule 5 applied): " + str(completed_questionnaires_g3_r5))

# Share of finished questionnaires among the rule 5 rows, as a percentage.
completion_rate_g3_r5 = 100 * float(completed_questionnaires_g3_r5) / total_started_questionnaires_g3_r5
print("\nCompletion rate of G3 users (with rule 5 applied): " + '% .2f' % completion_rate_g3_r5 + "%")

Number of G3 users (with rule 5 applied): 49
Number of G3 users that have NOT finished the questionnaire (with rule 5 applied): 13
Number of G3 users that have finished the questionnaire (with rule 5 applied): 36

Completion rate of G3 users (with rule 5 applied):  73.47%


In [60]:
# 2x2 contingency table for rule 5: rows are the outcome (Finished / Not finished),
# columns are the two groups being compared (G1, G3).
nFinalized = pd.Series(
    [completed_questionnaires_g1_r5, completed_questionnaires_g3_r5],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r5, uncompleted_questionnaires_g3_r5],
    name='Not finished',
)
confMatrixR5 = pd.DataFrame(
    [nFinalized.tolist(), nNotFinalized.tolist()],
    index=[nFinalized.name, nNotFinalized.name],
    columns=['G1_Rule5', 'G3_Rule5'],
)
confMatrixR5

Unnamed: 0,G1_Rule5,G3_Rule5
Finished,233,36
Not finished,92,13


In [61]:
# Pearson chi-square test on the rule 5 contingency table (G1 vs. G3).
# correction=False turns off Yates' continuity correction.
chi2, p_value, dof, ex = stats.chi2_contingency(
    observed=confMatrixR5,
    correction=False,
)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 0.0665922138454
Significance 0.796364713353


<h4>Rule number 6 (G1-G3)</h4><br>
There are no users with rule 6 applied in G1 and G3.

<h4>Rule number 7 (G1-G3)</h4>

In [62]:
# G1, rule 7: count the rows with the rule applied, split by the
# "cuestionarioFinalizado" flag (True = finished, False = not finished).
rule7_in_g1 = g1["applied_rule"] == 7
finished_in_g1 = g1["cuestionarioFinalizado"] == True
unfinished_in_g1 = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r7 = len(g1[rule7_in_g1])
uncompleted_questionnaires_g1_r7 = len(g1[rule7_in_g1 & unfinished_in_g1])
completed_questionnaires_g1_r7 = len(g1[rule7_in_g1 & finished_in_g1])

print("Number of G1 users (with rule 7 applied): " + str(total_started_questionnaires_g1_r7))
print("Number of G1 users that have NOT finished the questionnaire (with rule 7 applied): " + str(uncompleted_questionnaires_g1_r7))
print("Number of G1 users that have finished the questionnaire (with rule 7 applied): " + str(completed_questionnaires_g1_r7))

# Share of finished questionnaires among the rule 7 rows, as a percentage.
completion_rate_g1_r7 = 100 * float(completed_questionnaires_g1_r7) / total_started_questionnaires_g1_r7
print("\nCompletion rate of G1 users (with rule 7 applied): " + '% .2f' % completion_rate_g1_r7 + "%")

Number of G1 users (with rule 7 applied): 73
Number of G1 users that have NOT finished the questionnaire (with rule 7 applied): 24
Number of G1 users that have finished the questionnaire (with rule 7 applied): 49

Completion rate of G1 users (with rule 7 applied):  67.12%


In [63]:
# G3, rule 7: count the rows with the rule applied, split by the
# "cuestionarioFinalizado_after_reinf" flag (True = finished, False = not finished).
rule7_in_g3 = g3["applied_rule"] == 7
finished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == True
unfinished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r7 = len(g3[rule7_in_g3])
uncompleted_questionnaires_g3_r7 = len(g3[rule7_in_g3 & unfinished_in_g3])
completed_questionnaires_g3_r7 = len(g3[rule7_in_g3 & finished_in_g3])

print("Number of G3 users (with rule 7 applied): " + str(total_started_questionnaires_g3_r7))
print("Number of G3 users that have NOT finished the questionnaire (with rule 7 applied): " + str(uncompleted_questionnaires_g3_r7))
print("Number of G3 users that have finished the questionnaire (with rule 7 applied): " + str(completed_questionnaires_g3_r7))

# Share of finished questionnaires among the rule 7 rows, as a percentage.
completion_rate_g3_r7 = 100 * float(completed_questionnaires_g3_r7) / total_started_questionnaires_g3_r7
print("\nCompletion rate of G3 users (with rule 7 applied): " + '% .2f' % completion_rate_g3_r7 + "%")

Number of G3 users (with rule 7 applied): 27
Number of G3 users that have NOT finished the questionnaire (with rule 7 applied): 7
Number of G3 users that have finished the questionnaire (with rule 7 applied): 20

Completion rate of G3 users (with rule 7 applied):  74.07%


In [64]:
# 2x2 contingency table for rule 7: rows are the outcome (Finished / Not finished),
# columns are the two groups being compared (G1, G3).
nFinalized = pd.Series(
    [completed_questionnaires_g1_r7, completed_questionnaires_g3_r7],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r7, uncompleted_questionnaires_g3_r7],
    name='Not finished',
)
confMatrixR7 = pd.DataFrame(
    [nFinalized.tolist(), nNotFinalized.tolist()],
    index=[nFinalized.name, nNotFinalized.name],
    columns=['G1_Rule7', 'G3_Rule7'],
)
confMatrixR7

Unnamed: 0,G1_Rule7,G3_Rule7
Finished,49,20
Not finished,24,7


In [65]:
# Pearson chi-square test on the rule 7 contingency table (G1 vs. G3).
# correction=False turns off Yates' continuity correction.
chi2, p_value, dof, ex = stats.chi2_contingency(
    observed=confMatrixR7,
    correction=False,
)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 0.445188282931
Significance 0.504628863756


<h4>Rule number 8 (G1-G3)</h4>

In [66]:
# G1, rule 8: count the rows with the rule applied, split by the
# "cuestionarioFinalizado" flag (True = finished, False = not finished).
rule8_in_g1 = g1["applied_rule"] == 8
finished_in_g1 = g1["cuestionarioFinalizado"] == True
unfinished_in_g1 = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r8 = len(g1[rule8_in_g1])
uncompleted_questionnaires_g1_r8 = len(g1[rule8_in_g1 & unfinished_in_g1])
completed_questionnaires_g1_r8 = len(g1[rule8_in_g1 & finished_in_g1])

print("Number of G1 users (with rule 8 applied): " + str(total_started_questionnaires_g1_r8))
print("Number of G1 users that have NOT finished the questionnaire (with rule 8 applied): " + str(uncompleted_questionnaires_g1_r8))
print("Number of G1 users that have finished the questionnaire (with rule 8 applied): " + str(completed_questionnaires_g1_r8))

# Share of finished questionnaires among the rule 8 rows, as a percentage.
completion_rate_g1_r8 = 100 * float(completed_questionnaires_g1_r8) / total_started_questionnaires_g1_r8
print("\nCompletion rate of G1 users (with rule 8 applied): " + '% .2f' % completion_rate_g1_r8 + "%")

Number of G1 users (with rule 8 applied): 25
Number of G1 users that have NOT finished the questionnaire (with rule 8 applied): 7
Number of G1 users that have finished the questionnaire (with rule 8 applied): 18

Completion rate of G1 users (with rule 8 applied):  72.00%


In [67]:
# G3, rule 8: count the rows with the rule applied, split by the
# "cuestionarioFinalizado_after_reinf" flag (True = finished, False = not finished).
rule8_in_g3 = g3["applied_rule"] == 8
finished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == True
unfinished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r8 = len(g3[rule8_in_g3])
uncompleted_questionnaires_g3_r8 = len(g3[rule8_in_g3 & unfinished_in_g3])
completed_questionnaires_g3_r8 = len(g3[rule8_in_g3 & finished_in_g3])

print("Number of G3 users (with rule 8 applied): " + str(total_started_questionnaires_g3_r8))
print("Number of G3 users that have NOT finished the questionnaire (with rule 8 applied): " + str(uncompleted_questionnaires_g3_r8))
print("Number of G3 users that have finished the questionnaire (with rule 8 applied): " + str(completed_questionnaires_g3_r8))

# Share of finished questionnaires among the rule 8 rows, as a percentage.
completion_rate_g3_r8 = 100 * float(completed_questionnaires_g3_r8) / total_started_questionnaires_g3_r8
print("\nCompletion rate of G3 users (with rule 8 applied): " + '% .2f' % completion_rate_g3_r8 + "%")

Number of G3 users (with rule 8 applied): 10
Number of G3 users that have NOT finished the questionnaire (with rule 8 applied): 2
Number of G3 users that have finished the questionnaire (with rule 8 applied): 8

Completion rate of G3 users (with rule 8 applied):  80.00%


In [68]:
# 2x2 contingency table for rule 8: rows are the outcome (Finished / Not finished),
# columns are the two groups being compared (G1, G3).
nFinalized = pd.Series(
    [completed_questionnaires_g1_r8, completed_questionnaires_g3_r8],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r8, uncompleted_questionnaires_g3_r8],
    name='Not finished',
)
confMatrixR8 = pd.DataFrame(
    [nFinalized.tolist(), nNotFinalized.tolist()],
    index=[nFinalized.name, nNotFinalized.name],
    columns=['G1_Rule8', 'G3_Rule8'],
)
confMatrixR8

Unnamed: 0,G1_Rule8,G3_Rule8
Finished,18,8
Not finished,7,2


In [69]:
# Fisher's exact test on the rule 8 contingency table (G1 vs. G3).
# NOTE(review): this is a one-sided test (alternative="less"), whereas the
# chi-square tests used for the other rules are two-sided; presumably chosen
# because of the small counts for this rule. Confirm the direction is intended.
oddsratio, p_value = stats.fisher_exact(
    table=confMatrixR8,
    alternative="less",
)
print("Odds ratio: " + str(oddsratio))
print("Fisher's exact test significance: " + str(p_value))

Odds ratio: 0.642857142857
Fisher's exact test significance: 0.488478483718


<h4>Rule number 9 (G1-G3)</h4>

In [70]:
# G1, rule 9: count the rows with the rule applied, split by the
# "cuestionarioFinalizado" flag (True = finished, False = not finished).
rule9_in_g1 = g1["applied_rule"] == 9
finished_in_g1 = g1["cuestionarioFinalizado"] == True
unfinished_in_g1 = g1["cuestionarioFinalizado"] == False

total_started_questionnaires_g1_r9 = len(g1[rule9_in_g1])
uncompleted_questionnaires_g1_r9 = len(g1[rule9_in_g1 & unfinished_in_g1])
completed_questionnaires_g1_r9 = len(g1[rule9_in_g1 & finished_in_g1])

print("Number of G1 users (with rule 9 applied): " + str(total_started_questionnaires_g1_r9))
print("Number of G1 users that have NOT finished the questionnaire (with rule 9 applied): " + str(uncompleted_questionnaires_g1_r9))
print("Number of G1 users that have finished the questionnaire (with rule 9 applied): " + str(completed_questionnaires_g1_r9))

# Share of finished questionnaires among the rule 9 rows, as a percentage.
completion_rate_g1_r9 = 100 * float(completed_questionnaires_g1_r9) / total_started_questionnaires_g1_r9
print("\nCompletion rate of G1 users (with rule 9 applied): " + '% .2f' % completion_rate_g1_r9 + "%")

Number of G1 users (with rule 9 applied): 427
Number of G1 users that have NOT finished the questionnaire (with rule 9 applied): 160
Number of G1 users that have finished the questionnaire (with rule 9 applied): 267

Completion rate of G1 users (with rule 9 applied):  62.53%


In [71]:
# G3, rule 9: count the rows with the rule applied, split by the
# "cuestionarioFinalizado_after_reinf" flag (True = finished, False = not finished).
rule9_in_g3 = g3["applied_rule"] == 9
finished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == True
unfinished_in_g3 = g3["cuestionarioFinalizado_after_reinf"] == False

total_started_questionnaires_g3_r9 = len(g3[rule9_in_g3])
uncompleted_questionnaires_g3_r9 = len(g3[rule9_in_g3 & unfinished_in_g3])
completed_questionnaires_g3_r9 = len(g3[rule9_in_g3 & finished_in_g3])

print("Number of G3 users (with rule 9 applied): " + str(total_started_questionnaires_g3_r9))
print("Number of G3 users that have NOT finished the questionnaire (with rule 9 applied): " + str(uncompleted_questionnaires_g3_r9))
print("Number of G3 users that have finished the questionnaire (with rule 9 applied): " + str(completed_questionnaires_g3_r9))

# Share of finished questionnaires among the rule 9 rows, as a percentage.
completion_rate_g3_r9 = 100 * float(completed_questionnaires_g3_r9) / total_started_questionnaires_g3_r9
print("\nCompletion rate of G3 users (with rule 9 applied): " + '% .2f' % completion_rate_g3_r9 + "%")

Number of G3 users (with rule 9 applied): 241
Number of G3 users that have NOT finished the questionnaire (with rule 9 applied): 89
Number of G3 users that have finished the questionnaire (with rule 9 applied): 152

Completion rate of G3 users (with rule 9 applied):  63.07%


In [72]:
# 2x2 contingency table for rule 9: rows are the outcome (Finished / Not finished),
# columns are the two groups being compared (G1, G3).
nFinalized = pd.Series(
    [completed_questionnaires_g1_r9, completed_questionnaires_g3_r9],
    name='Finished',
)
nNotFinalized = pd.Series(
    [uncompleted_questionnaires_g1_r9, uncompleted_questionnaires_g3_r9],
    name='Not finished',
)
confMatrixR9 = pd.DataFrame(
    [nFinalized.tolist(), nNotFinalized.tolist()],
    index=[nFinalized.name, nNotFinalized.name],
    columns=['G1_Rule9', 'G3_Rule9'],
)
confMatrixR9

Unnamed: 0,G1_Rule9,G3_Rule9
Finished,267,152
Not finished,160,89


In [73]:
# Pearson chi-square test on the rule 9 contingency table (G1 vs. G3).
# correction=False turns off Yates' continuity correction.
chi2, p_value, dof, ex = stats.chi2_contingency(
    observed=confMatrixR9,
    correction=False,
)
print("Pearson's Chi-square: " + str(chi2))
print("Significance " + str(p_value))

Pearson's Chi-square: 0.0193031668756
Significance 0.88950085155
