In [1]:
import pandas as pd
from scipy import stats

In [2]:
# read json file
data = pd.read_json('searches.json', orient='records', lines=True)
data.dtypes

uid              int64
is_instructor     bool
login_count      int64
search_count     int64
dtype: object

In [3]:
# seperate two search features
old_search = data[data['uid'] % 2 == 0]
new_search = data[data['uid'] % 2 == 1]

In [4]:
old_search

Unnamed: 0,uid,is_instructor,login_count,search_count
3,9106912,True,3,0
5,5333200,False,1,0
11,52868,False,2,4
15,2815244,False,4,0
17,5651994,False,1,0
...,...,...,...,...
665,14273796,True,12,0
667,6448802,True,1,1
675,16412676,True,4,0
676,16768212,False,2,0


In [5]:
new_search

Unnamed: 0,uid,is_instructor,login_count,search_count
0,6061521,True,1,2
1,11986457,False,4,0
2,15995765,False,1,0
4,9882383,False,1,0
6,3583107,False,2,0
...,...,...,...,...
673,2849545,False,4,0
674,6810415,False,2,0
677,7643715,True,1,0
678,14838641,False,1,0


In [6]:
# do a non-paramatic test: to check if more people search with the new design
more_users = stats.mannwhitneyu(new_search['login_count'], old_search['login_count'])
more_users

MannwhitneyuResult(statistic=60991.0, pvalue=0.21486368870295647)

In [7]:
# seperate zero and non-zero search counts from two search (old and new) features
old_zeros = old_search[old_search['search_count'] == 0]['uid'].count()
old_nonzs = old_search[old_search['search_count'] > 0]['uid'].count()
new_zeros = new_search[new_search['search_count'] == 0]['uid'].count()
new_nonzs = new_search[new_search['search_count'] > 0]['uid'].count()

In [8]:
# create contingency table 
contingency = [[old_zeros, old_nonzs],
               [new_zeros, new_nonzs]]
contingency

[[222, 111], [250, 98]]

In [9]:
# do a test: to check if people more search with the new design
chi2, more_searches_p, dof, expected = stats.chi2_contingency(contingency)
more_searches_p

0.1676297094499566

In [10]:
# seperate instructors 
old_inst = old_search[old_search['is_instructor'] == True]
new_inst = new_search[new_search['is_instructor'] == True]

In [11]:
old_inst

Unnamed: 0,uid,is_instructor,login_count,search_count
3,9106912,True,3,0
21,3378724,True,1,0
26,8905420,True,4,0
33,13479688,True,3,0
34,13982326,True,1,0
...,...,...,...,...
661,6580404,True,3,3
662,1108670,True,2,9
665,14273796,True,12,0
667,6448802,True,1,1


In [12]:
new_inst

Unnamed: 0,uid,is_instructor,login_count,search_count
0,6061521,True,1,2
7,11760157,True,1,0
13,12986377,True,14,0
14,9792541,True,1,0
19,14915463,True,7,0
...,...,...,...,...
637,3013413,True,5,0
640,16619457,True,1,0
668,8698521,True,3,0
671,6954951,True,2,0


In [13]:
# do a non-paramatic test: to check if more instructors search with the new design
more_instr = stats.mannwhitneyu(new_inst['login_count'], old_inst['login_count'])
more_instr

MannwhitneyuResult(statistic=6964.0, pvalue=0.8989465705117781)

In [14]:
# seperate zero and non-zero instructors search
old_inst_zeros = old_inst[old_inst['search_count'] == 0]['uid'].count()
old_inst_nonzs = old_inst[old_inst['search_count'] > 0]['uid'].count()
new_inst_zeros = new_inst[new_inst['search_count'] == 0]['uid'].count()
new_inst_nonzs = new_inst[new_inst['search_count'] > 0]['uid'].count()

In [15]:
# create contingency table for instructors
contingency_inst = [[old_inst_zeros, old_inst_nonzs],
               [new_inst_zeros, new_inst_nonzs]]
contingency_inst

[[70, 50], [82, 33]]

In [16]:
# do a test: to check if instructors made more search with the new design
chi2_inst, more_instr_searches_p, dof_inst, expected_inst = stats.chi2_contingency(contingency_inst)
more_instr_searches_p

0.052001632770999166