In [1]:
import pandas as pd
from scipy import stats

In [2]:
# Load the training table; row 0 of the CSV supplies the column names.
df = pd.read_csv(filepath_or_buffer='data/train.csv', header=0)
# Displayed as (n_rows, n_columns).
df.shape

(14575, 59)

In [3]:
# Remove the 'persistence_exists' column. Rebinding `df` (rather than using
# inplace=True) together with errors='ignore' keeps this cell safe to re-run
# when the column has already been removed.
df = df.drop(columns=['persistence_exists'], errors='ignore')

In [4]:
# Count distinct (user, page_id) pairs for rows labelled 1 and for all other rows.
expert_pairs = df.loc[df.label == 1].drop_duplicates(subset=['user', 'page_id'])
other_pairs = df.loc[df.label != 1].drop_duplicates(subset=['user', 'page_id'])
print('experts: {}'.format(expert_pairs.shape[0]))
print('non-experts: {}'.format(other_pairs.shape[0]))

experts: 625
non-experts: 13950


In [5]:
# List the distinct values of the 'category' column, in order of first appearance.
print(df.category.unique())

['Culture' 'Language' 'Environment' 'Concepts' 'Humanities' 'Society'
 'Science' 'Agriculture' 'Nature' 'Arts' 'Life' 'Mathematics' 'Law'
 'Education' 'Geography' 'History' 'Business' 'Technology' 'Health'
 'Politics' 'Medicine' 'People' 'Humans' 'Sports' 'Chronology']


In [6]:
def get_ttest_stats(feature, data=None):
    """Compare one feature between experts and all other rows with a t-test.

    Rows with ``label == 1`` form the expert sample and every remaining row
    (``label != 1``) forms the comparison sample. This is the same split the
    notebook uses when counting the two groups and when computing group means.

    Parameters
    ----------
    feature : str
        Name of the column to test.
    data : pandas.DataFrame, optional
        Frame containing a ``label`` column and the ``feature`` column.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    list
        ``[feature, t_statistic, p_value]`` as returned by
        ``scipy.stats.ttest_ind`` with ``equal_var=False`` (Welch's t-test,
        which does not assume equal variances in the two samples).
    """
    if data is None:
        # Fall back to the notebook-level frame so existing one-argument calls keep working.
        data = df

    expert_values = data.loc[data['label'] == 1, feature]
    non_expert_values = data.loc[data['label'] != 1, feature]

    ttest_result = stats.ttest_ind(expert_values, non_expert_values, equal_var=False)
    return [feature, ttest_result.statistic, ttest_result.pvalue]

In [7]:
# Columns excluded from the per-feature t-tests.
drop_list = ['page_id',
             'page',
             'category',
             'user',
             'label']

ndf = df.drop(columns=drop_list)

# Build one [feature, t_statistic, p_value] row for every remaining column.
columns = ['feature', 't_statistic', 'p_value']
ttest_rows = [get_ttest_stats(col) for col in ndf.columns]

# Persist the full table to CSV, then print it.
ttest_df = pd.DataFrame(ttest_rows, columns=columns)
ttest_df.to_csv(r'data/features_ttest.csv', index=False)
print(ttest_df)

                         feature  t_statistic       p_value
0                     page_edits     3.978839  7.733600e-05
1               page_edits_ratio     5.538000  4.487157e-08
2                page_talk_edits     8.148673  1.892507e-15
3                 edit_period_q1     1.245976  2.132032e-01
4                 edit_period_q2    -2.626051  8.830619e-03
5                 edit_period_q3     1.706349  8.839896e-02
6                 edit_period_q4     0.071068  9.433644e-01
7            mean_edit_frequency     3.410096  6.887815e-04
8                 mean_edit_size     0.467861  6.400373e-01
9               median_edit_size     0.371045  7.107217e-01
10                   edit_type_a     0.299517  7.646361e-01
11                   edit_type_b     4.339038  1.647369e-05
12                   edit_type_c     8.746750  1.637963e-17
13                   edit_type_d     1.289729  1.976046e-01
14                   edit_type_e     1.009700  3.129972e-01
15                   edit_type_f    -0.6

In [8]:
# Maps a feature column name to the human-readable description used in the output table.
features = {'summary_similarity': 'Wikipedia activity, Main Namespace – summary similarity',
            'links_overlap': 'Wikipedia activity, Main Namespace- hyperlink overlap',
            'page_edits_ratio': 'Focal page activity – ratio of edits out of total edits',
            'title_similarity': 'Wikipedia activity, Main Namespace –title similarity',
            'avg_persistence': 'Focal page activity - average persistence of contributed tokens',
            'edit_type_c': 'Focal page activity - total \'Wiki MarkUp\' edits',
            'page_edits': 'Focal page activity - total edits',
            'edit_type_g': 'Focal page activity - total \'Reorganize Content\' edits',
            'edit_type_b': 'Focal page activity - total \'Add Content\' edits',
            'total_edited_pages': 'Wikipedia activity - total edits in all namespaces',
            'page_edit_dist': 'Wikipedia activity, Main Namespace – entropy of edit quantity',
            'edit_type_j': 'Focal page activity - total \'Fix Vandalism\' edits'}

means_cols = ['Feature', 'SME mean', '"Unknown" mean']

# Split once: rows labelled 1 ("SME") versus every other row ("Unknown").
sme_rows = df[df.label == 1]
unknown_rows = df[df.label != 1]

# One [description, SME mean, "Unknown" mean] row per selected feature.
means = []
for key, descr in features.items():
    means.append([descr, sme_rows[key].mean(), unknown_rows[key].mean()])

# Save the comparison table to CSV and print it.
means_df = pd.DataFrame(means, columns=means_cols)
means_df.to_csv(r'data/features_means.csv', index=False)
print(means_df)

                                              Feature  SME mean  \
0   Wikipedia activity, Main Namespace – summary s... -0.005774   
1   Wikipedia activity, Main Namespace- hyperlink ...  0.017961   
2   Focal page activity – ratio of edits out of to...  0.011206   
3   Wikipedia activity, Main Namespace –title simi... -0.005924   
4   Focal page activity - average persistence of c...  0.048998   
5     Focal page activity - total 'Wiki MarkUp' edits  0.588611   
6                   Focal page activity - total edits  0.007676   
7   Focal page activity - total 'Reorganize Conten...  0.075640   
8     Focal page activity - total 'Add Content' edits  0.319961   
9   Wikipedia activity - total edits in all namesp... -0.035327   
10  Wikipedia activity, Main Namespace – entropy o...  0.435322   
11  Focal page activity - total 'Fix Vandalism' edits  0.116221   

    "Unknown" mean  
0        -0.005519  
1         0.013212  
2         0.002992  
3        -0.005770  
4         0.025314  
5 

In [31]:
# Bonferroni-correct the per-feature t-test p-values at alpha = 0.05.
# multipletests is imported from statsmodels.stats.multitest, its documented
# public location, rather than from the statsmodels.sandbox namespace.
from statsmodels.stats.multitest import multipletests

pvals = ttest_df['p_value']
# p_adjusted[0] is the boolean "reject the null" mask and p_adjusted[1] holds
# the corrected p-values, both in the same order as ttest_df's rows.
p_adjusted = multipletests(pvals, alpha=0.05, method='bonferroni')
print(p_adjusted[0])
print(p_adjusted[1])

[ True  True  True False False False False  True False False False  True
  True False False False  True False  True  True False False False  True
 False  True  True  True  True False  True  True False  True  True False
  True  True False False False  True  True  True False False False False
 False False  True  True  True]
[4.09880787e-03 2.37819337e-06 1.00302857e-13 1.00000000e+00
 4.68022809e-01 1.00000000e+00 1.00000000e+00 3.65054172e-02
 1.00000000e+00 1.00000000e+00 1.00000000e+00 8.73105564e-04
 8.68120451e-16 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.12584413e-02 1.00000000e+00 2.87640331e-08 1.57568811e-24
 1.00000000e+00 1.00000000e+00 1.00000000e+00 3.74793267e-03
 1.00000000e+00 5.44511953e-12 1.72949790e-22 2.93711318e-04
 5.25130288e-41 6.09599628e-02 4.23449682e-21 7.31168474e-04
 5.52780118e-02 2.04092097e-31 2.26939319e-05 1.00000000e+00
 2.06503005e-02 2.14299613e-02 1.00000000e+00 1.00000000e+00
 1.00000000e+00 3.04955747e-06 8.60871381e-03 7.57411360e-08
 1.32

In [18]:
# Start the Bonferroni results table as a renamed copy of the t-test table, so the
# raw t-test columns carry a 'ttest_' prefix and ttest_df itself is left unchanged.
bonferonni_df = ttest_df.rename(
    columns={'t_statistic': 'ttest_t_statistic', 'p_value': 'ttest_p_value'}
)
bonferonni_df.head()

Unnamed: 0,feature,ttest_t_statistic,ttest_p_value
0,page_edits,3.978839,7.7336e-05
1,page_edits_ratio,5.538,4.487157e-08
2,page_talk_edits,8.148673,1.892507e-15
3,edit_period_q1,1.245976,0.2132032
4,edit_period_q2,-2.626051,0.008830619


In [22]:
# Append the Bonferroni reject flags (p_adjusted[0]) and corrected p-values
# (p_adjusted[1]) as new columns, save the table to CSV, then display it.
bonferonni_df = bonferonni_df.assign(
    bonf_result=p_adjusted[0],
    bonf_p_value=p_adjusted[1],
)
bonferonni_df.to_csv(r'data/features_bonf.csv', index=False)
bonferonni_df

Unnamed: 0,feature,ttest_t_statistic,ttest_p_value,bonf_result,bonf_p_value
0,page_edits,3.978839,7.7336e-05,True,0.004098808
1,page_edits_ratio,5.538,4.487157e-08,True,2.378193e-06
2,page_talk_edits,8.148673,1.892507e-15,True,1.003029e-13
3,edit_period_q1,1.245976,0.2132032,False,1.0
4,edit_period_q2,-2.626051,0.008830619,False,0.4680228
5,edit_period_q3,1.706349,0.08839896,False,1.0
6,edit_period_q4,0.071068,0.9433644,False,1.0
7,mean_edit_frequency,3.410096,0.0006887815,True,0.03650542
8,mean_edit_size,0.467861,0.6400373,False,1.0
9,median_edit_size,0.371045,0.7107217,False,1.0


In [30]:
# Features whose raw t-test p-value is below 0.05 but which are not rejected
# after the Bonferroni correction.
bonferonni_df.loc[
    bonferonni_df['ttest_p_value'].lt(0.05) & ~bonferonni_df['bonf_result']
]

Unnamed: 0,feature,ttest_t_statistic,ttest_p_value,bonf_result,bonf_p_value
4,edit_period_q2,-2.626051,0.008831,False,0.468023
24,gender,2.34532,0.019305,False,1.0
29,ns4_edit_dist,3.265355,0.00115,False,0.06096
32,ns7_edit_dist,-3.285489,0.001043,False,0.055278
44,content_token_count,3.034981,0.002505,False,0.132775
47,content_token_vs_token,2.102447,0.035868,False,1.0
