In [1]:
from scipy import stats
import pandas as pd
import numpy as np

## Train data analysis

In [2]:
# Load the training set; pandas takes column names from the first CSV row by default.
df = pd.read_csv('data/new_train_data.csv')
df.shape

(1020, 59)

In [3]:
# Count distinct (user, page_id) pairs per class; every label other than 1 is a non-expert.
pair_key = ['user', 'page_id']
is_expert = df.label == 1
for template, mask in (('experts: {}', is_expert), ('non-experts: {}', ~is_expert)):
    print(template.format(len(df[mask].drop_duplicates(pair_key))))

experts: 506
non-experts: 514


In [4]:
print(df['category'].unique())

# Hard-coded category assignments for four specific pages, keyed by page_id.
category_overrides = {
    509078: 'Nature',
    22170: 'Mathematics',
    8325231: 'Medicine',
    1453: 'Concepts',
}
# One boolean-mask assignment per override instead of a row-wise df.apply pass
# over the whole frame; rows of all other pages keep their current category.
for override_page_id, override_category in category_overrides.items():
    df.loc[df['page_id'] == override_page_id, 'category'] = override_category

['Education' 'Health' 'Science' 'Medicine' 'Humans' 'Language' 'Concepts'
 'Mathematics' 'Culture' 'Geography' 'Humanities' nan 'People'
 'Agriculture' 'Law' 'Environment' 'Chronology' 'Nature' 'Arts' 'Politics'
 'Life' 'Business' 'Technology' 'Society' 'History']


In [5]:
# Rebind rather than drop in place, and tolerate the column already being gone,
# so that re-running this cell does not raise KeyError.
df = df.drop(['edit_type_exists'], axis=1, errors='ignore')
df.head()

Unnamed: 0,page_id,page,category,user,page_edits,page_edits_ratio,page_talk_edits,edit_period_q1,edit_period_q2,edit_period_q3,...,edit_type_d,edit_type_e,edit_type_f,edit_type_g,edit_type_h,edit_type_i,edit_type_j,edit_type_k,edit_type_l,edit_type_m
0,140968,Rotavirus,Education,Ben Moore,2.0,0.001047,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,140968,Rotavirus,Education,Magnus Manske,1.0,0.000523,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,140968,Rotavirus,Education,Wilke,3.0,0.00157,0.0,1.0,0.0,0.0,...,0.0,0.0,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
3,140968,Rotavirus,Education,PeterJohnson,1.0,0.000523,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,140968,Rotavirus,Education,Mikael Haggstrom,1.0,0.000523,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Names of all columns whose label starts with 'edit_type'.
is_edit_type = df.columns.astype(str).str.startswith('edit_type')
edit_types = df.columns[is_edit_type].tolist()
print(edit_types)

['edit_type_a', 'edit_type_b', 'edit_type_c', 'edit_type_d', 'edit_type_e', 'edit_type_f', 'edit_type_g', 'edit_type_h', 'edit_type_i', 'edit_type_j', 'edit_type_k', 'edit_type_l', 'edit_type_m']


In [10]:
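# NOTE: disabled alternatives for handling missing edit-type values
# (drop the affected rows, or fill them with -1); none of this code runs.
# df.dropna(subset=edit_types, inplace=True)
# for edit_type in edit_types:
#     df[edit_type].fillna(value=-1, inplace=True)
# df.fillna(axis=1, inplace=True, value=-1)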

In [7]:
# Per-category summary: number of distinct pages plus expert / non-expert row counts.
# Rows whose category is NaN are not counted, because groupby drops NaN keys by default.
category_cols = ['category', 'pages', 'experts', 'non-experts']
grouped = df.groupby(by='category')
category_rows = [
    [
        name,
        g['page_id'].nunique(dropna=False),  # same count as len(g['page_id'].unique())
        int((g['label'] == 1).sum()),        # experts
        int((g['label'] != 1).sum()),        # everything else counts as non-expert
    ]
    for name, g in grouped
]
category_df = pd.DataFrame(category_rows, columns=category_cols)

# Append a 'Total' row holding the column sums.
totals = [['Total'] + [category_df[count_col].sum() for count_col in category_cols[1:]]]
category_totals_df = pd.DataFrame(totals, columns=category_cols)
# ignore_index avoids a duplicated index label 0 on the appended row.
category_df = pd.concat([category_df, category_totals_df], ignore_index=True)

category_df.to_csv(r'data/new_train_data_stats.csv', index=False)
# Preview goes last so the notebook actually renders it.
category_df.head(30)

In [8]:
def get_ttest_stats(feature, data=None):
    """Compare one feature between experts and non-experts with Welch's t-test.

    Parameters
    ----------
    feature : str
        Name of the column to compare.
    data : pandas.DataFrame, optional
        Frame containing ``feature`` and a ``label`` column. When omitted, the
        notebook-level ``df`` is used, so existing ``get_ttest_stats(col)``
        calls behave as before.

    Returns
    -------
    list
        ``[feature, t_statistic, p_value]``. NaN values in the feature are not
        filtered here; under scipy's default ``nan_policy`` they yield NaN results.
    """
    if data is None:
        data = df  # fall back to the notebook-global frame

    expert_values = data.loc[data['label'] == 1, feature]
    # NOTE(review): non-experts are selected with label == 0 here, while other
    # cells use label != 1; the two agree only if labels are strictly 0/1 -- confirm.
    non_expert_values = data.loc[data['label'] == 0, feature]

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    ttest_result = stats.ttest_ind(expert_values, non_expert_values, equal_var=False)
    return [feature, ttest_result.statistic, ttest_result.pvalue]

In [13]:
# Keep only the rows where edit_type_a is present.
df = df.dropna(subset=['edit_type_a'])

In [14]:
# Columns excluded from the t-tests (identifiers, category and the label itself).
drop_list = ['page_id', 'page', 'category', 'user', 'label']

ndf = df.drop(drop_list, axis=1)

# One get_ttest_stats row per remaining column.
columns = ['feature', 't_statistic', 'p_value']
ttest_rows = [get_ttest_stats(col) for col in ndf.columns]

ttest_df = pd.DataFrame(ttest_rows, columns=columns)
ttest_df.to_csv(r'data/new_features_ttest.csv', index=False)
print(ttest_df)

                         feature  t_statistic       p_value
0                     page_edits     3.452627  6.014966e-04
1               page_edits_ratio     4.600974  5.262375e-06
2                page_talk_edits     2.885589  4.077646e-03
3                 edit_period_q1    -0.863435  3.881092e-01
4                 edit_period_q2     0.605062  5.452779e-01
5                 edit_period_q3    -0.822484  4.110011e-01
6                 edit_period_q4    -1.756166  7.937350e-02
7             mean_edit_interval     0.631569  5.278167e-01
8                 mean_edit_size    -2.267210  2.359464e-02
9                         tenure    -0.046534  9.628944e-01
10                        gender    -2.844648  4.539838e-03
11                 ns0_edit_dist     0.552434  5.807775e-01
12                 ns1_edit_dist     4.580663  5.230167e-06
13                 ns2_edit_dist     2.328700  2.010487e-02
14                 ns3_edit_dist    -7.881006  1.025681e-14
15                 ns4_edit_dist    -0.0

In [15]:
# Column name -> human-readable description used in the exported means table.
features = {
    'summary_similarity': 'Wikipedia activity, Main Namespace – summary similarity',
    'links_overlap': 'Wikipedia activity, Main Namespace- hyperlink overlap',
    'page_edits_ratio': 'Focal page activity – ratio of edits out of total edits',
    'title_similarity': 'Wikipedia activity, Main Namespace –title similarity',
    'avg_persistence': 'Focal page activity - average persistence of contributed tokens',
    'edit_type_c': 'Focal page activity - total \'Wiki MarkUp\' edits',
    'page_edits': 'Focal page activity - total edits',
    'edit_type_g': 'Focal page activity - total \'Reorganize Content\' edits',
    'edit_type_b': 'Focal page activity - total \'Add Content\' edits',
    'total_edited_pages': 'Wikipedia activity - total edits in all namespaces',
    'page_edit_dist': 'Wikipedia activity, Main Namespace – entropy of edit quantity',
    'edit_type_j': 'Focal page activity - total \'Fix Vandalism\' edits',
}

means_cols = ['Feature', 'SME mean', '"Unknown" mean']

# Split by label once; label 1 is the SME group, everything else is "Unknown".
sme_rows = df[df.label == 1]
unknown_rows = df[df.label != 1]
means = [
    [descr, sme_rows[key].mean(), unknown_rows[key].mean()]
    for key, descr in features.items()
]

means_df = pd.DataFrame(means, columns=means_cols)
means_df.to_csv(r'data/new_features_means.csv', index=False)
print(means_df)

                                              Feature      SME mean  \
0   Wikipedia activity, Main Namespace – summary s...  1.430924e-02   
1   Wikipedia activity, Main Namespace- hyperlink ...  1.616010e-02   
2   Focal page activity – ratio of edits out of to...  1.024411e-02   
3   Wikipedia activity, Main Namespace –title simi...  9.155730e-04   
4   Focal page activity - average persistence of c...  1.736422e+07   
5     Focal page activity - total 'Wiki MarkUp' edits  5.938392e-01   
6                   Focal page activity - total edits  1.241189e+01   
7   Focal page activity - total 'Reorganize Conten...  6.842666e-02   
8     Focal page activity - total 'Add Content' edits  3.222774e-01   
9   Wikipedia activity - total edits in all namesp...  1.594016e+03   
10  Wikipedia activity, Main Namespace – entropy o...  5.816257e+00   
11  Focal page activity - total 'Fix Vandalism' edits  1.136749e-01   

    "Unknown" mean  
0     5.805537e-03  
1     6.224730e-03  
2     2.30039

In [16]:
# Every column of df except the ones listed in drop_list, in original order.
features = df.columns.drop(drop_list).tolist()
print(features)

['page_edits', 'page_edits_ratio', 'page_talk_edits', 'edit_period_q1', 'edit_period_q2', 'edit_period_q3', 'edit_period_q4', 'mean_edit_interval', 'mean_edit_size', 'tenure', 'gender', 'ns0_edit_dist', 'ns1_edit_dist', 'ns2_edit_dist', 'ns3_edit_dist', 'ns4_edit_dist', 'ns5_edit_dist', 'ns6_edit_dist', 'ns7_edit_dist', 'ns8_edit_dist', 'ns9_edit_dist', 'ns10_edit_dist', 'ns11_edit_dist', 'ns12_edit_dist', 'ns13_edit_dist', 'ns14_edit_dist', 'ns15_edit_dist', 'page_edit_dist', 'total_edited_pages', 'links_overlap', 'categories_overlap', 'title_similarity', 'summary_similarity', 'avg_persistence', 'content_token_count', 'content_token_edit_count_avg', 'content_token_vs_stop_words', 'content_token_vs_token', 'contribution_similarity', 'persistence_exists', 'edit_type_a', 'edit_type_b', 'edit_type_c', 'edit_type_d', 'edit_type_e', 'edit_type_f', 'edit_type_g', 'edit_type_h', 'edit_type_i', 'edit_type_j', 'edit_type_k', 'edit_type_l', 'edit_type_m']


In [17]:
# describe() per group, transposed to one row per feature; each statistic column
# gets a group prefix and the feature name is moved into an 'index' column.
sme = df[df.label == 1][features].describe().T.add_prefix('sme_').reset_index()
unknown = df[df.label != 1][features].describe().T.add_prefix('unknown_').reset_index()

# Put both groups side by side, keyed by feature name.
result = (
    sme.merge(unknown, how='inner', on='index')
       .rename(columns={'index': 'feature'})
       .set_index(keys=['feature'])
)
print(result.head())

result.to_csv(r'data/new_features_stats.csv', index=True)

                  sme_count   sme_mean    sme_std   sme_min   sme_25%  \
feature                                                                 
page_edits            488.0  12.411885  58.154757  1.000000  1.000000   
page_edits_ratio      488.0   0.010244   0.037284  0.000057  0.000455   
page_talk_edits       488.0   3.350410  22.363209  0.000000  0.000000   
edit_period_q1        488.0   0.139344   0.346661  0.000000  0.000000   
edit_period_q2        488.0   0.319672   0.466828  0.000000  0.000000   

                   sme_50%   sme_75%     sme_max  unknown_count  unknown_mean  \
feature                                                                         
page_edits        1.000000  4.000000  796.000000          497.0      3.225352   
page_edits_ratio  0.001033  0.004569    0.483871          497.0      0.002300   
page_talk_edits   0.000000  0.000000  393.000000          497.0      0.418511   
edit_period_q1    0.000000  0.000000    1.000000          497.0      0.158954   
ed