In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from pprint import pprint

In [2]:
# Load the per (page, user) feature table and report its basic dimensions.
df = pd.read_hdf(r'../../data/sme/features.h5')

# dropna=False keeps a missing page id / user name counted as one distinct value.
n_pages = df.page_id.nunique(dropna=False)
n_users = df.user_name.nunique(dropna=False)
# Each row of the table is one page-user pair, so this is simply the row count.
total_pairs = len(df.index)

print(f'Total pages: {n_pages}')
print(f'Total users: {n_users}')
print(f'Total pairs (page-user): {total_pairs}')
df.head()

Total pages: 982
Total users: 305574
Total pairs (page-user): 457840


Unnamed: 0,page_id,user_name,page_edits,page_edits_ratio,talk_page_edits,talk_page_edits_ratio,edit_period_q1,edit_period_q2,edit_period_q3,edit_period_q4,...,content_token_count,content_token_edit_count_avg,content_token_vs_stop_words,content_token_vs_token,persistence_exists,title_similarity,summary_similarity,categories_overlap,links_overlap,contribution_similarity
0,12,,16,0.000907,2.0,0.00011,0.125,0.8125,0.0,0.0625,...,,,,,,,,,,
1,12,...---...SOS,1,5.7e-05,,,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.048956,0.004378,0.0
2,12,0,1,5.7e-05,,,1.0,0.0,0.0,0.0,...,5.0,5.0,1.666667,0.151515,1.0,0.0,0.0,0.031913,0.003536,0.0
3,12,1.36.116.199,1,5.7e-05,,,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.002554,0.038611,0.022629,0.0
4,12,100110100,6,0.00034,,,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.054761,0.003947,0.0


In [3]:
# fig = plt.figure()
# fig.set_figheight(4)
# fig.set_figwidth(8)

# ax = sdf.page_edits.plot.hist(grid=True, bins=[-.3,-.2,-.1, .1, .2, .3, .4], rwidth=0.9, color='#607c8e', density=True)
# plt.title('Page edits distribution')
# plt.xlabel('Edits per page')
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)

In [4]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

# Feature columns that are rescaled with MaxAbsScaler before being written
# back into `df`; every other column is left as loaded.
scale_columns = [
    'page_edits',
    'talk_page_edits',
    'mean_edit_frequency',
    'median_edit_frequency',
    'mean_edit_size',
    'median_edit_size',
    'total_edited_pages',
    'page_edit_dist',
    'avg_persistence',
    'content_token_count',
    'content_token_edit_count_avg',
    'content_token_vs_stop_words',
    'content_token_vs_token',
    'title_similarity',
    'summary_similarity',
]

# Fit the scaler on the selected columns and transform them in one pass.
X = df[scale_columns]
transformer = MaxAbsScaler().fit(X)
scaled_array = transformer.transform(X)

# Carry over X's index: `transform` returns a bare array, and a DataFrame built
# from it would otherwise get a fresh 0..n-1 index. The scaled values are later
# assigned back into `df` by index alignment, so a different index on `df`
# would silently misalign rows or introduce NaNs.
scaled_df = pd.DataFrame(data=scaled_array, columns=X.columns, index=X.index)

In [5]:
# Overwrite the raw feature columns with their scaled counterparts.
scaled_cols = list(X.columns)
df[scaled_cols] = scaled_df[scaled_cols]

# Rename two columns, then replace every remaining missing value
# (in any column) with the sentinel -1.
column_renames = {'talk_page_edits': 'page_talk_edits', 'user_name': 'user'}
df = df.rename(columns=column_renames).fillna(value=-1)
df.head()

Unnamed: 0,page_id,user,page_edits,page_edits_ratio,page_talk_edits,talk_page_edits_ratio,edit_period_q1,edit_period_q2,edit_period_q3,edit_period_q4,...,content_token_count,content_token_edit_count_avg,content_token_vs_stop_words,content_token_vs_token,persistence_exists,title_similarity,summary_similarity,categories_overlap,links_overlap,contribution_similarity
0,12,-1,0.010363,0.000907,0.000833,0.00011,0.125,0.8125,0.0,0.0625,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,12,...---...SOS,0.000648,5.7e-05,-1.0,-1.0,0.0,1.0,0.0,0.0,...,2e-06,5e-06,0.0,0.001221,1.0,0.0,0.0,0.048956,0.004378,0.0
2,12,0,0.000648,5.7e-05,-1.0,-1.0,1.0,0.0,0.0,0.0,...,1.2e-05,2.7e-05,9e-06,0.000555,1.0,0.0,0.0,0.031913,0.003536,0.0
3,12,1.36.116.199,0.000648,5.7e-05,-1.0,-1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.00037,0.038611,0.022629,0.0
4,12,100110100,0.003886,0.00034,-1.0,-1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.054761,0.003947,0.0


In [6]:
# Load the labelled training pairs and show the class balance.
train_df = pd.read_csv('data/new_train_data.csv', header=0)

# label == 1 marks an expert; every other value counts as non-expert.
is_expert = train_df.label == 1
n_experts = int(is_expert.sum())
n_non_experts = int((~is_expert).sum())

print(train_df.shape)
print(f'experts: {n_experts}')
print(f'non-experts: {n_non_experts}')

(1020, 59)
experts: 506
non-experts: 514


In [7]:
# Attach the training labels to the feature table. The left join keeps every
# feature row; rows without a matching (page_id, user) pair get label = NaN.
label_columns = train_df[['page_id', 'user', 'label']]
ndf = pd.merge(left=df, right=label_columns, how='left', on=['page_id', 'user'])
ndf = ndf.drop(columns=['median_edit_frequency', 'talk_page_edits_ratio'])

# Attach the per-page information from the spreadsheet, again keeping every
# feature row.
info_df = pd.read_excel(r'data/1000.xlsx').rename(
    columns={'page_title': 'page', 'page_category': 'category'}
)
ndf = pd.merge(left=ndf, right=info_df, how='left', on=['page_id'])

# Rows that received a label form the training set; the rest form the test set.
is_labeled = ndf.label.notna()
new_train_df = ndf[is_labeled]
print(new_train_df.shape)
new_test_df = ndf[~is_labeled]
print(new_test_df.shape)

# A left join drops labelled pairs that have no counterpart in `df` without any
# error, so compare the counts and report a mismatch instead of losing labels
# silently.
# NOTE(review): a mismatch may come from `user` values being typed or formatted
# differently in the two sources - confirm against the input files.
expected_labeled = int(train_df.label.notna().sum())
if len(new_train_df) != expected_labeled:
    print(f'WARNING: train_df has {expected_labeled} labeled rows but '
          f'{len(new_train_df)} rows of the merged frame received a label; '
          f'check the (page_id, user) join keys.')

ndf.head()

(985, 59)
(456855, 59)


Unnamed: 0,page_id,user,page_edits,page_edits_ratio,page_talk_edits,edit_period_q1,edit_period_q2,edit_period_q3,edit_period_q4,mean_edit_frequency,...,content_token_vs_token,persistence_exists,title_similarity,summary_similarity,categories_overlap,links_overlap,contribution_similarity,label,page,category
0,12,-1,0.010363,0.000907,0.000833,0.125,0.8125,0.0,0.0625,0.0905746,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,Anarchism,Culture
1,12,...---...SOS,0.000648,5.7e-05,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.001221,1.0,0.0,0.0,0.048956,0.004378,0.0,,Anarchism,Culture
2,12,0,0.000648,5.7e-05,-1.0,1.0,0.0,0.0,0.0,0.0,...,0.000555,1.0,0.0,0.0,0.031913,0.003536,0.0,,Anarchism,Culture
3,12,1.36.116.199,0.000648,5.7e-05,-1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.00037,0.038611,0.022629,0.0,,Anarchism,Culture
4,12,100110100,0.003886,0.00034,-1.0,0.0,1.0,0.0,0.0,4.2697e-07,...,0.0,1.0,0.0,0.0,0.054761,0.003947,0.0,,Anarchism,Culture


In [8]:
# Persist both splits as CSV files without the DataFrame index column.
for frame, target in (
    (new_train_df, r'data/new_train_1000.csv'),
    (new_test_df, r'data/new_test_1000.csv'),
):
    frame.to_csv(target, index=False)