In [1]:
import os
import pandas as pd

### Features

In [2]:
# Load the precomputed page/user feature table and report its basic size.
features_path = os.path.join('data', 'features.h5')
features_df = pd.read_hdf(features_path)

page_ids = features_df['page_id'].unique()
user_names = features_df['user_name'].unique()
print(f'Total pages: {len(page_ids)}')
print(f'Total users: {len(user_names)}')

# NOTE: this is the row count of the (page_id, user_name) projection; rows are
# not de-duplicated here.
total_pairs = features_df[['page_id', 'user_name']].shape[0]
print(f'Total pairs (page-user): {total_pairs}')
features_df.head()

Total pages: 982
Total users: 305574
Total pairs (page-user): 457840


Unnamed: 0,page_id,user_name,page_edits,page_edits_ratio,talk_page_edits,talk_page_edits_ratio,edit_period_q1,edit_period_q2,edit_period_q3,edit_period_q4,...,content_token_count,content_token_edit_count_avg,content_token_vs_stop_words,content_token_vs_token,persistence_exists,title_similarity,summary_similarity,categories_overlap,links_overlap,contribution_similarity
0,12,,16,0.000907,2.0,0.00011,0.125,0.8125,0.0,0.0625,...,,,,,,,,,,
1,12,...---...SOS,1,5.7e-05,,,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.048956,0.004378,0.0
2,12,0,1,5.7e-05,,,1.0,0.0,0.0,0.0,...,5.0,5.0,1.666667,0.151515,1.0,0.0,0.0,0.031913,0.003536,0.0
3,12,1.36.116.199,1,5.7e-05,,,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.002554,0.038611,0.022629,0.0
4,12,100110100,6,0.00034,,,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.054761,0.003947,0.0


In [3]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

# Feature columns that get rescaled below; every other column keeps its raw
# values.
scale_columns = ['page_edits',
                 'talk_page_edits',
                 'mean_edit_frequency',
                 'median_edit_frequency',
                 'mean_edit_size',
                 'median_edit_size',
                 'total_edited_pages',
                 'page_edit_dist',
                 'avg_persistence',
                 'content_token_count',
                 'content_token_edit_count_avg',
                 'content_token_vs_stop_words',
                 'content_token_vs_token',
                 'title_similarity',
                 'summary_similarity']

# Fit a MaxAbsScaler on the selected columns and transform them.
X = features_df[scale_columns]
transformer = MaxAbsScaler().fit(X)
scaled_array = transformer.transform(X)
# Reuse X's index: the column assignment below aligns on index labels, so a
# fresh 0..n-1 index would misalign (or blank out) values whenever
# features_df does not carry a default RangeIndex.
scaled_df = pd.DataFrame(data=scaled_array, columns=X.columns, index=X.index)

features_df[X.columns] = scaled_df[X.columns]
# Rename to the column names used by the later merges ('user' is a join key),
# then replace every remaining missing value with the sentinel -1. The fill is
# frame-wide, so a missing user name also becomes -1.
features_df = (
    features_df
    .rename(columns={'talk_page_edits': 'page_talk_edits', 'user_name': 'user'})
    .fillna(value=-1)
)
features_df.head()

Unnamed: 0,page_id,user,page_edits,page_edits_ratio,page_talk_edits,talk_page_edits_ratio,edit_period_q1,edit_period_q2,edit_period_q3,edit_period_q4,...,content_token_count,content_token_edit_count_avg,content_token_vs_stop_words,content_token_vs_token,persistence_exists,title_similarity,summary_similarity,categories_overlap,links_overlap,contribution_similarity
0,12,-1,0.010363,0.000907,0.000833,0.00011,0.125,0.8125,0.0,0.0625,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,12,...---...SOS,0.000648,5.7e-05,-1.0,-1.0,0.0,1.0,0.0,0.0,...,2e-06,5e-06,0.0,0.001221,1.0,0.0,0.0,0.048956,0.004378,0.0
2,12,0,0.000648,5.7e-05,-1.0,-1.0,1.0,0.0,0.0,0.0,...,1.2e-05,2.7e-05,9e-06,0.000555,1.0,0.0,0.0,0.031913,0.003536,0.0
3,12,1.36.116.199,0.000648,5.7e-05,-1.0,-1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.00037,0.038611,0.022629,0.0
4,12,100110100,0.003886,0.00034,-1.0,-1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.054761,0.003947,0.0


### Labels

In [4]:
# Load the labelling spreadsheet and keep only rows that actually carry a label.
labels_path = os.path.join('data', 'labels.xlsx')
labels_raw = pd.read_excel(labels_path)
labels_df = labels_raw[labels_raw['label'].notnull()]
print(f'total labelled pairs:  {labels_df.shape}')

# Align column names with the other tables: 'page' and 'user' are the join keys
# used further down.
labels_df = labels_df.rename(columns={'article_title': 'page', 'user_name': 'user'})
labels_df.head()

total labelled pairs:  (14677, 13)


Unnamed: 0,page,article_total_edits,user,user_edits_count,matched_keywords,keywords_score,wiki_category,scholar_category,expertise_recommendation,scholar_profile_name,scholar_profile_total_cites,scholar_profile_total_esa,label
116,Agriculture,6316,David D.,1,expert;research;paper;professor,190,Candidate,NameMismatch,Candidate,,0,0.0,0.0
130,Agriculture,6316,Ian.thomson,1,,0,NoKeywords,Candidate,Candidate,Ian Thomson,431,0.005107,0.0
190,Agriculture,6316,Stephen G. Brown,1,,0,NoKeywords,Candidate,Candidate,Steven G Brown,205,0.007873,0.0
263,Agriculture,6316,WQUlrich,1,,0,NoKeywords,Candidate,Candidate,Werner Ulrich,1394,0.015095,0.0
364,Agriculture,6316,Atrian,1,,0,NoKeywords,Candidate,Candidate,Amir Atrian,10,0.005207,0.0


### Articles

In [5]:
# Load the article list. `dtype` takes a column -> type mapping; the previous
# `dtypes={'page_id', int}` used a misspelled keyword and a set literal, so the
# intended integer cast of page_id was never requested (and recent pandas
# rejects the unknown keyword outright).
# NOTE(review): an integer cast fails if page_id contains blank cells — confirm
# the sheet has none.
articles_df = pd.read_excel(os.path.join('data', '1000.xlsx'), dtype={'page_id': int})
print(f'articles: {articles_df.shape}')
# Use the same 'page' / 'category' column names as the label table.
articles_df = articles_df.rename(columns={'page_title': 'page', 'page_category': 'category'})
articles_df.head()

articles: (1000, 3)


Unnamed: 0,page_id,page,category
0,18056921,BÇŽtuta NeagrÄƒ,Agriculture
1,18077446,Havatzelet HaSharon,Agriculture
2,18028108,Burgata,Agriculture
3,18069865,Uvaria chamae,Agriculture
4,18076465,Givat Shapira,Agriculture


In [6]:
# Attach page_id and category to every labelled row by matching on the page
# title. The join is inner, so labelled rows whose title has no match in
# articles_df are dropped.
article_lookup = articles_df[['page_id', 'page', 'category']]
df = labels_df.merge(article_lookup, how='inner', on='page')
print(f'labels: {df.shape}')
df.head()

labels: (14669, 15)


Unnamed: 0,page,article_total_edits,user,user_edits_count,matched_keywords,keywords_score,wiki_category,scholar_category,expertise_recommendation,scholar_profile_name,scholar_profile_total_cites,scholar_profile_total_esa,label,page_id,category
0,Agriculture,6316,David D.,1,expert;research;paper;professor,190,Candidate,NameMismatch,Candidate,,0,0.0,0.0,627,Agriculture
1,Agriculture,6316,Ian.thomson,1,,0,NoKeywords,Candidate,Candidate,Ian Thomson,431,0.005107,0.0,627,Agriculture
2,Agriculture,6316,Stephen G. Brown,1,,0,NoKeywords,Candidate,Candidate,Steven G Brown,205,0.007873,0.0,627,Agriculture
3,Agriculture,6316,WQUlrich,1,,0,NoKeywords,Candidate,Candidate,Werner Ulrich,1394,0.015095,0.0,627,Agriculture
4,Agriculture,6316,Atrian,1,,0,NoKeywords,Candidate,Candidate,Amir Atrian,10,0.005207,0.0,627,Agriculture


In [7]:
# Filtering non-smes with expertise terms: keep every row labelled 1, and keep
# a row with any other label only when it matched no expertise keywords.
# Each comparison is evaluated on its own before combining: `&` binds tighter
# than `!=`, so the unparenthesised `df.label!=1 & ...` was parsed as
# `df.label != (1 & ...)`, which only gives the intended rows while labels are
# strictly 0/1.
is_positive = df.label == 1
is_not_positive = df.label != 1
has_no_keywords = df.matched_keywords.isnull()
df = df[is_positive | (is_not_positive & has_no_keywords)]

In [10]:
# Build the training table: features joined to the filtered labels on
# (page_id, user), minus two feature columns that are excluded from training.
label_columns = df[['page_id', 'page', 'category', 'user', 'label']]
train_df = (
    features_df
    .merge(label_columns, how='inner', on=['page_id', 'user'])
    .drop(columns=['median_edit_frequency', 'talk_page_edits_ratio'])
)
print(f'final: {train_df.shape}')
train_df.head()

final: (10947, 59)


Unnamed: 0,page_id,user,page_edits,page_edits_ratio,page_talk_edits,edit_period_q1,edit_period_q2,edit_period_q3,edit_period_q4,mean_edit_frequency,...,content_token_vs_token,persistence_exists,title_similarity,summary_similarity,categories_overlap,links_overlap,contribution_similarity,page,category,label
0,12,Angelicapple,0.000648,5.7e-05,-1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.01797,0.002113,0.0,Anarchism,Culture,1.0
1,12,JJARichardson,0.001295,0.000113,0.001666,0.0,1.0,0.0,0.0,0.010933,...,0.003058,1.0,0.0,0.000392,0.051216,0.028896,0.0,Anarchism,Culture,1.0
2,12,Kalogeropoulos,0.000648,5.7e-05,-1.0,0.0,1.0,0.0,0.0,0.0,...,0.001221,1.0,0.0,1.4e-05,0.014472,0.001597,0.0,Anarchism,Culture,1.0
3,25,Alex.tan,0.000648,0.000102,-1.0,1.0,0.0,0.0,0.0,0.0,...,0.021978,1.0,0.0,0.000129,0.070786,0.012046,0.0,Autism,Language,1.0
4,25,Centerforautism,0.002591,0.000408,-1.0,0.0,1.0,0.0,0.0,0.002579,...,0.004426,1.0,0.0,0.006294,0.0,0.199588,0.0,Autism,Language,1.0


In [11]:
# Persist the filtered training table; the DataFrame index is not written.
train_output_path = os.path.join('data', 'train_filtered.csv')
train_df.to_csv(train_output_path, index=False)