# predict hoax (pairs kept in same groups)

## exploration controls

In [26]:
# Seed for the notebook's randomised steps.
# NOTE(review): confirm this value is actually passed to every call that takes a random_state.
random_seed = 0

## get data

### imports

In [28]:
import os, re, patsy
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
# Data directory. Override with the PREDWIKT_DATA_DIR environment variable;
# the original machine-specific location remains the default so existing
# setups keep working. os.path.join(..., '') guarantees the trailing
# separator that the `path + filename` concatenation below relies on.
path = os.path.join(
    os.environ.get(
        'PREDWIKT_DATA_DIR',
        '/home/bhrdwj/git/predwikt/data/raw/wiki_reliability/unzipped/',
    ),
    '',
)

In [2]:
# Load the hoax feature table, skipping the stray 'Unnamed: 0' column,
# then give two awkwardly named columns identifier-style names.
fea = pd.read_csv(
    path + 'hoax_features.csv',
    usecols=lambda name: name != 'Unnamed: 0',
)
fea = fea.rename(columns={
    'headings_by_level(2)': 'headings_by_level_2',
    'revision_id.key': 'revision_id_key',
})

### train test split

#### Map each negative revision id to its revision_id_key, and likewise for the positive revisions

In [3]:
# Series of revision_id_key values indexed by revision_id, one per class:
# rows without the template (has_template == 0) and rows with it (== 1).
neg_revs = fea.loc[fea.has_template == 0].set_index('revision_id')['revision_id_key']
pos_revs = fea.loc[fea.has_template == 1].set_index('revision_id')['revision_id_key']

neg_revs.shape

(1390,)

#### Test-train split the neg_revs, and form dfte and dftr

In [4]:
# Split only the negative revisions, seeded from the notebook-level
# `random_seed` control so the split follows that single setting.
neg_revs_tr, neg_revs_te = train_test_split(neg_revs, test_size=.2, random_state=random_seed)

# The values of neg_revs are revision_id_key entries; use them as labels into
# pos_revs (indexed by revision_id) so each positive revision lands in the
# same partition as the negative revision that references it.
pos_revs_tr = pos_revs.loc[neg_revs_tr.values]
pos_revs_te = pos_revs.loc[neg_revs_te.values]

In [5]:
# Stack the negative and positive series of each partition (negatives first).
revs_tr, revs_te = (
    pd.concat([neg_part, pos_part])
    for neg_part, pos_part in (
        (neg_revs_tr, pos_revs_tr),
        (neg_revs_te, pos_revs_te),
    )
)

In [6]:
# Index the feature table by revision_id, then pull the rows belonging to each
# partition. dropna() removes any row containing a missing value, row by row.
fea_rev = fea.set_index('revision_id')
dftr, dfte = (
    fea_rev.loc[partition.index].dropna()
    for partition in (revs_tr, revs_te)
)

In [7]:
# Drop the intermediate split objects; only dftr / dfte are used from here on.
del neg_revs, pos_revs
del neg_revs_tr, pos_revs_tr
del neg_revs_te, pos_revs_te
del revs_tr, revs_te, fea_rev

In [8]:
# Summary statistics of the training features, ordered by mean
# (the trailing ';' suppresses the cell output).
(
    dftr[dftr.columns.difference(['page_id', 'revision_id_key', 'has_template'])]
    .describe()
    .transpose()
    .sort_values(by='mean')
);

### prep

In [27]:
# remove non-features; dummify categoricals
# Targets: the has_template flag of each partition.
ytr, yte = dftr.has_template, dfte.has_template

# Predictors: every column except the identifiers and the target, expanded by
# patsy into a design-matrix DataFrame built from a "~ col1 + col2 + ..." formula.
Xtr = dftr.loc[:, dftr.columns.difference(['page_id', 'revision_id_key', 'has_template'])]
Xtr = patsy.dmatrix(f"~ {' + '.join(Xtr.columns)}", data=Xtr, NA_action='drop', return_type='dataframe')

Xte = dfte.loc[:, dfte.columns.difference(['page_id', 'revision_id_key', 'has_template'])]
Xte = patsy.dmatrix(f"~ {' + '.join(Xte.columns)}", data=Xte, NA_action='drop', return_type='dataframe')

# make complete list of columns in case the test set doesn't include any of a rare class
# Built as an ordered list (training columns first, then any test-only columns)
# rather than from a set: set iteration order for strings is not stable between
# kernel sessions, which made the resulting column order irreproducible.
Xcols = Xtr.columns.tolist() + [col for col in Xte.columns if col not in Xtr.columns]

# Give both matrices exactly the same columns in the same order; a column
# missing from one split is created and filled with 0.
Xtr = Xtr.reindex(columns=Xcols, fill_value=0)
Xte = Xte.reindex(columns=Xcols, fill_value=0)

### to scale or not to scale

In [13]:
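# NOTE: disabled experiment. If re-enabled, fit the scaler on Xtr only and
# apply it to Xte with scaler.transform; calling fit_transform on each frame,
# as written below, would scale the test set using its own statistics.
# scaler = StandardScaler()
# def scale_reindex(df):
#     matrix = scaler.fit_transform(df)
#     return pd.DataFrame(data=matrix, index=df.index, columns=df.columns)

# [Xtr,Xte] = map(scale_reindex, [Xtr,Xte])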

## run model

### feature selection (keep all features)

In [25]:
# Keep every design-matrix column whose name does not contain 'page_id'.
feats1 = [name for name in Xtr.columns if 'page_id' not in name]

### initialize and fit

In [16]:
# Cs = sorted(list(np.logspace(-7,7,num=10)) + [2,4,6,8,10,12,14,16])

In [69]:
# L2-regularised logistic regression on the selected features.
# fit_intercept=False: presumably because the patsy design matrix already
# supplies its own intercept column — TODO confirm.
lr = LogisticRegression(
    penalty='l2',
    C=.1,
    max_iter=10000,
    fit_intercept=False,
)
lr.fit(Xtr[feats1], ytr)

LogisticRegression(C=0.1, fit_intercept=False, max_iter=10000)

#### check results

In [70]:
# Score both splits on the same feature subset the model was fitted on
# (the training score previously used the unfiltered Xtr).
print(f'Training accuracy {lr.score(Xtr[feats1], ytr)}')
print(f'Testing accuracy {lr.score(Xte[feats1], yte)}')
# Majority-class baseline: the accuracy of always predicting the more common
# label. yte is a 0/1 flag, so its mean is the positive rate; the baseline is
# whichever of the two class shares is larger.
print(f'Baseline accuracy {max(yte.mean(), 1 - yte.mean())}')

Training accuracy 0.6226924808644755
Testing accuracy 0.6241007194244604
Baseline accuracy 0.5


#### review fitted coefficients

In [66]:
# feats_from_unscaled
# Label the fitted coefficients with their feature names. lr.coef_ has one row
# for a binary problem, so row 0 holds the coefficient of each column in feats1.
coeffs = pd.Series(lr.coef_[0], index=feats1)
# Multiply each coefficient by its feature's training standard deviation so
# magnitudes are comparable across differently scaled features, then rank by
# absolute size.
(coeffs * Xtr[feats1].std()).sort_values(ascending=False, key=abs)

stems_length                     -8.507907e-01
revision_content_chars            6.645483e-01
revision_words                   -3.587407e-01
revision_templates                2.976267e-01
words_to_watch_matches            2.669328e-01
external_links                   -2.473405e-01
revision_chars                    1.695734e-01
revision_text_bytes               1.651557e-01
paragraphs_without_refs           1.188092e-01
cite_templates                    8.367468e-02
revision_wikilinks               -3.812286e-02
headings_by_level_2              -3.580977e-02
ref_tags                          3.250125e-02
cn_templates                      1.288976e-02
images_in_tags                    2.677053e-03
category_links                    1.898543e-03
article_quality_score[T.Start]   -1.261452e-03
article_quality_score[T.Stub]    -1.046499e-03
infobox_templates                 6.902527e-04
shortened_footnote_templates     -3.972120e-04
article_quality_score[T.C]        3.911911e-04
who_templates