## get data

### imports

In [1]:
import os, re, patsy
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
path = '/home/bhrdwj/git/predwikt/data/raw/wiki_reliability/unzipped/'

In [2]:
fea = (pd.read_csv(path+'hoax_features.csv', usecols=lambda x: x not in ['Unnamed: 0'])
       .rename(columns={'headings_by_level(2)':'headings_by_level_2', 'revision_id.key':'revision_id_key'}))

### train test split

#### Make series of negative revisions and their revision keys, and vice versa

In [3]:
neg_revs = fea[['revision_id', 'revision_id_key', 'has_template']]
neg_revs = neg_revs.loc[neg_revs.has_template==0].set_index('revision_id')['revision_id_key']
pos_revs = fea[['revision_id', 'revision_id_key', 'has_template']]
pos_revs = pos_revs.loc[pos_revs.has_template==1].set_index('revision_id')['revision_id_key']

neg_revs.shape #, pos_revs.shape

(1390,)

#### Test-train split the neg_revs, and form dfte and dftr

In [4]:
neg_revs_tr, neg_revs_te = train_test_split(neg_revs, test_size=.2, random_state=1)
pos_revs_tr = pos_revs[neg_revs_tr.values]
pos_revs_te = pos_revs[neg_revs_te.values]

In [5]:
revs_tr = pd.concat((neg_revs_tr, pos_revs_tr))
revs_te = pd.concat((neg_revs_te, pos_revs_te))

In [6]:
fea_rev = fea.set_index('revision_id')
dftr = fea_rev.loc[revs_tr.index].dropna()
dfte = fea_rev.loc[revs_te.index].dropna()

In [7]:
del neg_revs, pos_revs, neg_revs_tr, pos_revs_tr, neg_revs_te, pos_revs_te, revs_tr, revs_te, fea_rev

### prep

In [8]:
ytr = dftr.has_template
Xtr = dftr[dftr.columns.difference(['page_id','revision_id.key','has_template'])]
Xtr = patsy.dmatrix('~ '+' + '.join(Xtr.columns), data=Xtr, NA_action='drop', return_type='dataframe')

yte = dfte.has_template
Xte = dfte[dfte.columns.difference(['page_id','revision_id.key','has_template'])]
Xte = patsy.dmatrix('~ '+' + '.join(Xte.columns), data=Xte, NA_action='drop', return_type='dataframe')

Xcols = list(
    set(Xtr.columns.tolist())
    .union(set(Xte.columns.tolist()))
)

for col in Xcols:
    if col not in Xte:
        Xte[col] = 0
    if col not in Xtr:
        Xtr[col] = 0


In [9]:
ytr.shape, Xtr.shape

((2221,), (2221, 26))

In [10]:
yte.shape, Xte.shape

((556,), (556, 26))

In [11]:
# scaler = StandardScaler()
# def scale_reindex(df):
#     matrix = scaler.fit_transform(df)
#     return pd.DataFrame(data=matrix, index=df.index, columns=df.columns)

# [Xtr,Xte] = map(scale_reindex, [Xtr,Xte])

## run model

In [12]:
skb1 = SelectKBest(k=20)
skb1.fit(Xtr,np.ravel(ytr))
feats1 = Xtr.columns[skb1.get_support()]
feats1 = [i for i in feats1.tolist() if not i.find('page_id') > -1]

  f = msb / msw


In [13]:
feats1

['article_quality_score[T.C]',
 'article_quality_score[T.FA]',
 'article_quality_score[T.GA]',
 'article_quality_score[T.Stub]',
 'category_links',
 'cite_templates',
 'cn_templates',
 'external_links',
 'infobox_templates',
 'paragraphs_without_refs',
 'ref_tags',
 'revision_chars',
 'revision_content_chars',
 'revision_templates',
 'revision_text_bytes',
 'revision_wikilinks',
 'revision_words',
 'shortened_footnote_templates',
 'stems_length',
 'words_to_watch_matches']

### initialize and fit

In [14]:
logcv = LogisticRegressionCV(Cs=np.logspace(-4, -3, num=3), random_state=0)
logcv.fit(Xtr[feats1], ytr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV(Cs=array([0.0001    , 0.00031623, 0.001     ]),
                     random_state=0)

#### check results

In [15]:
logcv.C_

array([0.001])

In [16]:
logcv.score(Xte[feats1], yte)

0.6025179856115108

In [17]:
yte.mean()

0.5

#### review fitted coefficients

In [18]:
coeffs = pd.Series(logcv.coef_.flatten(), index=feats1).squeeze().sort_values(ascending=False, key=abs)

In [19]:
coeffs

revision_templates               0.009450
external_links                  -0.005793
words_to_watch_matches           0.004358
cite_templates                   0.004084
ref_tags                         0.003973
cn_templates                     0.001658
shortened_footnote_templates    -0.000595
article_quality_score[T.Stub]   -0.000574
article_quality_score[T.C]       0.000327
revision_words                  -0.000266
infobox_templates                0.000262
category_links                   0.000229
revision_content_chars           0.000102
revision_text_bytes             -0.000057
article_quality_score[T.GA]     -0.000036
stems_length                     0.000036
revision_wikilinks               0.000034
revision_chars                  -0.000030
paragraphs_without_refs          0.000026
article_quality_score[T.FA]     -0.000021
dtype: float64