# predict hoax - pooled (SHOULDN'T POOL PAIRED DATA)

## get data

### imports

In [1]:
import os, re, patsy
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
path = '/home/bhrdwj/git/predwikt/data/raw/wiki_reliability/unzipped/'

In [2]:
dif = pd.read_csv(path+'hoax_difftxt.csv', usecols=lambda x: x not in ['Unnamed: 0'])
ful = pd.read_csv(path+'hoax_fulltxt.csv', usecols=lambda x: x not in ['Unnamed: 0'])
fea = pd.read_csv(path+'hoax_features.csv', usecols=lambda x: x not in ['Unnamed: 0'])

fea = fea.set_index('revision_id')

In [3]:
dif.shape[0], ful.shape[0], fea.shape[0]

(1385, 1386, 2788)

### denormalize for analysis

#### Create denormalized dataframe

In [4]:
# Separate positive and negative observations, assign dummy vars, rename columns for concatenation
ful_pos = ful[['revision_id_pos', 'txt_pos', 'page_id']].set_index('revision_id_pos').assign(hoax=1).rename(columns={'txt_pos':'ful'})
ful_neg = ful[['revision_id_neg', 'txt_neg', 'page_id']].set_index('revision_id_neg').assign(hoax=0).rename(columns={'txt_neg':'ful'})
dif_pos = dif[['revision_id_pos', 'difftxt_pos', 'page_id']].set_index('revision_id_pos').assign(hoax=1).rename(columns={'difftxt_pos':'dif'})
dif_neg = dif[['revision_id_neg', 'difftxt_neg', 'page_id']].set_index('revision_id_neg').assign(hoax=0).rename(columns={'difftxt_neg':'dif'})

# Concatenate positive and negative observations to taller form
fea = fea.astype({'page_id':str}).rename(columns={'has_template':'hoax'})
ful = pd.concat((ful_pos, ful_neg)).astype({'page_id':str})
dif = pd.concat((dif_pos, dif_neg)).astype({'page_id':str})

# Join the text data onto the features data
df = fea.join(ful, rsuffix='_ful').join(dif, rsuffix='_dif', lsuffix='_fea')

#### Remove redundant page_id columns

In [5]:
# Confirm that page_id is always a match whenever it isn't NaN, for fully non-null rows
(df[df.columns[df.columns.str.startswith('page_id')]]
    .loc[~df.isna().any(axis=1)]
    .loc[(df.page_id_fea != df.page_id_ful) | (df.page_id_ful != df.page_id_dif)]
    .pipe(lambda x: print(f"There are {x.shape[0]} rows where page_id doesn't match\n")))

# Check which page_id columns have how-many nulls
df[df.columns[df.columns.str.startswith('page_id')]].isna().sum().pipe(print)

# Drop the redundant page_id columns
df = df[df.columns.difference(['page_id_ful', 'page_id_dif'])].rename(columns={'page_id_fea':'page_id'})

There are 0 rows where page_id doesn't match

page_id_fea     0
page_id_ful    16
page_id_dif    18
dtype: int64


#### Remove redundate hoax columns

In [6]:
# Confirm that hoax is similarly a match
(df[df.columns[df.columns.str.startswith('hoax')]]
    .loc[~df.isna().any(axis=1)] 
    .loc[(df.hoax_fea != df.hoax_dif)] 
    .pipe(lambda x: print(f"There are {x.shape[0]} rows where hoax doesn't match\n")))

# Check which hoax columns have how-many nulls
df[df.columns[df.columns.str.startswith('hoax')]].isna().sum().pipe(print)

# Drop the redundant hoax columns
df = df[df.columns.difference(['hoax_dif', 'hoax_ful'])].rename(columns={'hoax_fea':'hoax'})

There are 0 rows where hoax doesn't match

hoax_dif    18
hoax_fea     0
hoax_ful    16
dtype: int64


#### Confirm that no observations were lost, and clean namespace

In [7]:
df.shape

(2788, 25)

In [8]:
del fea, ful, dif, ful_pos, ful_neg, dif_pos, dif_neg

### data dictionary

In [9]:
mtdt = pd.read_csv('../data/raw/schemas/schema_wiki-reliability.tsv', sep='\t').set_index('Field').squeeze()
mtdt.values

array(['Page ID of the revision', 'ID of the revision',
       'ID of the corresponding pos/neg revision',
       'Change in bytes of revision text',
       'Average length of stemmed text', 'Count of images in tags',
       'Count of infobox templates',
       'Total length of paragraphs without references',
       'Number of shortened footnotes (i.e., citations with page numbers linking to the full citation for a source)',
       "Count of matches from Wikipedia's words to watch: words that are flattering, vague or endorsing a viewpoint",
       'Count of words for the revision',
       'Number of characters in the full article',
       'Number of characters in the content section of an article',
       'Count of external links not in Wikipedia',
       'Count of level-2 headings',
       'Count of reference tags, indicating the presence of a citation',
       'Count of links to pages on Wikipedia',
       'Letter grade of article quality prediction',
       'Count of templates that 

## Predict pooled without text (1)

### data prep

In [10]:
df1 = df[df.columns.difference(['dif','ful', 'revision_id.key', 'revision_text_bytes'])]
Xcols1 = df1.columns.difference(['hoax', 'headings_by_level(2)'])

In [11]:
y,X = patsy.dmatrices('hoax ~ '+' + '.join(Xcols1), data=df1, return_type='dataframe')

In [12]:
Xtr, Xte, ytr, yte = train_test_split(X,y, test_size=.2, random_state=0)

In [13]:
# scaler = StandardScaler()
# def scale_reindex(df):
#     matrix = scaler.fit_transform(df)
#     return pd.DataFrame(data=matrix, index=df.index, columns=df.columns)

# [Xtr,Xte] = map(scale_reindex, [Xtr,Xte])

### select features

In [14]:
from sklearn.feature_selection import SelectKBest

In [15]:
skb1 = SelectKBest(k=20)
skb1.fit(X,np.ravel(y))
feats1 = X.columns[skb1.get_support()]
feats1 = [i for i in feats1.tolist() if not i.find('page_id') > -1]

  f = msb / msw


In [16]:
feats1

['article_quality_score[T.C]',
 'cite_templates',
 'cn_templates',
 'external_links',
 'paragraphs_without_refs',
 'revision_chars',
 'revision_content_chars',
 'revision_templates',
 'revision_wikilinks',
 'revision_words',
 'stems_length',
 'words_to_watch_matches']

### initialize and fit

In [17]:
logcv = LogisticRegressionCV(Cs=np.logspace(-4, -3, num=3), random_state=0)
logcv.fit(Xtr[feats1], np.ravel(ytr))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV(Cs=array([0.0001    , 0.00031623, 0.001     ]),
                     random_state=0)

#### check results

In [18]:
logcv.C_

array([0.001])

In [19]:
logcv.score(Xte[feats1], yte)

0.6140035906642729

In [20]:
yte.mean()

hoax    0.517056
dtype: float64

#### review fitted coefficients

In [21]:
coeffs = pd.Series(logcv.coef_.flatten(), index=feats1).squeeze().sort_values(ascending=False, key=abs)

In [22]:
coeffs

external_links               -0.011305
revision_templates            0.008766
words_to_watch_matches        0.005868
cn_templates                  0.001983
revision_words               -0.000922
cite_templates                0.000773
article_quality_score[T.C]    0.000627
revision_wikilinks            0.000340
revision_content_chars        0.000112
stems_length                  0.000092
revision_chars               -0.000030
paragraphs_without_refs       0.000013
dtype: float64