In [1]:
import pandas as pd

# Base data directory (relative path) and its 'interim/' sub-directory.
DATA_PATH = '../data/'
DATA_INTERIM_PATH = ''.join([DATA_PATH, 'interim/'])

In [2]:
import sys

# Make the modules under ../src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again and again.
SRC_PATH = '../src'
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

# Enable IPython's autoreload extension. Mode 1 only reloads modules that
# have been registered with %aimport before each cell execution.
%load_ext autoreload
%autoreload 1

In [3]:
from datatasks.sample_data import sample_data
# Register the module with autoreload so edits to its source are picked up
# without restarting the kernel.
%aimport datatasks.sample_data

In [4]:
from sklearn.pipeline import Pipeline

In [7]:
import glob

DATA_PROCESSED_PATH = DATA_PATH + 'processed/'


def _first_match(pattern):
    """Return the first path, in sorted order, that matches ``pattern``.

    glob.glob() returns matches in arbitrary order and an empty list when
    nothing matches, so indexing its result directly is non-deterministic
    and fails with an uninformative IndexError. Sorting makes the choice
    reproducible; an explicit error names the pattern that failed.

    Raises:
        FileNotFoundError: if no file matches ``pattern``.
    """
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError('No file matches pattern: ' + pattern)
    return matches[0]


# Locate the training and validation CSVs
train_path = _first_match(DATA_PROCESSED_PATH + 'train*.csv')
val_path = _first_match(DATA_PROCESSED_PATH + 'val*.csv')

# Load training and validation data
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a saved index; consider index_col=0 if it is not needed as a feature.
train = pd.read_csv(train_path)
val = pd.read_csv(val_path)

In [8]:
# Preview the first rows of the training frame to check the loaded columns.
train.head()

Unnamed: 0,id,published-at,title,article_text,external_links,internal_links,hyperpartisan,bias,url,labeled-by,HP_links_count,nonHP_links_count,unknown_links_count,domain,preprocessed_text
0,1049192,2003-05-12,"Time Inc.'s Huey thinks outside the box, makes...","Time Inc.'s Huey thinks outside the box, makes...",{},{},False,least,https://poynter.org/news/time-incs-huey-thinks...,publisher,0,0,0,poynter,time inc huey think outside box make enemy new...
1,923492,,Texas group to open southern NM abortion clinic,Texas group to open southern NM abortion clini...,{'http://goo.gl/6s2S93': 'The Las Cruces Sun-N...,{},False,least,https://abqjournal.com/456723/texas-group-to-o...,publisher,0,0,1,abqjournal,texas group open southern nm abortion clinic l...
2,1277816,,Texas woman pleads guilty to sex trafficking 9...,Texas woman pleads guilty to sex trafficking 9...,{},{},False,least,https://abqjournal.com/1044515/texas-woman-ple...,publisher,0,0,0,abqjournal,texas woman plead guilty sex traffic year old ...
3,777348,2018-01-11,"17 dead in California mudslides, more than a d...","17 dead in California mudslides, more than a d...",{},{},False,least,https://apnews.com/amp/67ec5e87bbb74130b7ddedc...,publisher,0,0,0,apnews,dead california mudslide dozen miss montecito ...
4,89424,,County needs comprehensive effort to generate ...,County needs comprehensive effort to generate ...,{'http://www.sandovalcounty.com/uploads/Downlo...,{},False,least,https://abqjournal.com/309627/county-needs-com...,publisher,0,0,1,abqjournal,county need comprehensive effort generate need...


In [9]:
# Separate the label column from the remaining columns for both splits.
# The *_test names hold the validation split (`val`) loaded earlier.
target_col = 'hyperpartisan'
X_train, y_train = train.drop(target_col, axis='columns'), train[target_col]
X_test, y_test = val.drop(target_col, axis='columns'), val[target_col]

In [10]:
# CREDIT: https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines

from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column from the input by label.

    Intended for text columns: the column is returned via single-label
    indexing (``X[key]``), which for a pandas DataFrame yields a Series
    that text vectorizers can consume directly.

    Parameters
    ----------
    key : label of the column to extract.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; return self so the transformer
        # can be chained inside a Pipeline.
        return self

    def transform(self, X):
        # Single-label lookup (no list wrapper).
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column from the input by label.

    Intended for numeric columns: the column is requested with a
    one-element list (``X[[key]]``), which for a pandas DataFrame yields a
    single-column, two-dimensional frame rather than a Series.

    Parameters
    ----------
    key : label of the column to extract.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; return self so the transformer
        # can be chained inside a Pipeline.
        return self

    def transform(self, X):
        # List-of-labels lookup keeps the result two-dimensional.
        wanted = [self.key]
        return X[wanted]

In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Text branch: select the preprocessed article text and vectorize it with
# TF-IDF (English stop words removed).
text = Pipeline([
    ('selector', TextSelector(key='preprocessed_text')),
    ('tfidf', TfidfVectorizer(stop_words='english'))
])

# Numeric branches: each one passes a single link-count column through as-is.
HP_links = Pipeline([
    ('selector', NumberSelector(key='HP_links_count'))
])

nonHP_links = Pipeline([
    ('selector', NumberSelector(key='nonHP_links_count'))
])

# NOTE(review): this branch is built but is not part of `feats` below —
# confirm whether leaving it out of the model is intentional.
unknown_links = Pipeline([
    ('selector', NumberSelector(key='unknown_links_count'))
])

# Concatenate the TF-IDF matrix with the two link-count columns.
feats = FeatureUnion([
    ('text', text),
    ('HP_links', HP_links),
    ('nonHP_links', nonHP_links)
])

pipeline = Pipeline([
    ('features', feats),
    # class_weight must be the None object (no re-weighting), not the string
    # 'None': the string is not a valid option and is rejected by
    # scikit-learn's parameter validation in recent releases.
    ('classifier', LogisticRegression(C=30.0, class_weight=None, solver='newton-cg')),
])

In [12]:
# Fit the feature union and the classifier on the training split.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='preprocessed_text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', i...ty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False))])

In [13]:
# Predict labels for X_test (the frame derived from the validation split).
preds = pipeline.predict(X_test)

In [14]:
from models.models import evaluate_model

  from numpy.core.umath_tests import inner1d


In [15]:
# Report classification metrics for the validation predictions.
# NOTE(review): predictions are passed first and true labels second —
# confirm this matches evaluate_model's expected argument order.
evaluate_model(preds, y_test)

             precision    recall  f1-score   support

      False       0.71      0.48      0.57      1249
       True       0.61      0.80      0.69      1250

avg / total       0.66      0.64      0.63      2499

Accuracy: 0.6403


0.6402561024409764

In [16]:
# Collapse each link counter into a 0/1 "has at least one such link" flag.
# NOTE(review): this mutates train/val in place after X_train/X_test were
# derived from them with drop(); those frames presumably keep the raw counts
# until the split cell is re-run — confirm.
LINK_COUNT_COLS = ['HP_links_count', 'nonHP_links_count', 'unknown_links_count']
for df in [train, val]:
    for col in LINK_COUNT_COLS:
        # Vectorized comparison instead of a per-row Python lambda; missing
        # values compare False and therefore still map to 0.
        df.loc[:, col] = df[col].gt(0).astype(int)

In [85]:
# Label distribution among training rows whose unknown-links flag equals 1.
has_unknown = train['unknown_links_count'] == 1
train.loc[has_unknown, 'hyperpartisan'].value_counts()

True     277
False    189
Name: hyperpartisan, dtype: int64

In [17]:
# Inspect the container type of X_train.
type(X_train)

pandas.core.frame.DataFrame