# Word Pipeline
---

In [1]:
import time

from gazpacho import get, Soup
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn_pandas import DataFrameMapper, FunctionTransformer
from tqdm import tqdm

# Fetch NLTK's stop-word corpus, which `stopwords.words('english')` reads later
# in the notebook (data that is already present is reported as up-to-date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/max/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def scrape(url, page=1):
    """Fetch one listing page from a supported site and return its story titles.

    Parameters
    ----------
    url : str
        Base URL of the site. A URL containing ``'datatau'`` is paged as
        ``{url}/{page}``; one containing ``'ycombinator'`` is paged as
        ``{url}/news?p={page}``.
    page : int, default 1
        Page number to request.

    Returns
    -------
    list of str or None
        Text of every ``<a class="storylink">`` element found on the page
        (an empty list when there are none), or ``None`` when ``url`` is not
        one of the supported sites.
    """
    if 'datatau' in url:
        url = f'{url}/{page}'
    elif 'ycombinator' in url:
        url = f'{url}/news?p={page}'
    else:
        # Unsupported site: nothing to request.
        return
    html = get(url)
    soup = Soup(html)
    links = soup.find('a', {'class': 'storylink'})
    # In its default mode `find` may hand back None (no match), a single Soup
    # (exactly one match) or a list (several matches). Normalise to a list so
    # the comprehension below never iterates over None or a bare Soup.
    if links is None:
        links = []
    elif not isinstance(links, list):
        links = [links]
    return [link.text for link in links]

In [3]:
# Scrape the first two listing pages of each site into one small frame per
# page and combine them with a single concat at the end. `DataFrame.append`
# was removed in pandas 2.0, and appending inside the loop re-copies the
# growing frame on every pass.
frames = []

for w in tqdm(['https://news.ycombinator.com', 'https://datatau.net']):
    for p in [1, 2]:
        data = scrape(w, p)
        d = pd.DataFrame(data, columns=['title'])
        d['website'] = w  # remember which site each title came from
        frames.append(d)
        time.sleep(1)  # wait a second between consecutive requests

# Row labels are kept as-is here, exactly as the per-page frames produced them.
df = pd.concat(frames)

100%|██████████| 2/2 [00:08<00:00,  4.14s/it]


In [4]:
# Each scraped page contributed rows labelled from 0, so the combined frame has
# repeated labels; rebuild one contiguous index and discard the old one.
df = df.reset_index(drop=True)

In [5]:
# Encode the source site as the binary target: 1 = Hacker News, 0 = DataTau.
# The explicit cast pins an integer dtype instead of relying on `replace`
# implicitly downcasting the object column (deprecated since pandas 2.2).
# Re-running the cell is harmless: already-encoded values are left untouched.
df['website'] = (
    df['website']
    .replace({'https://news.ycombinator.com': 1, 'https://datatau.net': 0})
    .astype(int)
)

In [6]:
# Features stay a one-column DataFrame (the list selector keeps it 2-D) so
# later steps can pick 'title' by name; the target is the 'website' Series.
X = df.loc[:, ['title']]
y = df.loc[:, 'website']

In [7]:
# Hold out 33% of the titles for evaluation; the fixed seed makes the split
# repeatable for the same input rows.
splits = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = splits

In [8]:
# Peek at the first few training titles (the index shows the shuffled row labels).
X_train.head()

Unnamed: 0,title
28,"The Y2K bug is back, causing headaches for dev..."
42,Away’s former CEO is returning as its co-chief
5,Why Video Gaming Will Take Over
55,iOS 13 app tracking alert has dramatically cut...
64,"AI Listens to Panda Love Sounds, Predicts Mati..."


In [9]:
def _title_tools():
    """Build the tokenizer, English stop-word set and stemmer once, then reuse them."""
    tools = getattr(_title_tools, 'cached', None)
    if tools is None:
        tools = (
            RegexpTokenizer(r'\w+'),
            frozenset(stopwords.words('english')),
            PorterStemmer(),
        )
        # Cache on the function object so later calls skip the rebuild
        # (the stop-word list in particular is loaded from the NLTK corpus).
        _title_tools.cached = tools
    return tools


def title_to_tokens(title):
    """Normalise a headline into a space-separated string of stemmed tokens.

    The title is lower-cased, split into ``\\w+`` runs, stripped of English
    stop words, and each remaining word is reduced with the Porter stemmer.

    Parameters
    ----------
    title : str
        Raw headline text.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces (an empty
        string when no token survives).
    """
    tokenizer, stops, stemmer = _title_tools()
    words = tokenizer.tokenize(title.lower())
    return " ".join(stemmer.stem(word) for word in words if word not in stops)

In [10]:
# Preprocessing for the 'title' column: clean each headline with
# `title_to_tokens`, then turn the cleaned text into word counts.
title_steps = [FunctionTransformer(title_to_tokens), CountVectorizer()]
mapper = DataFrameMapper([('title', title_steps)], df_out=True)

In [11]:
# Classifier for the binary 'website' label, left at its default settings.
model = LogisticRegression()

In [12]:
# Chain preprocessing and classifier so raw title frames can be fitted and
# scored with a single object.
pipe = make_pipeline(mapper, model)

In [13]:
# Fit the mapper and the classifier on the training split only.
pipe.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('dataframemapper',
                 DataFrameMapper(default=False, df_out=True,
                                 features=[('title',
                                            [FunctionTransformer(func=None),
                                             CountVectorizer(analyzer='word',
                                                             binary=False,
                                                             decode_error='strict',
                                                             dtype=<class 'numpy.int64'>,
                                                             encoding='utf-8',
                                                             input='content',
                                                             lowercase=True,
                                                             max_df=1.0,
                                                             max_features=None,
                                     

In [14]:
# Evaluate on the held-out titles (for a classifier, `score` is mean accuracy).
pipe.score(X_test, y_test)

0.7894736842105263

In [15]:
# Score a single unseen headline. Column 1 of `predict_proba` is the
# probability of label 1, which is how Hacker News titles were encoded earlier.
headline = 'Pijul: a distributed version control system, written in Rust (2019)'
new = pd.DataFrame({'title': [headline]})

pipe.predict_proba(new)[0, 1]

0.6312770829353793