In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('sample_article_urls.csv', index_col=0)

In [4]:
df.head()

Unnamed: 0,urls,wanted
0,https://medium.com/thrive-global/how-technolog...,y
1,https://deadspin.com/the-sherpa-of-new-york-18...,y
2,https://waitbutwhy.com/2016/09/marriage-decisi...,y
3,https://www.gq.com/story/golden-age-of-tax-fraud,y
4,https://www.newyorker.com/magazine/2017/04/03/...,y


In [5]:
df.urls[0]

'https://medium.com/thrive-global/how-technology-hijacks-peoples-minds-from-a-magician-and-google-s-design-ethicist-56d62ef5edf3'

In [6]:
resp_1 = requests.get(df.urls[0])

In [7]:
resp_1

<Response [200]>

In [8]:
soup_1 = BeautifulSoup(resp_1.text, 'html.parser')

In [9]:
soup_1.title

<title>How Technology is Hijacking Your Mind — from a Former Insider</title>

In [10]:
text = soup_1.find('article').text

In [11]:
df_text = pd.DataFrame([text], columns=['text'])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [13]:
vect = TfidfVectorizer(ngram_range=(1,3), stop_words = 'english')

In [14]:
vect.fit_transform(df_text['text'])

<1x4571 sparse matrix of type '<class 'numpy.float64'>'
	with 4571 stored elements in Compressed Sparse Row format>

In [15]:
vect.get_feature_names()[:10]

['10',
 '10 forecasting',
 '10 forecasting errors',
 '12',
 '12 minutes',
 '12 minutes easier',
 '140',
 '140 calories',
 '140 calories tech',
 '150']

### Larger Examples with Pocket and EmbedLy

In [16]:
df.urls[0]

'https://medium.com/thrive-global/how-technology-hijacks-peoples-minds-from-a-magician-and-google-s-design-ethicist-56d62ef5edf3'

In [17]:
requests.get('https://api.embedly.com/1/extract?url='+ df.urls[0] + '&key=e8283b5ab870410ababbadcbe5a1e837')

<Response [200]>

In [18]:
from embedly import Embedly

In [19]:
client = Embedly('')

In [20]:
client

<embedly.client.Embedly at 0x1182329b0>

In [21]:
client.oembed('https://www.fastcompany.com/3040223/when-it-clicks-it-clicks')

{'provider_url': 'https://www.fastcompany.com', 'description': "Every September, largely unbeknownst to the rest of the company, a group of around 50 Lego employees descends upon Spain's Mediterranean coast, armed with sunblock, huge bins of Lego bricks, and a decade's worth of research into the ways children play.", 'title': 'How Lego Became The Apple Of Toys', 'thumbnail_width': 1280, 'url': 'https://www.fastcompany.com/3040223/when-it-clicks-it-clicks', 'thumbnail_url': 'https://images.fastcompany.net/image/upload/w_1280,f_auto,q_auto,fl_lossy/fc/3040223-poster-p-1-192-lego-when-it-clicks-it-clicks.jpg', 'version': '1.0', 'provider_name': 'Fast Company', 'type': 'link', 'thumbnail_height': 720}

In [22]:
df

Unnamed: 0,urls,wanted
0,https://medium.com/thrive-global/how-technolog...,y
1,https://deadspin.com/the-sherpa-of-new-york-18...,y
2,https://waitbutwhy.com/2016/09/marriage-decisi...,y
3,https://www.gq.com/story/golden-age-of-tax-fraud,y
4,https://www.newyorker.com/magazine/2017/04/03/...,y
5,https://www.menshealth.com/entertainment/a2252...,y
6,https://qz.com/884448/every-successful-relatio...,y
7,https://www.wired.com/story/forrest-fenn-treas...,y
8,http://www.espn.com/espn/feature/story/_/id/24...,y
9,https://www.huffingtonpost.com/entry/funniest-...,y


In [23]:
def get_html(url):
    resp = requests.get('https://api.embedly.com/1/extract?url=' + url + '&key=')
    ctnt = json.loads(resp.text).get('content')
    return ctnt

In [24]:
df.loc[:, 'html'] = df['urls'].map(get_html)

In [25]:
df

Unnamed: 0,urls,wanted,html
0,https://medium.com/thrive-global/how-technolog...,y,"<div>\n<blockquote>""It's easier to fool people..."
1,https://deadspin.com/the-sherpa-of-new-york-18...,y,"<div>\n<p>""This was the original price,"" Serap..."
2,https://waitbutwhy.com/2016/09/marriage-decisi...,y,<div>\n<p>There's not really any normal way to...
3,https://www.gq.com/story/golden-age-of-tax-fraud,y,<div>\n<p>Are you an independent contractor? H...
4,https://www.newyorker.com/magazine/2017/04/03/...,y,"<div>\n<h2>In Sweden, hundreds of refugee chil..."
5,https://www.menshealth.com/entertainment/a2252...,y,"<div>\n<iframe src=""https://hips.hearstapps.co..."
6,https://qz.com/884448/every-successful-relatio...,y,"<div>\n<p>Hey, guess what? I got married two w..."
7,https://www.wired.com/story/forrest-fenn-treas...,y,<div>\n<section><div>\n<p>Everybody is searchi...
8,http://www.espn.com/espn/feature/story/_/id/24...,y,<div>\n<p><b>HE MADE A</b> real effort to keep...
9,https://www.huffingtonpost.com/entry/funniest-...,y,"<div>\n<p><a href=""https://www.huffingtonpost...."


In [26]:
df['html'][0][:100]

'<div>\n<blockquote>"It\'s easier to fool people than to convince them that they\'ve been fooled."&#8202'

Drop NONE

In [27]:
df_clean = df[df.html.isnull() == False]

In [28]:
soup = BeautifulSoup(df_clean['html'][0], 'html.parser')

In [29]:
soup.get_text()[:100]

'\n"It\'s easier to fool people than to convince them that they\'ve been fooled."\u200a-\u200aUnknown.\nI\'m an expe'

In [30]:
def get_text(x):
    soup = BeautifulSoup(x, 'html.parser')
    text = soup.get_text()
    return text

In [31]:
get_text(df_clean.html[0])

'\n"It\'s easier to fool people than to convince them that they\'ve been fooled."\u200a-\u200aUnknown.\nI\'m an expert on how technology hijacks our psychological vulnerabilities. That\'s why I spent the last three years as a Design Ethicist at Google caring about how to design things in a way that defends a billion people\'s minds from getting hijacked.\nWhen using technology, we often focus optimistically on all the things it does for us. But I want to show you where it might do the opposite.\nWhere does technology exploit our minds\' weaknesses?\nI learned to think this way when I was a magician. Magicians start by looking for blind spots, edges, vulnerabilities and limits of people\'s perception, so they can influence what people do without them even realizing it. Once you know how to push people\'s buttons, you can play them like a piano.\nAnd this is exactly what product designers do to your mind. They play your psychological vulnerabilities (consciously and unconsciously) agains

In [32]:
df_clean.loc[:, 'text'] = df_clean['html'].map(get_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [33]:
df_clean['text'][0][:100]

'\n"It\'s easier to fool people than to convince them that they\'ve been fooled."\u200a-\u200aUnknown.\nI\'m an expe'

### Building the Classifier

In [34]:
tf = TfidfVectorizer(stop_words='english')

In [35]:
X = tf.fit_transform(df_clean.text)

In [36]:
X

<54x8687 sparse matrix of type '<class 'numpy.float64'>'
	with 37172 stored elements in Compressed Sparse Row format>

In [37]:
y = df_clean.wanted

In [38]:
y

0     y
1     y
2     y
3     y
4     y
5     y
6     y
7     y
8     y
9     y
10    y
11    y
12    y
13    y
14    y
16    y
17    y
18    y
19    y
20    y
21    y
22    y
24    y
25    y
26    y
27    y
28    y
29    n
30    n
31    n
32    n
33    n
34    n
35    n
36    n
37    n
38    n
39    n
40    n
41    n
42    n
43    n
45    n
46    n
47    n
48    n
49    n
50    n
51    n
53    n
54    n
55    n
56    n
57    n
Name: wanted, dtype: object

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
clf = LogisticRegression()

In [41]:
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
clf.score(X, y)

0.5

In [43]:
clf.predict(X)

array(['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n'], dtype=object)

In [44]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [45]:
params = {'C': [0.1, 1.0, 4.0, 10.0, 20.0, 50, 100, 1000]}

In [46]:
grid = GridSearchCV(clf, param_grid=params)

In [47]:
grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 4.0, 10.0, 20.0, 50, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [48]:
grid.best_estimator_.predict(X)

array(['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n'], dtype=object)

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [51]:
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 4.0, 10.0, 20.0, 50, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
grid.best_estimator_.predict(X_test)

array(['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n'], dtype=object)

In [53]:
y_test

27    y
21    y
31    n
25    y
41    n
56    n
34    n
22    y
1     y
4     y
54    n
19    y
7     y
8     y
Name: wanted, dtype: object

In [54]:
y_train

57    n
17    y
18    y
46    n
0     y
40    n
24    y
32    n
30    n
49    n
47    n
11    y
20    y
14    y
9     y
33    n
43    n
10    y
48    n
29    n
28    y
12    y
36    n
3     y
55    n
39    n
2     y
6     y
37    n
45    n
53    n
42    n
13    y
38    n
35    n
26    y
50    n
51    n
5     y
16    y
Name: wanted, dtype: object

In [55]:
from sklearn.model_selection import ShuffleSplit

In [56]:
cv = ShuffleSplit(n_splits=2, test_size=0.1)

In [57]:
grid = GridSearchCV(clf, param_grid=params, cv = cv)

In [58]:
grid.fit(X_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=2, random_state=None, test_size=0.1, train_size=None),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 4.0, 10.0, 20.0, 50, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
grid.best_estimator_.predict(X_test)

array(['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n'], dtype=object)

In [60]:
grid.score(X_test, y_test)

0.35714285714285715

In [61]:
y_test

27    y
21    y
31    n
25    y
41    n
56    n
34    n
22    y
1     y
4     y
54    n
19    y
7     y
8     y
Name: wanted, dtype: object

In [62]:
from sklearn.svm import SVC

In [63]:
svc = SVC()

In [64]:
grid = GridSearchCV(svc, param_grid=params, cv = cv)

In [65]:
grid.fit(X_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=2, random_state=None, test_size=0.1, train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 4.0, 10.0, 20.0, 50, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [66]:
grid.score(X_test, y_test)

0.35714285714285715

In [67]:
grid.best_estimator_.predict(X_test)

array(['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n'], dtype=object)

In [68]:
grid.best_estimator_.predict(X_train)

array(['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',
       'n'], dtype=object)

In [69]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
dtree = DecisionTreeClassifier()

In [75]:
params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8]}
grid = GridSearchCV(dtree, param_grid=params, cv = cv)

In [76]:
grid.fit(X_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=2, random_state=None, test_size=0.1, train_size=None),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)