In [1]:
%qtconsole

In [2]:
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
import urllib
import numpy as np
import pandas as pd


In [3]:
# Article pages (Revista Galileu) that make up the corpus, in processing order.
source_urls = [
    'http://revistagalileu.globo.com/Ciencia/noticia/2017/08/foto-iconica-de-albert-einstein-e-leiloada-por-125-mil-dolares.html',
    'http://revistagalileu.globo.com/Sociedade/noticia/2017/08/dermatologista-celebridade-explica-sucesso-de-videos-de-cravos-nojentos.html',
    'http://revistagalileu.globo.com/Game-of-Thrones/noticia/2017/08/11-reacoes-ao-vazamento-do-novo-episodio-de-game-thrones.html',
]
                   

                   

### Load Raw Text

In [8]:
def get_raw_text(url):
    """Download an article page and return the cleaned text of its body.

    The body is taken from the first ``<div>`` (in document order) whose
    first CSS class is ``ctx_content``. Two clean-ups are applied to its
    text: parenthesised spans are replaced by a single space, and on any
    line containing ``Leia mais`` that marker and the rest of the line are
    removed.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    unicode text
        The cleaned body text, or ``u''`` when the page has no matching
        ``<div>``.
    """
    # Imported locally so the cell works on its own. ``urllib.urlopen`` only
    # exists on Python 2; Python 3 moved it to ``urllib.request``.
    from contextlib import closing
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    # ``closing`` releases the connection on both Python 2 (whose response
    # object is not a context manager) and Python 3.
    with closing(urlopen(url)) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'html.parser')
    for div in soup.find_all('div'):
        # Divs without a class attribute are skipped explicitly instead of
        # relying on a swallowed KeyError.
        classes = div.get('class')
        if not classes or classes[0] != u'ctx_content':
            continue
        text = div.get_text()
        # NOTE(review): the character class excludes '(' rather than ')', so
        # a match can run past the first ')' — confirm this is intended.
        text = re.sub(r'\([^\(]+\)', ' ', text)
        return re.sub(r'Leia mais(.+)', '', text)
    return u''

In [9]:
# Download and clean every article; entries stay aligned with source_urls.
raw_texts = list(map(get_raw_text, source_urls))

### Feature Extraction

In [15]:
import nltk

In [19]:
# Word-tokenize each article. The articles are Portuguese, so the Portuguese
# Punkt model is selected explicitly; word_tokenize defaults to English.
raw_tokens = [nltk.word_tokenize(text, language='portuguese') for text in raw_texts]

# Wrap every token list in an nltk.Text (its len() is the token count).
texts = [nltk.Text(tokens) for tokens in raw_tokens]

#### 1. Character count

In [27]:
# Length in characters of each cleaned article.
Character_cnts = list(map(len, raw_texts))

#### 2. Raw word count including stopwords

In [29]:
# Token count of each article, stopwords and punctuation tokens included.
Word_cnts_raw = list(map(len, texts))

#### 3. Word count excluding stopwords

In [31]:
# Portuguese stopword list from NLTK's stopwords corpus; used to filter tokens.
stopwords = nltk.corpus.stopwords.words('portuguese')

In [338]:
# Lower-cased content tokens per article: stopwords, purely numeric tokens and
# single-character tokens are dropped.
# The stopword test is done on the lower-cased token so that capitalised
# stopwords (e.g. at the start of a sentence) are removed too; a set makes
# each membership test constant-time.
_stopword_set = set(s.lower() for s in stopwords)
filtered_tokens = [
    [w.lower() for w in tokens
     if w.lower() not in _stopword_set and not w.isnumeric() and len(w) > 1]
    for tokens in raw_tokens
]
# Number of tokens that survive the filter, per article.
Word_cnts_filtered = [len(tokens) for tokens in filtered_tokens]

#### 4. Number of sentences and number of sentences with >170 characters

In [49]:
# Split each article into sentences. The text is Portuguese, so the Portuguese
# Punkt model is requested; sent_tokenize defaults to the English one.
raw_sents = [nltk.sent_tokenize(text, language='portuguese') for text in raw_texts]

In [58]:
# Number of sentences in each article.
Sent_cnts = list(map(len, raw_sents))

In [78]:
# Per article, how many sentences are longer than 170 characters.
Long_sent_cnts = [sum(1 for s in sents if len(s) > 170) for sents in raw_sents]

#### 5. Average sentence length (by token)

In [88]:
# Mean number of tokens per sentence for each article.
# An article with no sentences (e.g. get_raw_text returned u'') gets 0.0
# instead of raising ZeroDivisionError.
avg_sent_len = [
    float(n_words) / float(n_sents) if n_sents else 0.0
    for n_words, n_sents in zip(Word_cnts_raw, Sent_cnts)
]

### POS Tagger

In [93]:
from nltk.corpus import floresta

In [97]:
# Last-resort tagger that assigns the tag 'n' to every token; it is the
# backoff of Tagger1.
Tagger0 = nltk.DefaultTagger('n')

In [98]:
def simplify_tag(t):
    """Return the part of tag ``t`` that follows its first ``+``.

    A tag that contains no ``+`` is returned unchanged.
    """
    # The pasted REPL continuation prompts ("...") were removed: they are only
    # tolerated by IPython's input filter and are a syntax error elsewhere.
    if "+" in t:
        return t[t.index("+") + 1:]
    return t

In [100]:
# Training sentences for the taggers: every non-empty Floresta sentence, with
# words lower-cased and tags reduced by simplify_tag.
tsents = [
    [(word.lower(), simplify_tag(tag)) for (word, tag) in tagged_sent]
    for tagged_sent in floresta.tagged_sents()
    if tagged_sent
]

In [105]:
# Unigram tagger trained on tsents; falls back to Tagger0 via `backoff`.
Tagger1 = nltk.UnigramTagger(tsents, backoff=Tagger0)

In [106]:
# Bigram tagger trained on tsents; falls back to Tagger1 via `backoff`.
# This is the tagger used to tag the article sentences.
Tagger2 = nltk.BigramTagger(tsents, backoff=Tagger1)

#### Add unigram pos and bigram pos features
1. Use the first k texts (k=2 for now) as training data to collect every POS unigram and POS bigram that occurs.
2. For the test data, only consider the POS unigrams and POS bigrams selected from the training data.

In [250]:
# Number of leading documents used as training data; the rest are test data.
k = 2

#### Process training data

In [253]:
# The first k documents' sentences are the training set; the remainder are
# held out as the test set.
train_sents, test_sents = raw_sents[:k], raw_sents[k:]

In [226]:
# Collect POS-tag unigram and bigram counts for every training document.
#   all_unigram / all_bigram      -- every tag / adjacent tag pair seen
#   train_unigrams / train_bigrams -- one {gram: count} dict per document
all_unigram = set()
all_bigram = set()
train_unigrams = []
train_bigrams = []
for sents in train_sents:
    tui = {}
    tbi = {}
    for sent in sents:
        # NOTE(review): the taggers are trained on lower-cased words (tsents)
        # but tokens are tagged here as-is — consider lower-casing them, in
        # the training and test loops together.
        tsent = Tagger2.tag(nltk.word_tokenize(sent))
        tags = [tag for (word, tag) in tsent]
        # Walking the tag list (instead of indexing the last element) also
        # copes with a sentence that yields no tokens, which used to raise
        # IndexError.
        for tag in tags:
            all_unigram.add(tag)
            tui[tag] = tui.get(tag, 0) + 1
        # Adjacent tag pairs: (tags[0], tags[1]), (tags[1], tags[2]), ...
        for pair in zip(tags, tags[1:]):
            all_bigram.add(pair)
            tbi[pair] = tbi.get(pair, 0) + 1
    train_unigrams.append(tui)
    train_bigrams.append(tbi)


Only keep the n-grams that appear in at least L training documents.

In [312]:
# Document-frequency threshold used when selecting which n-grams to keep.
L = 2

In [245]:
# (tag, number of training documents that contain the tag)
use_cnt = [(x, sum(1 for tu in train_unigrams if x in tu)) for x in all_unigram]
# Keep tags found in at least L documents. The comparison is `>=`: the former
# strict `> L` could never hold when L equals the number of training documents
# (k = 2, L = 2), which left the selection empty.
use_unigram = set(x for (x, y) in use_cnt if y >= L)


In [246]:
# (tag pair, number of training documents that contain the pair)
use_cnt = [(x, sum(1 for tb in train_bigrams if x in tb)) for x in all_bigram]
# Keep pairs found in at least L documents. The comparison is `>=`: the former
# strict `> L` could never hold when L equals the number of training documents
# (k = 2, L = 2), which left the selection empty.
use_bigram = set(x for (x, y) in use_cnt if y >= L)


#### Process test data

In [255]:
# Count POS-tag unigrams and bigrams for every test document, keeping only the
# grams selected from the training data (use_unigram / use_bigram).
# test_unigrams / test_bigrams hold one {gram: count} dict per document.
test_unigrams = []
test_bigrams = []
for sents in test_sents:
    tui = {}
    tbi = {}
    for sent in sents:
        # NOTE(review): the taggers are trained on lower-cased words (tsents)
        # but tokens are tagged here as-is — consider lower-casing them, in
        # the training and test loops together.
        tsent = Tagger2.tag(nltk.word_tokenize(sent))
        tags = [tag for (word, tag) in tsent]
        # Walking the tag list (instead of indexing the last element) also
        # copes with a sentence that yields no tokens, which used to raise
        # IndexError.
        for tag in tags:
            if tag in use_unigram:
                tui[tag] = tui.get(tag, 0) + 1
        # Adjacent tag pairs: (tags[0], tags[1]), (tags[1], tags[2]), ...
        for pair in zip(tags, tags[1:]):
            if pair in use_bigram:
                tbi[pair] = tbi.get(pair, 0) + 1
    test_unigrams.append(tui)
    test_bigrams.append(tbi)

### Convert to numpy matrix

unigram

In [286]:
# Expand each document's unigram counts to the full set of selected tags
# (0 where a tag is absent), then take the values as a 2-D array with one row
# per document.
train_unigram0 = [{tag: counts.get(tag, 0) for tag in use_unigram} for counts in train_unigrams]
train_uni_mat = pd.DataFrame(train_unigram0).values

# Same expansion for the test documents, over the same selected tags.
test_unigram0 = [{tag: counts.get(tag, 0) for tag in use_unigram} for counts in test_unigrams]
test_uni_mat = pd.DataFrame(test_unigram0).values

bigram

In [289]:
# Expand each document's bigram counts to the full set of selected tag pairs
# (0 where a pair is absent), then take the values as a 2-D array with one row
# per document.
train_bigram0 = [{pair: counts.get(pair, 0) for pair in use_bigram} for counts in train_bigrams]
train_bi_mat = pd.DataFrame(train_bigram0).values

# Same expansion for the test documents, over the same selected pairs.
test_bigram0 = [{pair: counts.get(pair, 0) for pair in use_bigram} for counts in test_bigrams]
test_bi_mat = pd.DataFrame(test_bigram0).values

Others

In [304]:
# Number of documents; used as the row count when reshaping the per-document
# feature lists into column vectors.
N = len(source_urls)

In [305]:
# Turn every per-document feature list into an (N, 1) column vector.
Character_cnt_mat = np.asarray(Character_cnts).reshape((N, 1))
Word_cnts_raw_mat = np.asarray(Word_cnts_raw).reshape((N, 1))
Word_cnts_filtered_mat = np.asarray(Word_cnts_filtered).reshape((N, 1))
Sent_cnts_mat = np.asarray(Sent_cnts).reshape((N, 1))
Long_sent_cnts_mat = np.asarray(Long_sent_cnts).reshape((N, 1))
avg_sent_len_mat = np.asarray(avg_sent_len).reshape((N, 1))

In [306]:
# Place the six count/length columns side by side: one row per document.
Feature_mat1 = np.hstack([
    Character_cnt_mat,
    Word_cnts_raw_mat,
    Word_cnts_filtered_mat,
    Sent_cnts_mat,
    Long_sent_cnts_mat,
    avg_sent_len_mat,
])

In [315]:
# Rows of the first k documents are training rows; the remaining rows are test rows.
train_mat1, test_mat1 = Feature_mat1[:k], Feature_mat1[k:]

### Final feature matrix

In [317]:
# Final design matrices: count/length features, then POS unigram counts, then
# POS bigram counts, joined column-wise.
train_X = np.hstack([train_mat1, train_uni_mat, train_bi_mat])
test_X = np.hstack([test_mat1, test_uni_mat, test_bi_mat])

### Tf-idf

In [343]:
# Split the filtered token lists with the same k used for the other features.
train_filtered_tokens, test_filtered_tokens = filtered_tokens[:k], filtered_tokens[k:]

#### TF

In [364]:
# Term frequencies: one plain {token: count} dict per document.
train_tf = [dict(nltk.FreqDist(doc_tokens).items()) for doc_tokens in train_filtered_tokens]
test_tf = [dict(nltk.FreqDist(doc_tokens).items()) for doc_tokens in test_filtered_tokens]

In [352]:
from functools import reduce
#product = reduce((lambda x, y: x * y), [1, 2, 3, 4])

In [361]:
# Vocabulary: every token that occurs in at least one training document.
# set().union(*...) also handles zero or one training document (the reduce
# form raised on an empty list and returned a dict for a single document).
# Sorting gives the same token order on every run.
use_tokens = sorted(set().union(*train_tf))

In [365]:
# Column order is fixed explicitly so that column j of both matrices is the
# j-th token of use_tokens, the same order the idf vector is built in (it
# iterates use_tokens). Without `columns=`, pandas chooses the order itself
# (older releases sort the dict keys), which can pair a token's tf with
# another token's idf.
_tf_columns = list(use_tokens)

# Expand each document's term frequencies to the full vocabulary (0 if absent).
train_tf0 = [{u: tf.get(u, 0) for u in _tf_columns} for tf in train_tf]

train_tf_mat = pd.DataFrame(train_tf0, columns=_tf_columns).values

# Test documents use the training vocabulary; unseen tokens are ignored.
test_tf0 = [{u: tf.get(u, 0) for u in _tf_columns} for tf in test_tf]

test_tf_mat = pd.DataFrame(test_tf0, columns=_tf_columns).values

#### Idf

In [397]:
# Document frequency: for each vocabulary token, the number of training
# documents that contain it (same order as iterating use_tokens).
df = [sum(1 for doc_tf in train_tf if token in doc_tf) for token in use_tokens]

In [399]:
# Inverse document frequency as the plain reciprocal 1/df, shaped as a single
# row with one entry per vocabulary token.
inverse_doc_freq = [1.0 / doc_freq for doc_freq in df]
idf = np.array(inverse_doc_freq).reshape((1, len(use_tokens)))

#### Tf-idf

In [413]:
# Weight each training tf row by idf; the idf row is tiled once per document.
train_tf_idf = np.multiply(train_tf_mat, np.tile(idf, (train_tf_mat.shape[0], 1)))

In [423]:
# Weight each test tf row by the training idf; the idf row is tiled once per document.
test_tf_idf = np.multiply(test_tf_mat, np.tile(idf, (test_tf_mat.shape[0], 1)))

TODO: feed the tf-idf matrices to a neural network. Is some preprocessing needed first?