## Usando a biblioteca `html2text`

In [15]:
from html2text import html2text

# Smoke test: convert a minimal HTML paragraph and print the resulting text.
hello_html = "<p>Hello, world.</p>"
hello_text = html2text(hello_html)
print(hello_text)

Hello, world.




# Teste: exemplo Syngenta
## Abrindo o site principal e coletando os links de cada semente:

In [77]:
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

In [78]:
def scrap_colect_links(url, class_card, base_url='https://www.portalsyngenta.com.br'):
    """Collect the link of every seed card found on a listing page.

    Downloads ``url``, selects each ``<div>`` whose ``class`` matches
    ``class_card`` and resolves the ``href`` of the first ``<a>`` inside it
    against ``base_url``.

    Args:
        url: Address of the listing page to download.
        class_card: Value of the ``class`` attribute that identifies a card ``<div>``.
        base_url: Base used by ``urljoin`` to turn relative hrefs into absolute
            URLs. Defaults to the Syngenta portal root.

    Returns:
        list: Absolute URLs in page order. Cards that contain no ``<a>`` tag,
        or whose ``<a>`` has no ``href``, are skipped.
    """
    # The page is parsed while the response is open; the context manager then
    # closes the HTTP connection instead of leaving it to the garbage collector.
    with urlopen(url) as response:
        scrap = BeautifulSoup(response, 'html.parser')

    links = []
    for card in scrap.find_all('div', {'class': class_card}):
        anchor = card.a
        # `card.a` is None when the card has no <a>; `.get` returns None when
        # the attribute is missing. Either case is skipped rather than crashing.
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            continue
        links.append(urljoin(base_url, href))

    return links

In [81]:
# NOTE(review): `url` is not assigned in any cell visible in this notebook; it
# must hold the listing page address before this cell runs on a fresh kernel — confirm.
# The second argument is the `class` attribute value used to select the card <div>s.
links = scrap_colect_links(url, 'card-text-2 card-portfolio') 
links

['https://www.portalsyngenta.com.br/sementes/nk-soja/nk-8770-ipro',
 'https://www.portalsyngenta.com.br/sementes/nk-soja/nk-7777-ipro',
 'https://www.portalsyngenta.com.br/sementes/nk-soja/nk-8448-ipro',
 'https://www.portalsyngenta.com.br/sementes/nk-soja/nk-8301-ipro',
 'https://www.portalsyngenta.com.br/sementes/nk-soja/nk-7201-ipro',
 'https://www.portalsyngenta.com.br/sementes/nk-soja/nk-6201-ipro']

In [106]:
# Download every seed page, convert it to lower-cased text and store it in a
# dict keyed by the seed's URL slug (e.g. 'nk-8770-ipro').
paginas_sementes_nk = {}

for link in links:
    # Parse while the response is open, then let the context manager close it.
    with urlopen(link) as response:
        scrap = BeautifulSoup(response, 'html.parser')
    pagina = html2text(scrap.prettify()).lower()

    # Use the last path segment as the key; a fixed `link[-12:]` slice only
    # works while every slug happens to be exactly 12 characters long.
    nome_semente = link.rstrip('/').rsplit('/', 1)[-1]
    paginas_sementes_nk[nome_semente] = pagina

len(paginas_sementes_nk)

6

In [108]:
# Locate the 'cor da flor' (flower colour) label in the NK 8770 IPRO page and
# print the index of the first character that follows it.
rotulo = 'cor da flor'
i = paginas_sementes_nk['nk-8770-ipro'].find(rotulo)
# str.find returns -1 when the substring is absent (which is truthy) and 0 when
# it sits at the very start (which is falsy), so compare against -1 explicitly.
if i != -1:
    print(i + len(rotulo))


19878

## Referência do tutorial abaixo: https://towardsdatascience.com/exploratory-text-analysis-in-python-8cf42b758d9e

In [109]:
import nltk
# Fetch the NLTK data packages used by the text-analysis imports in this notebook.
# The call on the last line is the cell's final expression, so its return value
# is what the notebook displays as the cell output.
nltk.download('punkt') # for sent_tokenize
nltk.download('stopwords') # for nltk.corpus.stopwords
nltk.download('wordnet') # for WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /home/anandaheino/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anandaheino/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/anandaheino/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [110]:
# Data manipulation/analysis
import numpy as np
import pandas as pd

In [111]:
# Data partitioning
from sklearn.model_selection import train_test_split

In [112]:
# Text preprocessing/analysis
import re
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [113]:
# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling: white grid, 'talk' scaling and a two-colour palette.
paleta_cores = ['#D44D5C', '#43AA8B']
sns.set(style="whitegrid", context='talk', palette=paleta_cores)

## Transformando tudo em strings separadas:

In [120]:
# Split the lower-cased page text on whitespace. `test_string` is therefore a
# list of whitespace-delimited strings, not a single string.
test_string = paginas_sementes_nk['nk-8770-ipro'].split()
# A bare `len(...)` in the middle of a cell is evaluated and discarded; print it
# so the count actually appears in the output.
print(len(test_string))
print(test_string[:20])

['skip', 'to', 'main', 'content', 'pesquisa', '__', '![pesquisa](https://mediasyg.pixit.com.br/s3fs-public/search-icon-2.svg)', '[', '![facebook](https://mediasyg.pixit.com.br/s3fs-public/facebook-header.png)', '](https://www.facebook.com/syngenta/)', '[', '![instagram](https://mediasyg.pixit.com.br/s3fs-public/instagram-header.png)', '](https://www.instagram.com/syngentabrasil/)', '[', '![linkedin](https://mediasyg.pixit.com.br/s3fs-public/linkedin-header.png)', '](https://www.linkedin.com/company/syngenta/)', '[', '![youtube](https://mediasyg.pixit.com.br/s3fs-public/youtube-header.png)', '](https://www.youtube.com/user/syngentabrasil)', '[']


## Mostrando as 20 strings mais comuns:
💡 A token is a sequence of characters, typically a word.

💡 Tokenisation is a process of splitting a document into tokens and sometimes also throwing away certain characters such as punctuation. 
* Example: Tokenisation turns ‘This movie was awesome’ into 4 tokens: [‘This’, ‘movie’, ‘was’, ‘awesome’]

In [126]:
# Build the frequency table once, then keep the 20 most frequent entries as
# (string, count) pairs.
contagem = FreqDist(test_string)
frequentes20 = contagem.most_common(20)
frequentes20

[('[', 258),
 ('*', 242),
 ('|', 109),
 ('de', 91),
 ('e', 74),
 ('para', 30),
 ('a', 27),
 ('da', 25),
 ('no', 21),
 ('syngenta', 19),
 ('sementes', 18),
 ('o', 18),
 ('um', 17),
 ('voltar', 14),
 ('é', 14),
 ('®', 13),
 ('do', 13),
 ('em', 13),
 ('culturas', 12),
 ('das', 12)]

## Strings curtas: com < 4 caracteres

In [129]:
# Unique entries of `test_string` whose length is below 4.
# NOTE(review): the saved output of this cell shows (token, count) tuples, which
# suggests it was last run with `test_string` bound to a different value —
# re-run on a fresh kernel to confirm.
curtas3 = {token for token in test_string if len(token) < 4}

curtas3

{('*', 242),
 ('[', 258),
 ('a', 27),
 ('culturas', 12),
 ('da', 25),
 ('das', 12),
 ('de', 91),
 ('do', 13),
 ('e', 74),
 ('em', 13),
 ('no', 21),
 ('o', 18),
 ('para', 30),
 ('sementes', 18),
 ('syngenta', 19),
 ('um', 17),
 ('voltar', 14),
 ('|', 109),
 ('®', 13),
 ('é', 14)}