# Infomoney News Scraping


Making the necessary imports


In [None]:
import multiprocessing
from datetime import datetime
from multiprocessing.managers import ListProxy
from typing import Any

import pandas as pd
import requests
from bs4 import BeautifulSoup, ResultSet, Tag
from numpy import Series, nan
from pandas import DataFrame
from requests import Response

## Getting the data


Loading the CSV with `pandas` and building a plain Python list containing all the links.


In [None]:
# Read the table of article links and extract the "link" column as a plain
# Python list, which is what the scraping step indexes into.
df: DataFrame = pd.read_csv("infomoney-links.csv")
link_column = df["link"]
links: Any = link_column.to_numpy().tolist()

Creating a multiprocess workflow to make the requests and parse them using `BeautifulSoup`, extracting the pieces of information from each news article that we are interested in.

> An interesting point for discussion and improvement would be using `lxml` instead of `html.parser`, since it should be faster.

After each process finishes running we have a list of dicts containing all the information regarding that news.


In [None]:
def _first_text(soup: "BeautifulSoup", selector: str) -> str:
    """Return the text of the first element matching *selector*, or "" if none."""
    node = soup.select_one(selector)
    return node.get_text() if node is not None else ""


def worker(index, input_list, output_list, timeout: float = 30.0):
    """Download one news page and store its parsed fields in ``output_list``.

    Args:
        index: Position of the URL in ``input_list``; the result is written
            to the same position of ``output_list``.
        input_list: Indexable collection of page URLs.
        output_list: Indexable, writable collection (same length as
            ``input_list``) that receives one dict per page with the keys
            ``text``, ``title``, ``subtitle``, ``tags``, ``date`` and
            ``author``.
        timeout: Seconds to wait on the HTTP request before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the timeout expires).
    """
    url: Any = input_list[index]
    # Without a timeout a stalled connection would block this worker forever.
    response: Response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    paragraphs = soup.select(
        "div.element-border--bottom:nth-child(1) > p:nth-child(n+2)"
    )
    tags: list[str] = [
        x.get_text() for x in soup.select(".single__tag-list > ul > li > a")
    ]
    author_links = soup.select(".single__author-info > span:nth-child(1) > a")
    # A page without author links is recorded as "" rather than an empty list.
    author = [x.get_text() for x in author_links] if author_links else ""

    # Single-valued fields fall back to "" when the element is missing.
    date: str = _first_text(soup, ".entry-date")
    subtitle: str = _first_text(soup, ".single__excerpt > p:nth-child(1)")
    header: str = _first_text(soup, ".typography__display--2")

    # Concatenate the body paragraphs, then strip non-breaking spaces and the
    # boilerplate fragments embedded in the article body.
    text: str = "".join(p.get_text() for p in paragraphs)
    for noise in ("\xa0", "CONTINUA DEPOIS DA PUBLICIDADE", "\n\nRelacionados\n"):
        text = text.replace(noise, "")

    output_list[index] = {
        "text": text,
        "title": header,
        "subtitle": subtitle,
        "tags": tags,
        "date": date,
        "author": author,
    }


# Shared state for the scraping run: the URLs to fetch and a pre-sized,
# manager-backed list that the worker processes fill in by index.
input_list = links
# ListProxy is left unsubscripted on purpose: module-level annotations are
# evaluated at run time, and subscripting ListProxy is not supported on every
# Python version.
output_list: ListProxy = multiprocessing.Manager().list([None] * len(input_list))

if __name__ == "__main__":
    num_processes: int = multiprocessing.cpu_count()

    with multiprocessing.Pool(processes=num_processes) as pool:
        # One task per link; each call writes its result to output_list[i].
        pool.starmap(
            worker, [(i, input_list, output_list) for i in range(len(input_list))]
        )

# Entries stay None for any index that no worker has filled in.
result_multi: ListProxy = output_list

For auditing purposes we will be exporting the raw data collected to a csv file.


In [None]:
# Copy the shared results into an ordinary list, build the raw DataFrame from
# it, and write that DataFrame to disk unchanged for auditing.
result: list[Any] = list(result_multi)
df = pd.DataFrame(data=result)
df.to_csv("infomoney-data.csv")

## Cleaning the data


Here we standardize the dates so that they follow the format expected by `datetime`, which lets us convert them into `datetime` objects for easier consumption.

> Regarding the `formatTo` function, it was created for the specific case of **Infomoney**, since all news dates are in pt-BR and, for them to be processed by `datetime.datetime.strptime`, the month names need to be in English.
> It also handles all the exceptional cases found so far; **this code may need to be updated if the website changes its structure**.


In [None]:
def formatTo(x: str) -> "datetime | float":
    """Parse a pt-BR date string such as ``"12 jan 2023 10h30"``.

    The month is taken from the second whitespace-separated token and
    translated to its English abbreviation by its first three letters
    (case-insensitive), so both abbreviated and full month names
    (``"mai"``, ``"maio"``) are accepted, as is surrounding whitespace.

    Args:
        x: Raw date text in the layout ``<day> <month> <year> <HH>h<MM>``.

    Returns:
        The parsed ``datetime``, or ``nan`` when ``x`` is empty or contains
        only whitespace.

    Raises:
        ValueError: If the text does not match ``"%d %b %Y %Hh%M"`` once the
            month has been translated.
    """
    # Three-letter month prefixes mapped to the abbreviations "%b" expects.
    # "may" is included so that "May"/"Mayo" keep parsing as they did before.
    months = {
        "jan": "Jan",
        "fev": "Feb",
        "mar": "Mar",
        "abr": "Apr",
        "mai": "May",
        "may": "May",
        "jun": "Jun",
        "jul": "Jul",
        "ago": "Aug",
        "set": "Sep",
        "out": "Oct",
        "nov": "Nov",
        "dez": "Dec",
    }

    parts = x.split()
    if not parts:
        return nan

    if len(parts) > 1:
        # Unknown month tokens are left untouched and handed to strptime as-is.
        parts[1] = months.get(parts[1][:3].lower(), parts[1])

    return datetime.strptime(" ".join(parts), "%d %b %Y %Hh%M")


# Convert every raw date string in the "date" column with formatTo.
df["date"] = df["date"].map(formatTo)

Removing all rows that contain no text, only whitespace, or no tags, since they are missing important information.

> It could be argued that news without tags are still useful. However, since we do not have enough resources to process so many news items at once, it is better if we ignore those cases.


In [None]:
# Drop rows whose text is missing, empty or whitespace-only, and rows that
# have no tags.
# fullmatch anchors the pattern at both ends, so text that merely *starts*
# with whitespace is kept; na=True treats a missing text value as blank.
text_is_blank = df["text"].str.fullmatch(r"\s*", na=True)
# Compare the string form so that both an empty list and the literal "[]"
# (tags read back from a CSV file) count as "no tags".
tags_are_empty = df["tags"].astype(str) == "[]"
# Annotated with plain pd.Series: module-level annotations are evaluated at
# run time, so the annotation must be a valid expression.
mask: pd.Series = text_is_blank | tags_are_empty
df: DataFrame = df.drop(df[mask].index).reset_index(drop=True)
df.count()

Removing all instances that could have passed the scraping process which are not in the wanted time frame.


In [None]:
# Keep only news dated September 2022 or later.
# (year, month) tuples compare lexicographically, which is the same test as
# "year < 2022, or year == 2022 and month < 9".
# Annotated with plain pd.Series: module-level annotations are evaluated at
# run time, so the annotation must be a valid expression.
mask: pd.Series = df["date"].apply(lambda x: (x.year, x.month) < (2022, 9))
df: DataFrame = df.drop(df[mask].index).reset_index(drop=True)
df.count()

Cleaning the author column.


In [None]:
# Translation table that deletes newlines, tabs and commas in a single pass.
_author_noise = str.maketrans("", "", "\n\t,")

# Each cell holds an iterable of author names; clean every name in it.
df["author"] = df["author"].apply(
    lambda names: [name.translate(_author_noise) for name in names]
)

df

Exporting to a csv.
