In [None]:
!pip install fake_useragent

In [None]:
import requests
import pandas as pd
import time
import json
from fake_useragent import UserAgent
import logging
from tqdm import tqdm

In [None]:
# basicConfig() does nothing when the root logger already has handlers, which
# some notebook environments install before user code runs. force=True replaces
# them so the INFO-level messages emitted below are actually shown.
logging.basicConfig(level=logging.INFO, force=True)

## Setting up requests

In [None]:
# Endpoint that the listing queries are POSTed to.
url = "https://conpletus.cointelegraph.com/v1/"

# fake_useragent generator; its `.chrome` attribute supplies the UA strings
# used in the request headers.
user_agent = UserAgent()

# Headers for the listing API calls (browser-style request originating from
# cointelegraph.com).
headers_1 = {
    "accept": (
        "application/graphql-response+json, application/graphql+json, "
        "application/json, text/event-stream, multipart/mixed"
    ),
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8,ru;q=0.7",
    "content-type": "application/json",
    "origin": "https://cointelegraph.com",
    "priority": "u=1, i",
    "referer": "https://cointelegraph.com/",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "user-agent": user_agent.chrome,
}

In [None]:
# GraphQL document for the tag-page listing. Kept as plain multi-line text so
# it can be read and diffed without JSON or str.format brace escaping.
_TAG_PAGE_QUERY = """\
query TagPageQuery($short: String, $slug: String!, $order: String, $offset: Int!, $length: Int!) {
  locale(short: $short) {
    tag(slug: $slug) {
      id
      slug
      avatar
      createdAt
      updatedAt
      redirectRelativeUrl
      alternates {
        short
        domain
        id
        code
        __typename
      }
      tagTranslates {
        id
        title
        metaTitle
        pageTitle
        description
        metaDescription
        keywords
        __typename
      }
      posts(order: $order, offset: $offset, length: $length) {
        data {
          id
          slug
          views
          postTranslate {
            id
            title
            avatar
            published
            publishedHumanFormat
            leadText
            author {
              id
              slug
              authorTranslates {
                id
                name
                __typename
              }
              __typename
            }
            __typename
          }
          category {
            id
            slug
            __typename
          }
          author {
            id
            slug
            authorTranslates {
              id
              name
              __typename
            }
            __typename
          }
          postBadge {
            id
            label
            postBadgeTranslates {
              id
              title
              __typename
            }
            __typename
          }
          showShares
          showStats
          __typename
        }
        postsCount
        __typename
      }
      __typename
    }
    __typename
  }
}"""


def get_payload(length=15, offset=0, slug="bitcoin"):
    """Build the JSON request body for one page of a tag's post listing.

    The body is assembled as a dict and serialised with ``json.dumps`` so that
    every value is escaped correctly (a slug containing a quote can no longer
    produce invalid JSON, as it could with string templating).

    Args:
        length: Number of posts to request (GraphQL ``$length``).
        offset: Index of the first post to return (GraphQL ``$offset``).
        slug: Tag slug to list posts for. Defaults to ``"bitcoin"``, the value
            that was previously hard-coded.

    Returns:
        str: JSON text with a ``"query"`` and a ``"variables"`` member.
    """
    request_body = {
        "query": _TAG_PAGE_QUERY,
        "variables": {
            # Sent unchanged from the original request; presumably a
            # client-side cache hint -- TODO confirm whether the API reads it.
            "cacheTimeInMS": 300000,
            "length": length,
            "offset": offset,
            "order": "postPublishedTime",
            "short": "en",
            "slug": slug,
        },
    }
    return json.dumps(request_body)


## Scraping and parsing articles from https://cointelegraph.com/ with tag "Bitcoin"

In [None]:
request_batch_size = 500  # posts requested per API call
num_batches = 20          # upper bound on API calls (at most 10,000 posts)
request_timeout_s = 30    # abort a stalled request instead of hanging the notebook

articles = []

for batch_idx in tqdm(range(num_batches)):
    payload = get_payload(length=request_batch_size, offset=batch_idx * request_batch_size)

    # A dropped connection or timeout should cost one batch, not the whole run.
    try:
        resp = requests.post(url, headers=headers_1, data=payload, timeout=request_timeout_s)
    except requests.RequestException as exc:
        logging.error(f"Request for batch {batch_idx} failed: {exc}")
        continue

    if resp.status_code != 200:
        logging.error(f"Status code is not equal to 200 at batch {batch_idx}")
        continue

    # A 200 response can still lack the expected tree (for example an error
    # payload or a null tag), so guard the nested lookup.
    try:
        resp_articles = resp.json()["data"]["locale"]["tag"]["posts"]["data"]
    except (ValueError, KeyError, TypeError):
        logging.error(f"Unexpected response body at batch {batch_idx}")
        continue

    if not resp_articles:
        # An empty page means the offset is past the last post; later offsets
        # cannot return anything either.
        logging.info(f"No more posts returned at batch {batch_idx}, stopping")
        break

    for resp_article in resp_articles:
        articles.append({
            "title": resp_article["postTranslate"]["title"],
            "url": "https://cointelegraph.com/news/" + resp_article["slug"],
            "published_time": resp_article["postTranslate"]["published"],
            "views": resp_article["views"],
        })
        

In [None]:
# Log how many entries the `articles` list currently holds.
logging.info(f"Scraped info about {len(articles)} articles")

## Scraping and parsing article texts

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Only a user-agent header is sent with the article page requests.
headers_2 = {
    "user-agent": user_agent.chrome
}
article_timeout_s = 30  # abort a stalled page download instead of hanging

for article in tqdm(articles):
    # Start with empty text so every record has the key even if the fetch or
    # the parse below fails.
    article["text"] = ""

    # A network failure on one page should not abort the whole crawl.
    try:
        resp = requests.get(article["url"], headers=headers_2, timeout=article_timeout_s)
    except requests.RequestException as exc:
        logging.error(f"GET to article {article['title']} failed: {exc}")
        logging.error(f"link: {article['url']}")
        continue

    if resp.status_code != 200:
        logging.error(f"Status code of GET to article {article['title']} is not 200")
        logging.error(f"link: {article['url']}")
        continue

    soup = BeautifulSoup(resp.content, "lxml")
    article_body = soup.find("article", class_="post__article")
    if article_body is None:
        # find() returns None when the page has no matching <article> element;
        # calling .find_all on it would raise AttributeError.
        logging.error(f"No article body found for article {article['title']}")
        logging.error(f"link: {article['url']}")
        continue

    # Join paragraph and heading text in document order, separated by spaces.
    paragraphs = [p.text.strip() for p in article_body.find_all(["p", "h2", "h1"])]
    article["text"] = " ".join(paragraphs)

In [None]:
# Persist the collected article records as a single JSON document.
with open("CoinTelegraph_Articles.json", "w") as json_out:
    json_out.write(json.dumps(articles))