<a href="https://colab.research.google.com/github/bun760/gemini-llm/blob/main/Predictive_Topic_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping

In [None]:
!pip install langdetect newspaper3k

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 5.1 MB/s 
[?25hCollecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[K     |████████████████████████████████| 211 kB 55.8 MB/s 
Collecting cssselect>=0.9.2
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Collecting feedfinder2>=0.0.4
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
Collecting tldextract>=2.0.1
  Downloading tldextract-3.1.2-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 4.9 MB/s 
Collecting jieba3k>=0.35.1
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 39.4 MB/s 
[?25hCollecting feedparser>=5.2.1
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.6 MB/s 
Collecting tinysegmenter==0.3
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.ta

In [None]:
# used for scraping
import newspaper
from bs4 import BeautifulSoup
import requests

# check language
import langdetect

# dataframes
import pandas as pd
import numpy as np

import time
import random
import itertools

In [None]:
# seconds to pause before each archive-page request (passed to time.sleep in the scraping loop)
SLEEP_BETWEEN_CALLS = 1
# the scraping loop stops collecting articles for a tag once this many have been gathered
MAX_PER_TAG = 30

In [None]:
def is_tag_url(url, main_url):
    """Tell whether a link found on the ``main_url`` page should count as a tag.

    A link is a tag when its address contains ``/tag/`` or ``/tagged/``, has no
    ``source=`` in it, and the page it was found on is not an archive page
    (``/archive/`` does not appear in ``main_url``).
    """
    # links found on an archive page are never considered tags
    if "/archive/" in main_url:
        return False
    # addresses carrying a "source=" part are rejected as well
    if "source=" in url:
        return False
    return "/tag/" in url or "/tagged/" in url

def get_tags_from_links(main_url, links):
    """Collect the distinct tag names shown by the anchors of a page.

    Args:
        main_url: address of the page the anchors were taken from.
        links: iterable of anchor elements exposing ``has_attr``, ``["href"]``
            and ``get_text()`` (e.g. BeautifulSoup ``<a>`` tags).

    Returns:
        List of unique, non-empty tag names in order of first appearance.
    """
    unique_tags = {}
    for link in links:
        if not (link.has_attr("href") and is_tag_url(link["href"], main_url)):
            continue
        # surrounding whitespace would make otherwise identical tags differ
        tag = link.get_text().strip()
        if tag:
            # a dict keeps insertion order, so the result is the same on every
            # run (the order of list(set(...)) is not)
            unique_tags.setdefault(tag, None)
    return list(unique_tags)

def scrape_article(url, min_article_text_length=1000, max_tags=5):
    """Download and parse one article and extract its tags.

    Args:
        url: address of the article to scrape.
        min_article_text_length: minimum number of characters the extracted
            text must have for the article to be accepted.
        max_tags: maximum number of tags an accepted article may have.

    Returns:
        Tuple ``(article_data, soup)``. ``article_data`` is the tuple
        ``(title, text, url, authors, publish_date, tags)``, or ``None`` when
        the text is too short or the number of tags is 0 or above ``max_tags``.
        ``soup`` is the parsed HTML of the page.
    """
    a = newspaper.Article(url, keep_article_html=True)
    a.download()
    a.parse()

    # extract tags from article
    # The parser is named explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed. lxml is a dependency of
    # newspaper3k, so it is available wherever this function can run.
    soup = BeautifulSoup(a.html, "lxml")
    tags = get_tags_from_links(url, soup.find_all("a"))

    # decide if article_data have been extracted correctly
    has_enough_text = len(a.text) >= min_article_text_length
    has_valid_tag_count = 0 < len(tags) <= max_tags
    article_data = None
    if has_enough_text and has_valid_tag_count:
        article_data = (a.title, a.text, url, a.authors, a.publish_date, tags)
    return (article_data, soup)

In [None]:
# get links to articles from archive pages
def get_links_articles(soup):
    """Return the set of article URLs listed on a tag archive page.

    Args:
        soup: parsed archive page; must provide ``select(css_selector)``.

    Returns:
        Set of article addresses with the ``?source=tag_archive...`` suffix
        removed. Anchors without an ``href`` are ignored.
    """
    urls = set()
    for anchor in soup.select('.postArticle > div > a'):
        href = anchor.get("href")
        if not href:
            # skipping one anchor is better than a KeyError that would make
            # the caller drop every article of the page
            continue
        urls.add(href.split("?source=tag_archive")[0])
    return urls

# get some dates
def get_dates(years=(2021, 2020, 2019), last_day=27):
    """Build ``YYYY/MM/DD`` strings for days ``1..last_day`` of every month.

    Args:
        years: years to cover, in the order they should appear in the result.
        last_day: highest day of the month to generate. The default of 27 is
            the value the notebook has always used; any value up to 28 yields
            dates that exist in every month.

    Returns:
        List of date strings ordered by year as given, then month 12 down
        to 01, then day ``last_day`` down to 01.
    """
    months_desc = range(12, 0, -1)
    days_desc = range(last_day, 0, -1)
    return [
        f"{year}/{month:02d}/{day:02d}"
        for year, month, day in itertools.product(years, months_desc, days_desc)
    ]

# transform a tag to its url form
def urlize_tag(tag):
    """Return ``tag`` in the form Medium uses in urls: lowercase, spaces turned into hyphens."""
    return tag.lower().replace(" ", "-")

In [None]:
# tags whose archive pages are scraped; the scraping loop handles them one by one
tags_to_scrape = [
    "Artificial Intelligence",
    "Money",
    "Cybersecurity",
    "Social Media",
    "Programming",
    "Productivity",
]

In [None]:
# number of archive pages (dates) visited per tag and of article links
# followed on each of those pages
DATES_PER_TAG = 10
ARTICLES_PER_PAGE = 10
# seconds to wait for an archive page before giving up on it
REQUEST_TIMEOUT = 30

dates_list = get_dates()
random.shuffle(dates_list)

d_new_articles = {}
# number of articles collected per tag, filled in as each tag is finished
d_tag_counter = { tag: 0 for tag in tags_to_scrape }
# set when the user interrupts the run, so that the loop over tags stops too
stop_requested = False

for tag in tags_to_scrape:
    print(f"TAG: {tag}")
    print("")
    urlized_tag = urlize_tag(tag)

    # articles collected so far for this tag
    n_collected = 0
    collected_urls = set()

    # prepare lists of newly scraped articles for such tag
    data_tag = []

    for my_date in dates_list[:DATES_PER_TAG]:
        try:
            # wait some time
            time.sleep(SLEEP_BETWEEN_CALLS)

            # scrape page with archived articles with such tag; a timeout keeps
            # a stalled connection from blocking the run, and an HTTP error is
            # reported instead of being parsed as if it were an archive page
            response = requests.get(
                f"https://medium.com/tag/{urlized_tag}/archive/{my_date}",
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            tag_soup = BeautifulSoup(response.text, "lxml")

            # extract all the links to articles with such tag, if the article has not been scraped already
            links_articles = [l for l in get_links_articles(tag_soup) if l not in collected_urls]

            for url in links_articles[:ARTICLES_PER_PAGE]:
                try:
                    # scrape single article
                    article_data, soup = scrape_article(url)

                    # check language of article, if scraped correctly
                    article_language = None
                    if article_data is not None:
                        article_language = langdetect.detect(article_data[1])
                except Exception as e:
                    # one broken article must not discard the rest of the page
                    # (KeyboardInterrupt is not an Exception and still reaches
                    # the handler below)
                    print(e)
                    continue

                if article_data is not None:
                    if article_language == "en":
                        data_tag.append(article_data)
                        n_collected += 1
                        collected_urls.add(url)
                        d_new_articles[tag] = data_tag
                        print(f"Articles collected: {n_collected}")
                        print(article_data[0])
                        print(article_data[2])
                        print(article_data[5])
                        print("")
                    else:
                        print(f"Article language is {article_language}")
                        print(article_data[0])
                        print("")

                if n_collected >= MAX_PER_TAG:
                    break
            if n_collected >= MAX_PER_TAG:
                break
        except KeyboardInterrupt:
            print('Stopped scraping!')
            time.sleep(3)
            stop_requested = True
            break
        except Exception as e:
            print(e)

    d_tag_counter[tag] = n_collected
    if stop_requested:
        # the interrupt ends the whole run, not only the current tag
        break

TAG: Artificial Intelligence

Articles collected: 1
Let’s Learn, How Machines are Trained? – ‘Machine Learning’ the Ultimate Sheen
https://medium.com/@sunilrajthota/lets-learn-how-machines-are-trained-machine-learning-the-ultimate-sheen-2be5a3ff2db2
['Algorithms', 'ML', 'Artificial Intelligence', 'AI', 'Machine Learning']

Articles collected: 2
Databaiting
https://towardsdatascience.com/databaiting-d26cad4c49ca
['Data Science', 'Artificial Intelligence', 'Privacy']

Articles collected: 3
The Fourth Wave — or: How to Create the Future of Digital Health Care
https://medium.com/next-level-german-engineering/the-fourth-wave-or-how-to-create-the-future-of-digital-health-care-81d2eb283492
['Artificial Intelligence', 'Digital Health', 'Health Technology', 'Digital Solutions', 'Health']

Articles collected: 4
Book Review: Artificial Intelligence in Healthcare by Parag Mahajan
https://medium.com/the-book-channel/book-review-artificial-intelligence-in-healthcare-by-parag-mahajan-f0ca44dda1e7
['A

In [None]:
# show how many articles were collected per tag; displaying the dictionary
# itself prints the full text of every scraped article into the notebook
{tag: len(articles) for tag, articles in d_new_articles.items()}

{'Artificial Intelligence': [('Let’s Learn, How Machines are Trained? – ‘Machine Learning’ the Ultimate Sheen',
   'Introduction:\n\nMachine learning (ML) is a category of algorithm that allows software applications to become more accurate in predicting outcomes without being explicitly programmed. The basic premise of ML is to build algorithms that can receive input data and use statistical analysis to predict an output. ML is an application of AI that provides systems the ability to automatically learn and train. It focuses on the development of computer programs that can access data and use it to learn for themselves.\n\nThe process of learning begins with observations or data, such as direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers to learn automatically without human intervention or assistance and adjust actions accordingly.\n\nML algori

In [None]:
# save scraped data to file
# gather the article tuples of every tag into one flat list of records
data = []
for tag_articles in d_new_articles.values():
    data.extend(tag_articles)

article_columns = ["title", "text", "url", "authors", "timestamp", "tags"]
df = pd.DataFrame.from_records(data, columns=article_columns)
# keep a single row per article address
df = df.drop_duplicates("url")
print(f"Num of articles: {len(df)}")
df.to_csv("medium_articles.csv")

Num of articles: 175


# Training

In [None]:
!pip install nltk



In [None]:
import numpy as np
import pandas as pd
import datetime

# preprocessing
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
import string

# for model training
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def get_df_all_articles_medium(data_path):
    """Load the articles CSV and turn its ``tags``/``authors`` cells back into lists.

    Args:
        data_path: path of a CSV whose first column is the index and whose
            ``tags`` and ``authors`` cells hold the text form of Python lists.

    Returns:
        DataFrame with ``tags`` and ``authors`` parsed; cells that are not
        strings (e.g. missing values) are left unchanged.

    Raises:
        ValueError or SyntaxError: a cell is not a plain Python literal.
    """
    import ast  # imported here so the cell runs on its own

    def parse_list_cell(value):
        # literal_eval only accepts literals, so text in the file can never be
        # executed as code (which eval would do)
        return ast.literal_eval(value) if isinstance(value, str) else value

    df = pd.read_csv(data_path, index_col=0)
    for column in ("tags", "authors"):
        df[column] = df[column].apply(parse_list_cell)
    return df

In [None]:
# get dataset of articles
# (reads medium_articles.csv, the file written at the end of the scraping section)
df = get_df_all_articles_medium("medium_articles.csv")

In [None]:
# preview the first rows of the loaded articles
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,"Let’s Learn, How Machines are Trained? – ‘Mach...",Introduction:\n\nMachine learning (ML) is a ca...,https://medium.com/@sunilrajthota/lets-learn-h...,[Sunil Raj Thota],2019-07-07 14:17:28.483000+00:00,"[Algorithms, ML, Artificial Intelligence, AI, ..."
1,Databaiting,Databaiting\n\nThe process of pulling in a mem...,https://towardsdatascience.com/databaiting-d26...,[Alex Moltzau 莫战],2019-07-08 03:49:54.995000+00:00,"[Data Science, Artificial Intelligence, Privacy]"
2,The Fourth Wave — or: How to Create the Future...,Innovation in health care: Wave 1 and 2\n\nIn ...,https://medium.com/next-level-german-engineeri...,[Next Visions],2019-07-07 11:59:58.982000+00:00,"[Artificial Intelligence, Digital Health, Heal..."
3,Book Review: Artificial Intelligence in Health...,I feel liberated whenever I got the opportunit...,https://medium.com/the-book-channel/book-revie...,[],2019-07-07 12:34:10.199000+00:00,"[Artificial Intelligence, Digital Healthcare, ..."
4,Interview Of The Week: Anand Rao,"Anand Rao, PwC’s Global AI Leader, was a speak...",https://innovator.news/interview-of-the-week-a...,[Jennifer L. Schenker],2019-07-07 22:07:24.734000+00:00,"[Ai And Ethics, Artificial Intelligence, Inter..."


In [None]:
def get_training_df(df, all_tags, max_num_per_tag=30):
    """Return a copy of ``df`` whose ``tags`` lists keep only tags from ``all_tags``.

    Args:
        df: articles frame with a ``tags`` column holding one list of tag
            names per row.
        all_tags: tag names to keep as training labels.
        max_num_per_tag: accepted for backward compatibility but not applied
            to the data. NOTE(review): presumably meant to cap the number of
            articles per tag - confirm the intended rule before implementing.

    Returns:
        New DataFrame; ``df`` itself is not modified. Rows whose tags are all
        filtered out are kept, with an empty list.
    """
    allowed_tags = set(all_tags)

    def keep_allowed(tags):
        # a missing value (e.g. NaN) is not iterable: treat it as "no tags"
        if not isinstance(tags, (list, tuple, set)):
            return []
        return [tag for tag in tags if tag in allowed_tags]

    df_training = df.copy()
    df_training["tags"] = df_training["tags"].apply(keep_allowed)
    return df_training

def get_fitted_binarizer_and_labels(df_training):
    """Fit a MultiLabelBinarizer on the ``tags`` column and binarize it.

    Returns:
        Tuple ``(binarizer, labels)``: the fitted binarizer and the output of
        its ``fit_transform`` on ``df_training["tags"]``.
    """
    tag_lists = df_training["tags"]
    binarizer = MultiLabelBinarizer()
    return binarizer, binarizer.fit_transform(tag_lists)

In [None]:
# Create training df
# only these tags are kept as labels
all_tags = [
    "Artificial Intelligence",
    "Money",
    "Cybersecurity",
    "Social Media",
    "Programming",
    "Productivity",
]
df_training = get_training_df(df, all_tags, max_num_per_tag=30)

# train binarizer
binarizer, labels = get_fitted_binarizer_and_labels(df_training)
len(df_training)

175

In [None]:
# X_train is the same DataFrame object as df_training (no copy is made)
X_train = df_training
# binarized tags returned by get_fitted_binarizer_and_labels
y_train = labels

In [None]:
# characters treated as word separators: clean_text replaces each with a space.
# Raw string: "\[", "\]" and "\|" are invalid escape sequences in a plain
# string literal and make Python emit a warning; the compiled pattern is
# unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# every character that is not a digit, a lowercase letter, a space, '#', '+'
# or '_' is removed by clean_text
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# English stop words from NLTK, held in a set for the membership test in clean_text
STOPWORDS = set(stopwords.words('english'))

def clean_text(text, replace_by_space_re, bad_symbols_re, stopwords_set,
               keep_first_n_words=1000):
    """Normalize a text and keep its first ``keep_first_n_words`` words.

    Steps, in order: lowercase; newlines to spaces; every match of
    ``replace_by_space_re`` to a space; every match of ``bad_symbols_re``
    removed; every character of ``string.punctuation`` removed; words found
    in ``stopwords_set`` dropped; result truncated to the first
    ``keep_first_n_words`` words and joined with single spaces.
    """
    # normalize text
    normalized = text.lower().replace("\n", " ")
    normalized = replace_by_space_re.sub(' ', normalized)
    normalized = bad_symbols_re.sub('', normalized)
    normalized = normalized.translate(str.maketrans('', '', string.punctuation))

    # remove stopwords
    kept_words = [word for word in normalized.split() if word not in stopwords_set]

    # keep first n words
    return " ".join(kept_words[:keep_first_n_words])

def _build_clean_corpus(X, keep_first_n_words):
    """Join title and text of every row of ``X`` and clean the result with clean_text.

    Only derived Series are built, so the caller's DataFrame is not modified.
    """
    # anything that is not a string (e.g. NaN for a missing title) counts as
    # empty text; isinstance also accepts str subclasses
    titles = X["title"].apply(lambda t: t if isinstance(t, str) else "")
    texts = X["text"].apply(lambda t: t if isinstance(t, str) else "")
    text_array = titles + ".\n" + texts
    return text_array.apply(lambda text: clean_text(text, REPLACE_BY_SPACE_RE,
                                                    BAD_SYMBOLS_RE, STOPWORDS,
                                                    keep_first_n_words=keep_first_n_words))


def preprocessing_pipeline(X, vectorizer, keep_first_n_words=1000):
    """Clean the articles in ``X`` and vectorize them with a fitted vectorizer.

    Args:
        X: DataFrame with ``title`` and ``text`` columns; it is not modified.
        vectorizer: fitted object exposing ``transform``.
        keep_first_n_words: number of words kept per cleaned article.

    Returns:
        Whatever ``vectorizer.transform`` returns for the cleaned texts.
    """
    return vectorizer.transform(_build_clean_corpus(X, keep_first_n_words))


def get_fitted_vectorizer(X, max_df=1.0, min_df=1, keep_first_n_words=1000):
    """Fit a TfidfVectorizer on the cleaned articles of ``X``.

    Args:
        X: DataFrame with ``title`` and ``text`` columns; it is not modified.
        max_df: forwarded to ``TfidfVectorizer``.
        min_df: forwarded to ``TfidfVectorizer``.
        keep_first_n_words: number of words kept per cleaned article.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)
    vectorizer.fit(_build_clean_corpus(X, keep_first_n_words))
    return vectorizer

In [None]:
# fit the TF-IDF vectorizer on the cleaned title + text of the training articles
vectorizer = get_fitted_vectorizer(X_train)

In [None]:
# clean and vectorize the training articles with the fitted vectorizer
X_train_preproc = preprocessing_pipeline(X_train, vectorizer=vectorizer)

In [None]:
def get_fitted_model(X_train_preproc, y_train, class_weight="balanced"):
    """Fit a MultiOutputClassifier wrapping a LogisticRegression.

    Args:
        X_train_preproc: vectorized training articles.
        y_train: binarized tags of the training articles.
        class_weight: forwarded to ``LogisticRegression``.

    Returns:
        The fitted ``MultiOutputClassifier``.
    """
    base_estimator = LogisticRegression(random_state=1, class_weight=class_weight)
    model = MultiOutputClassifier(base_estimator, n_jobs=-1)
    return model.fit(X_train_preproc, y_train)

In [None]:
# train the multi-label classifier on the vectorized articles and their binarized tags
model = get_fitted_model(X_train_preproc, y_train)

In [None]:
# empty sample; NOTE(review): X is reassigned by the following cells before any prediction is made
X = pd.DataFrame([["", ""]], columns=["text", "title"])

In [None]:
# sample mixing money and AI vocabulary; NOTE(review): X is reassigned by the next cell before any prediction is made
X = pd.DataFrame([["How to make money using neural networks to predict stock prices.", "Artificial intelligence and business!"]], columns=["text", "title"])

In [None]:
# single hand-written article used to try the trained model
X = pd.DataFrame({
    "text": ["How to train a neural network to predict topics for texts"],
    "title": ["Artificial Intelligence tutorial."],
})
# same cleaning + vectorization as for the training data
X_preproc = preprocessing_pipeline(X, vectorizer=vectorizer)
predictions = model.predict(X_preproc)
# map the binary predictions back to tag names
binarizer.inverse_transform(predictions)

[()]

In [None]:
# single hand-written article used to try the trained model
title = "Networking connections"
text = "What are the most famous Social Networking sites to build connections?"
X = pd.DataFrame({"text": [text], "title": [title]})
# same cleaning + vectorization as for the training data
X_preproc = preprocessing_pipeline(X, vectorizer=vectorizer)
predictions = model.predict(X_preproc)
# map the binary predictions back to tag names
binarizer.inverse_transform(predictions)

[('Social Media',)]

In [None]:
# single hand-written article used to try the trained model
title = "The Data Science tutorial"
text = "Preparing data basically."
X = pd.DataFrame({"text": [text], "title": [title]})
# same cleaning + vectorization as for the training data
X_preproc = preprocessing_pipeline(X, vectorizer=vectorizer)
predictions = model.predict(X_preproc)
# map the binary predictions back to tag names
binarizer.inverse_transform(predictions)

[('Artificial Intelligence', 'Programming')]