In [1]:
from kedro.config import OmegaConfigLoader
from kedro.io import DataCatalog

In [2]:
# Build the config loader with conf_source="." and empty base/run environment
# names, then construct the data catalog from the loader's "catalog" entry.
config_loader = OmegaConfigLoader(conf_source=".", base_env="", default_run_env="")
catalog = DataCatalog.from_config(config_loader["catalog"])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the fill-mask model from the catalog entry "fill_mask_model".
# Earlier inline form, kept for reference: pipeline("fill-mask", model="Twitter/twhin-bert-base")
# NOTE(review): presumably the catalog entry wraps that same model — confirm in the catalog config.
fill_mask_model = catalog.load("fill_mask_model")

In [4]:
fill_mask_model("I have bought a new bike, I am so excited. #<mask>")

[{'score': 0.08654018491506577,
  'token': 166871,
  'token_str': 'cycling',
  'sequence': 'I have bought a new bike, I am so excited. #cycling'},
 {'score': 0.0704832673072815,
  'token': 60142,
  'token_str': 'bike',
  'sequence': 'I have bought a new bike, I am so excited. #bike'},
 {'score': 0.04027479514479637,
  'token': 712,
  'token_str': 'ad',
  'sequence': 'I have bought a new bike, I am so excited. #ad'},
 {'score': 0.020212598145008087,
  'token': 210425,
  'token_str': 'BMW',
  'sequence': 'I have bought a new bike, I am so excited. #BMW'},
 {'score': 0.018154606223106384,
  'token': 40972,
  'token_str': 'EV',
  'sequence': 'I have bought a new bike, I am so excited. #EV'}]

In [28]:
def get_hashtags(sentence, model, num_hashtags=5, mask_token="<mask>"):
    """Suggest hashtags for ``sentence`` using a fill-mask model.

    The sentence is extended with `` #<mask_token>`` and handed to ``model``;
    each prediction for the masked position becomes one hashtag candidate.

    Args:
        sentence: Text to suggest hashtags for. It must not contain
            ``mask_token`` itself.
        model: Callable invoked as ``model(text, top_k=num_hashtags)`` that
            returns an iterable of dicts with ``"token_str"`` and ``"score"``
            keys.
        num_hashtags: Number of candidates requested from the model.
        mask_token: Mask placeholder understood by ``model``.

    Returns:
        A list of ``{"hashtag_str": ..., "score": ...}`` dicts ordered from the
        highest score to the lowest.

    Raises:
        ValueError: If the mask token appears more than once after the
            placeholder hashtag has been appended.
    """
    sentence_placeholder = f"{sentence} #{mask_token}"
    # Exactly one mask is expected: the one appended above. Any extra
    # occurrence means the caller's sentence already contained the token.
    if sentence_placeholder.count(mask_token) > 1:
        raise ValueError(f"Do not include '{mask_token}' in the input sentence")

    fill_results = model(sentence_placeholder, top_k=num_hashtags)
    hashtag_results = [
        {"hashtag_str": result["token_str"], "score": result["score"]}
        for result in fill_results
    ]

    # reverse=True keeps the sort stable, so tied scores stay in model order.
    return sorted(hashtag_results, key=lambda res: res["score"], reverse=True)

In [29]:
get_hashtags("I have bought a new bicycle, I am so excited.", fill_mask_model)

[{'hashtag_str': 'ad', 'score': 0.05545256286859512},
 {'hashtag_str': 'cycling', 'score': 0.030171526595950127},
 {'hashtag_str': 'BMW', 'score': 0.02008751407265663},
 {'hashtag_str': 'Apple', 'score': 0.019576990976929665},
 {'hashtag_str': 'road', 'score': 0.018990855664014816}]

In [19]:
linkedin_post = """McKinsey doubles down on open source 🔥

Today, we're thrilled to announce McKinsey & Company new GitHub organization, hosting cutting-edge technologies created inside the Firm. With it, we are also open sourcing Vizro, a toolkit for creating modular data visualization applications.

Open source is not new for McKinsey: Kedro and CausalNex have been developed in the open since 2019 and 2020 respectively, and Kedro was donated to LF AI & Data Foundation in 2021 as a commitment to evolve it as an open standard. In addition, McKinsey acquired Iguazio (Acquired by McKinsey) in early 2023 and brought Nuclio and MLRun, both open source components underpinning the Iguazio platform.

I'm so excited to be part of the Firm at this moment, bringing my open source expertise to a huge organization and working side by side with brilliant professionals. I'm positive all of this wouldn't have been possible without the relentless work of Yetunde Dada, distinguised product manager and a daily source of inspiration.

In words of Rodney W. Zemmel, senior partner and global leader of McKinsey Digital, “we are on a journey to be known for our technology capabilities as much as our strategic advice”.

Let's go!"""

In [20]:
# Load the summarization pipeline from the catalog entry "summarizer_model".
# Earlier inline form, kept for reference: pipeline("summarization")
# NOTE(review): the load log reports that no model was supplied and a default
# (sshleifer/distilbart-cnn-12-6) was used — consider naming the model and
# revision explicitly in the catalog entry.
summarizer_pipe = catalog.load("summarizer_model")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [23]:
# Summarize a batch of two identical posts; the recorded result is a list with
# one {'summary_text': ...} dict per input. max_length=56 is passed to the call.
summarizer_pipe([linkedin_post, linkedin_post], max_length=56)

[{'summary_text': ' McKinsey doubles down on open source . New GitHub organization hosts cutting-edge technologies created inside the Firm . McKinsey acquired Iguazio (Acquired by McKinsey) in early 2023 and brought Nuclio and MLRun, both open source components'},
 {'summary_text': ' McKinsey doubles down on open source . New GitHub organization hosts cutting-edge technologies created inside the Firm . McKinsey acquired Iguazio (Acquired by McKinsey) in early 2023 and brought Nuclio and MLRun, both open source components'}]

In [24]:
def get_summary(text, model):
    """Return the summary of a single string produced by ``model``.

    Args:
        text: The text to summarize; must be a ``str``.
        model: Callable invoked as ``model(text)`` whose result is indexable,
            with a ``"summary_text"`` entry in its first element.

    Returns:
        The ``"summary_text"`` value of the first result.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if isinstance(text, str):
        first_result = model(text)[0]
        return first_result["summary_text"]

    raise ValueError("Pass a single string")

In [25]:
get_summary(linkedin_post, summarizer_pipe)

' McKinsey doubles down on open source . New GitHub organization hosts cutting-edge technologies created inside the Firm . McKinsey acquired Iguazio (Acquired by McKinsey) in early 2023 and brought Nuclio and MLRun, both open source components underpinning the platform .'

In [30]:
# Compute the summary explicitly instead of reading IPython's `_25` output
# cache, which only exists if the cell numbered 25 ran in this kernel session.
get_hashtags(get_summary(linkedin_post, summarizer_pipe), fill_mask_model)

[{'hashtag_str': 'tech', 'score': 0.06541883200407028},
 {'hashtag_str': 'data', 'score': 0.05542179197072983},
 {'hashtag_str': 'innovation', 'score': 0.03917607665061951},
 {'hashtag_str': 'ad', 'score': 0.03191298246383667},
 {'hashtag_str': 'AI', 'score': 0.028968561440706253}]

In [31]:
# Same as the previous experiment but with "open source" hyphenated. The
# summary is recomputed rather than read from IPython's `_25` output cache,
# which does not survive a kernel restart or a different execution order.
get_hashtags(
    get_summary(linkedin_post, summarizer_pipe).replace("open source", "open-source"),
    fill_mask_model,
)

[{'hashtag_str': 'tech', 'score': 0.07206881791353226},
 {'hashtag_str': 'data', 'score': 0.061998412013053894},
 {'hashtag_str': 'innovation', 'score': 0.047728437930345535},
 {'hashtag_str': 'AI', 'score': 0.03327333182096481},
 {'hashtag_str': 'marketing', 'score': 0.032877687364816666}]

In [5]:
linkedin_post_es = """Si buscas una carrera como Data Scientist en España, el informe de Manfred tiene malas noticias para ti...

"Data Scientist" es literalmente el rol menos demandado de los que aparecen en la encuesta, con un 0.5 %. (El segundo menos demandado es Product Manager, ups 😬)

Sin ánimo de extrapolar conclusiones apresuradas, esto "se veía venir": el rol de Data Scientist como se imaginó a principios de la década pasada (ese unicornio en el centro de un diagrama de Venn infinito) ya no existe, la industria ha madurado, y se ha visto que hace falta mucho data engineering para siquiera poder hacer un poco de data analysis en condiciones, no digamos ya data science.

¿Qué opinas?
"""

In [4]:
# NOTE(review): installs into the running kernel's environment with unpinned
# versions; pip's output says the kernel may need a restart afterwards.
# The execution counts around this cell (5, 4, 3, 6) are out of order —
# consider moving the install to the top of the notebook or to the project
# requirements.
%pip install sentencepiece protobuf

Collecting protobuf
  Obtaining dependency information for protobuf from https://files.pythonhosted.org/packages/fe/f3/957db80e5b9f7fd7df97e5554fdc57919dfad24e89291223fd04a0e3c84f/protobuf-4.24.3-cp37-abi3-macosx_10_9_universal2.whl.metadata
  Using cached protobuf-4.24.3-cp37-abi3-macosx_10_9_universal2.whl.metadata (540 bytes)
Using cached protobuf-4.24.3-cp37-abi3-macosx_10_9_universal2.whl (409 kB)
Installing collected packages: protobuf
Successfully installed protobuf-4.24.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the catalog entry "spanish_summarizer_model"; it is applied to the
# Spanish post (linkedin_post_es) in the next cell.
summarizer_es_pipe = catalog.load("spanish_summarizer_model")

In [6]:
summarizer_es_pipe(linkedin_post_es)

[{'summary_text': 'En España, el informe de Manfred tiene malas noticias para ti... "Data Scientist" es literalmente el rol menos demandado.'}]

In [34]:
# Runs the Spanish post through `summarizer_pipe` (loaded from "summarizer_model"),
# not `summarizer_es_pipe`; the recorded output mixes English and Spanish words.
# NOTE(review): presumably an intentional comparison of the two models — confirm.
get_summary(linkedin_post_es, summarizer_pipe)

' "Data Scientist" is el rol menos demandado de los que aparecen en la encuesta, with un 0.5% demandado . La industria ha madurado, and se ha visto that hace falta mucho mucho poder hacer un poco de data engineering .'

In [None]:
# `summarizer` is never defined in this notebook (only mentioned in a
# commented-out line), so the original call raised NameError. Use the pipeline
# loaded from "spanish_summarizer_model" for the Spanish post instead.
summary_es_results = summarizer_es_pipe(linkedin_post_es)

In [18]:
# Load the catalog entry "longform.sentiment_model"; the load log shows the
# cardiffnlp/twitter-roberta-base-sentiment-latest checkpoint being initialized.
longform_sentiment_pipe = catalog.load("longform.sentiment_model")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Load the catalog entry "shortform.sentiment_model"; the recorded log shows
# the model and tokenizer files being downloaded during this load.
shortform_sentiment_pipe = catalog.load("shortform.sentiment_model")

Downloading (…)lve/main/config.json: 100%|██████████| 1.00k/1.00k [00:00<00:00, 5.55MB/s]
Downloading pytorch_model.bin: 100%|██████████| 329M/329M [00:11<00:00, 29.8MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 294/294 [00:00<00:00, 445kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 2.65MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.22MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 5.49MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 1.02MB/s]


In [20]:
longform_sentiment_pipe(linkedin_post)

[{'label': 'positive', 'score': 0.9231044054031372}]

In [21]:
shortform_sentiment_pipe(linkedin_post)

[{'label': 'joy', 'score': 0.8978003263473511}]

In [26]:
# NOTE(review): same call as the previous cell, with an identical recorded
# output — this cell looks redundant and could be removed.
shortform_sentiment_pipe(linkedin_post)

[{'label': 'joy', 'score': 0.8978003263473511}]

In [28]:
# Load the catalog entry "language_detector_model"; the recorded log shows the
# model and tokenizer files being downloaded during this load. The cells below
# call it on the English and Spanish posts and get 'en' / 'es' labels back.
language_detector_pipe = catalog.load("language_detector_model")

Downloading (…)lve/main/config.json: 100%|██████████| 1.42k/1.42k [00:00<00:00, 10.7MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.11G/1.11G [00:40<00:00, 27.5MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 502/502 [00:00<00:00, 1.44MB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 7.02MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 9.08M/9.08M [00:00<00:00, 10.6MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 983kB/s]


In [29]:
language_detector_pipe(linkedin_post)

[{'label': 'en', 'score': 0.9330440163612366}]

In [30]:
language_detector_pipe(linkedin_post_es)

[{'label': 'es', 'score': 0.9917725920677185}]