In [None]:
! pip install -U "transformers[torch]" pypdf python-docx langdetect sentencepiece sacremoses

In [None]:
import os

from huggingface_hub import HfApi
from langdetect import DetectorFactory, detect
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

from text_extractor import TextExtractor

In [None]:
# Checkpoint used for summarization ("google-t5/t5-small" is a smaller alternative).
model_name = "facebook/bart-large-cnn"

# Load the tokenizer and the weights once and hand those same objects to the
# pipeline, so the checkpoint is not loaded into memory a second time.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer)

# English -> French and French -> English translation pipelines.
translator_to_french = pipeline(
    task="translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr"
)
translator_to_english = pipeline(
    task="translation_fr_to_en", model="Helsinki-NLP/opus-mt-fr-en"
)

In [None]:
# Extract the document; get_text() is unpacked as (text, word_count).
text_extractor = TextExtractor("data/25-PAGE-FRENCH-TEXT.pdf")
text, word_count = text_extractor.get_text()
# Upper bound for the summary, set to half the source word count.
# NOTE: max_length is measured in tokens, so this is only an approximation.
summary_length = int(word_count / 2)

try:
    # First attempt: let the pipeline summarize the untruncated text.
    summary = summarizer(text, max_length=summary_length, do_sample=False)[0][
        "summary_text"
    ]
except Exception as ex:
    # Report why the pipeline failed instead of hiding it, then fall back to
    # summarizing the input truncated to the tokenizer's maximum length.
    print(
        f"Pipeline summarization failed ({type(ex).__name__}: {ex}); "
        "retrying with truncated input."
    )
    max_length = tokenizer.model_max_length
    inputs = tokenizer(
        text, truncation=True, max_length=max_length, return_tensors="pt"
    )

    # Pass the attention mask explicitly alongside the input ids.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        max_length=summary_length,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Show the generated summary as the cell output.
summary

In [None]:
# langdetect is non-deterministic by default; fixing the seed makes re-running
# the notebook report the same language for the same summary.
DetectorFactory.seed = 0
detected_lang = detect(summary)
detected_lang

## Update the Hugging Face Space

In [None]:
# Read the access token from the environment. "HUGGINGFACE_TOKEN" is the
# correctly spelled name; the previously used misspelling "HUGGINFACE_TOKEN"
# is still honored so existing setups keep working.
token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HUGGINFACE_TOKEN")
repo_id = "noahnsimbe/text-summarizer"

api = HfApi(token=token)

# (local path, path inside the Space repository) for every file to publish.
space_files = [
    ("requirements.txt", "requirements.txt"),
    ("app.py", "app.py"),
    ("text_extractor.py", "text_extractor.py"),
    ("space-README.md", "README.md"),
]
for local_path, path_in_repo in space_files:
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="space",
    )
    print(f"Uploaded {local_path} -> {repo_id}/{path_in_repo}")