# **Text Summarization**


In [3]:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = """The global shift toward renewable energy is accelerating as countries seek to reduce carbon emissions and combat climate change.
 Solar and wind power have become increasingly cost-effective, leading to widespread adoption across both developed and developing nations.
  Governments are investing heavily in infrastructure, offering subsidies and tax incentives to encourage clean energy production.
  Meanwhile, technological advancements in battery storage and smart grids are helping to address the intermittency of renewable sources.
  Despite these gains, challenges remain—such as the need for rare earth materials, land use conflicts, and resistance from traditional energy sectors.
  Nonetheless, the momentum toward a greener future appears unstoppable."""
  #short summary
print(summarizer(text, max_length=30, min_length=10)[0]['summary_text'])
  #medium summary
print(summarizer(text, max_length=60, min_length=30)[0]['summary_text'])
  #Long summary
print(summarizer(text, max_length=100, min_length=50)[0]['summary_text'])

Device set to use cpu


The global shift toward renewable energy is accelerating as countries seek to reduce carbon emissions and combat climate change. Solar and wind power have become increasingly
The global shift toward renewable energy is accelerating as countries seek to reduce carbon emissions and combat climate change. Solar and wind power have become increasingly cost-effective, leading to widespread adoption across both developed and developing nations.
The global shift toward renewable energy is accelerating as countries seek to reduce carbon emissions and combat climate change. Solar and wind power have become increasingly cost-effective, leading to widespread adoption across both developed and developing nations. Governments are investing heavily in infrastructure, offering subsidies and tax incentives to encourage clean energy production.


# **Headline Generator**

In [4]:
headline = summarizer(text, max_length =15, min_length=5)[0]['summary_text']
print("Generated Headline:",headline)

Generated Headline: The global shift toward renewable energy is accelerating. Solar and wind


# Question **Answering**

In [6]:
from transformers import pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
questions = ["What is the main topic?",
             "When ddid this event happen?",
             "Who were the ke people involved?"]
for question in questions:
  answer = qa_pipeline(question = question, context = text)
  print(f"Question: {question}")
  print(f"Answer: {answer['answer']}")
  print()


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Question: What is the main topic?
Answer: climate change

Question: When ddid this event happen?
Answer: the momentum toward a greener future appears unstoppable

Question: Who were the ke people involved?
Answer: Governments



# **Build a Simple FAQ bot**

In [7]:
faq_context = """Q1: What is O Simple?
O Simple is a user-friendly tool designed to simplify everyday tasks like note-taking, scheduling, and reminders.
Q2: How do I get started?
Just download the app, create an account, and follow the quick-start guide.
Q3: Is O Simple free to use?
Yes, it offers a free version with core features. Premium plans unlock advanced tools.
"""
while True:
  question = input("Ask a question (or type 'exit' to quit): ")
  if question.lower() == 'exit':
    break
  answer = qa_pipeline(question = question, context = faq_context)
  print(f"Question: {question}")
  print(f"Answer: {answer['answer']}")
  print()


Ask a question (or type 'exit' to quit): what is o simple
Question: what is o simple
Answer: a user-friendly tool

Ask a question (or type 'exit' to quit): more
Question: more
Answer: How do I get started

Ask a question (or type 'exit' to quit): how do i get startd
Question: how do i get startd
Answer: a user-friendly tool

Ask a question (or type 'exit' to quit): exit


Q&A on Uploaded Document(PDF)

In [8]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [11]:
import PyPDF2
pdf_text = ""
with open("DATAMINING_MOD1 (1).pdf", "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page in pdf_reader.pages:
        pdf_text += page.extract_text()
question = "Summarize the document"
answer = qa_pipeline(question = question, context = pdf_text)
print(f"Question: {question}")
print(f"Answer: {answer['answer']}")


Question: Summarize the document
Answer: dimension table


# **Text Summarization Comparing different model output**

In [12]:
from transformers import pipeline
bart = pipeline("summarization", model="facebook/bart-large-cnn")
t5 = pipeline("summarization", model="t5-small")
text = """ In recent years, the rise of remote work has transformed the way companies operate and employees engage with their jobs.
With advancements in communication technologies and collaboration tools, teams can now work effectively across time zones and continents.
This shift has led to increased flexibility, allowing workers to better balance their personal and professional lives.
However, it has also introduced challenges such as maintaining team cohesion, managing productivity, and ensuring cybersecurity.
 As organizations adapt to this new normal, many are rethinking traditional office structures and investing in hybrid models that combine remote
 and in-person work."""
print("BART:", bart(text, max_length=100, min_length=30)[0]['summary_text'])
print("T5:", t5(text, max_length=100, min_length=30)[0]['summary_text'])


Device set to use cpu


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


BART: In recent years, the rise of remote work has transformed the way companies operate and employees engage with their jobs. With advancements in communication technologies and collaboration tools, teams can now work effectively across time zones and continents.


Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


T5: the rise of remote work has transformed the way companies operate and employees engage with their jobs . this shift has led to increased flexibility, allowing workers to better balance their personal and professional lives . however, it also introduced challenges such as maintaining team cohesion, managing productivity, and ensuring cybersecurity .


# Topic Based **Summaries**

In [13]:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = """Remote work has reshaped modern workplaces by enabling teams to collaborate across distances using digital tools.
 It offers flexibility and better work-life balance, but also brings challenges like maintaining productivity and team connection.
 As companies adapt, many are embracing hybrid models that blend remote and in-person work for greater efficiency and employee satisfaction"""
topic = "Work culture"
prompt = f"Summarize the topic '{topic}' in the following text:\n{text}"
summary = summarizer(prompt, max_length=100, min_length=30)[0]['summary_text']
print(f"Topic: {topic}")
print(f"Summary: {summary}")

Device set to use cpu
Your max_length is set to 100, but your input_length is only 81. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


Topic: Work culture
Summary: Remote work has reshaped modern workplaces by enabling teams to collaborate across distances. It offers flexibility and better work-life balance, but also brings challenges like maintaining productivity and team connection. As companies adapt, many are embracing hybrid models that blend remote and in-person work.


# **Multi Document Q&A**

In [15]:
from transformers import pipeline
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
contexts =["Elon musk found SpaceX in 2002 to revolutionize space technology.",
           "Jeff bezos founded amazon in 1994 as an online bookstore"
]
question = "When did Jeff bezos founded amazon?"
for context in contexts:
  answer = qa(question = question, context = context)
  print(f"Context: {context}")
  print(f"Answer: {answer['answer']}")
  print()


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Context: Elon musk found SpaceX in 2002 to revolutionize space technology.
Answer: 2002

Context: Jeff bezos founded amazon in 1994 as an online bookstore
Answer: 1994



# **Conversational Q&A**

In [17]:
from transformers import pipeline
# Use the 'question-answering' pipeline instead of 'conversational'
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
context = """Marie Curie was a pioneering physicist and chemist who discovered radioactivity,
won two Nobel Prizes, and became the first woman to teach at the Sorbonne.
Her work with radium and polonium revolutionized science and medicine, leaving a legacy that still radiates brilliance today."""
chat_history = []

def ask_question(question, context):
  # Pass the chat history to the question-answering pipeline
  result = qa(question = question, context = context, qa_pairwise=True, chat_history = chat_history)
  chat_history.append({'question': question, 'answer': result['answer']})
  return result['answer']

question = "Who won the Nobel Prizes?"
answer = ask_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")

# Example of asking a follow-up question
question2 = "What was her field of study?"
answer2 = ask_question(question2, context)
print(f"Question: {question2}")
print(f"Answer: {answer2}")

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Question: Who won the Nobel Prizes?
Answer: Marie Curie
Question: What was her field of study?
Answer: physicist and chemist


# AUDIO I/O
## **TEXT to SPEECH**

In [20]:
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.2.1
    Uninstalling click-8.2.1:
      Successfully uninstalled click-8.2.1
Successfully installed click-8.1.8 gTTS-2.5.4


In [24]:
from gtts import gTTS
import os
mytext = "HI, how are you, hope you are fine"
language = 'en'
myobj = gTTS(text=mytext, lang=language, slow=False)
myobj.save("welcome.mp3")
os.system("mpg321 welcome.mp3")
#

32512

In [25]:
from IPython.display import Audio
Audio("welcome.mp3")

# **Text to Speech in indian languages**

In [3]:
!pip install git+https://github.com/huggingface/parler-tts.git

Collecting git+https://github.com/huggingface/parler-tts.git
  Cloning https://github.com/huggingface/parler-tts.git to /tmp/pip-req-build-eo6nfd0j
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/parler-tts.git /tmp/pip-req-build-eo6nfd0j
  Resolved https://github.com/huggingface/parler-tts.git to commit d108732cd57788ec86bc857d99a6cabd66663d68
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting descript-audiotools@ git+https://github.com/descriptinc/audiotools (from parler_tts==0.2.2)
  Cloning https://github.com/descriptinc/audiotools to /tmp/pip-install-xr17_olo/descript-audiotools_c7ce595519e64ed2b95285b37a4d20eb
  Running command git clone --filter=blob:none --quiet https://github.com/descriptinc/audiotools /tmp/pip-install-xr17_olo/descript-audiotools_c7ce595519e64ed2b95285b37a4d20eb
  Resolved https://github.com/d

In [4]:

import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)

prompt = "നിങ്ങൾ എനിക്കെന്നുമൊരു അത്ഭുതമായിരുന്നു"
description = "A female speaker with a British accent delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."

description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)

generation = model.generate(input_ids=description_input_ids.input_ids, attention_mask=description_input_ids.attention_mask, prompt_input_ids=prompt_input_ids.input_ids, prompt_attention_mask=prompt_input_ids.attention_mask)
audio_arr = generation.cpu().numpy().squeeze()
sf.write("indic_tts_out.wav", audio_arr, model.config.sampling_rate)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.75G [00:00<?, ?B/s]

  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

  "_name_or_path": "ylacombe/dac_44khz",
  "architectures": [
    "DacModel"
  ],
  "codebook_dim": 8,
  "codebook_loss_weight": 1.0,
  "codebook_size": 1024,
  "commitment_loss_weight": 0.25,
  "decoder_hidden_si

generation_config.json:   0%|          | 0.00/223 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/990 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]



In [5]:
from IPython.display import Audio
Audio("indic_tts_out.wav")