# Summarizer Model Training
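This notebook trains a TF-IDF-based extractive text summarization model: it splits a sample text into sentences, fits a TF-IDF vectorizer on them, and saves the fitted vectorizer to disk.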

In [1]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the sentence-tokenizer resources used by sent_tokenize.
# Older NLTK releases load the pickled 'punkt' models, while NLTK 3.8.2+
# looks for 'punkt_tab' instead, so fetch both to work across versions.
# 'punkt' is downloaded last so the cell still displays its True result.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Small demonstration corpus; swap in a larger dataset for real training.
# Each source line ends with an explicit newline, matching a multi-line literal.
sample_text = (
    "Artificial Intelligence (AI) is intelligence demonstrated by machines.\n"
    "It contrasts with the natural intelligence displayed by humans and animals.\n"
    "AI is used in various applications such as robotics, computer vision, and natural language processing.\n"
    "The goal of AI is to create systems that can function intelligently and independently.\n"
    "Machine learning is a subset of AI that enables computers to learn from data.\n"
)

# Split the corpus into a list of individual sentences and show the result.
sentences = sent_tokenize(sample_text)
print("Sentences:", sentences)

Sentences: ['Artificial Intelligence (AI) is intelligence demonstrated by machines.', 'It contrasts with the natural intelligence displayed by humans and animals.', 'AI is used in various applications such as robotics, computer vision, and natural language processing.', 'The goal of AI is to create systems that can function intelligently and independently.', 'Machine learning is a subset of AI that enables computers to learn from data.']


In [5]:
# Fit a TF-IDF vectorizer on the tokenized sentences, dropping English stop words.
# X is the resulting sentence-by-term TF-IDF matrix.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# Single source of truth for where the fitted vectorizer is written, so the
# directory creation, the file write, and the status message cannot drift apart.
# The path is relative to the notebook's current working directory.
vectorizer_path = "saved_model/vectorizer.pkl"

# Create the output directory if it doesn't exist yet.
os.makedirs(os.path.dirname(vectorizer_path), exist_ok=True)

# Persist the fitted vectorizer so it can be reloaded without refitting.
with open(vectorizer_path, "wb") as f:
    pickle.dump(vectorizer, f)
print(f"Vectorizer saved to {vectorizer_path}")

Vectorizer saved to TASK1/saved_model/vectorizer.pkl


In [7]:
import pickle

# Read the pickled vectorizer back from disk to confirm the save round-trips.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("saved_model/vectorizer.pkl", "rb") as model_file:
    loaded_vectorizer = pickle.load(model_file)

# Print a header line followed by the vectorizer's vocabulary_ mapping.
print(
    "Vocabulary loaded from saved vectorizer:",
    loaded_vectorizer.vocabulary_,
    sep="\n",
)


Vocabulary loaded from saved vectorizer:
{'artificial': 3, 'intelligence': 16, 'ai': 0, 'demonstrated': 9, 'machines': 22, 'contrasts': 6, 'natural': 23, 'displayed': 10, 'humans': 14, 'animals': 1, 'used': 28, 'various': 29, 'applications': 2, 'robotics': 25, 'computer': 4, 'vision': 30, 'language': 18, 'processing': 24, 'goal': 13, 'create': 7, 'systems': 27, 'function': 12, 'intelligently': 17, 'independently': 15, 'machine': 21, 'learning': 20, 'subset': 26, 'enables': 11, 'computers': 5, 'learn': 19, 'data': 8}
