# Import Libraries
**nltk**: For natural language processing tasks.

**pandas**: To manipulate and analyze structured data (e.g., text data in tabular form).

**sent_tokenize**: Tokenizes (splits) text into sentences.

**word_tokenize**: Tokenizes text into words.

**Counter**: Counts occurrences of elements in a list, useful for frequency analysis.

**re**: Provides support for regular expressions, used for pattern matching in text.

**punkt**: A pre-trained NLTK model for sentence tokenization. It must be downloaded before `sent_tokenize` and `word_tokenize` can be used.

In [5]:
#IMPORT LIBRARIES :::

# # Install necessary libraries (if not already installed)
# !pip install nltk pandas

# # Import libraries
# import nltk
# nltk.download('punkt_tab', download_dir='/root/nltk_data')
# import pandas as pd
# from nltk.tokenize import sent_tokenize, word_tokenize
# from collections import Counter
# import re
# # import nltk
# # import re
# # from nltk.tokenize import sent_tokenize

# # Download necessary NLTK data
# nltk.download('punkt')


# Install necessary libraries (if not already installed)
!pip install nltk pandas

# Import libraries (standard library first, then third-party)
import re
from collections import Counter

import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the tokenizer data used by sent_tokenize / word_tokenize.
# This must run after `import nltk`: calling nltk.download() before the
# import raises NameError on a freshly started kernel.
nltk.download('punkt')
# NOTE(review): download_dir is hard-coded to the root user's home directory;
# confirm this path when running outside that environment.
nltk.download('punkt_tab', download_dir='/root/nltk_data')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Setting up the foundation for text summarization

This step covers:

- Efficiently loading a large dataset in chunks.
- Inspecting the structure and quality of the data to understand the columns (`id`, `article`, `highlights`).
- Preparing the data for further processing, such as tokenization and summarization.


In [2]:
# Open the CSV as a lazy reader so rows are parsed one chunk at a time
file_path = "train.csv"  # Replace with the actual path
chunk_size = 1000  # Rows per chunk; adjust to the available memory
chunks = pd.read_csv(
    file_path,
    on_bad_lines='skip',
    chunksize=chunk_size,
)

# Pull the first chunk from the reader and show its leading rows as a sanity check
first_chunk = next(chunks)
preview = first_chunk.head()
print(preview)

# Lift pandas' display limits; this only affects output printed from here on
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   
2  00027e965c8264c35cc1bc55556db388da82b07f   
3  0002c17436637c4fe1837c935c04de47adb18e9a   
4  0003ad6ef0c37534f80b55b4235108024b407f0b   

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   

                                          highlights  
0  Bishop John Folda, of North Dakota, is taking ...  
1  Criminal complaint: Cop used his role to help ...  
2  Craig Eccleston-Todd, 27, had drunk at least t...  
3  Nina dos Santos says Europe must be ready to a...  
4  Fleetwood top of League One after 2-0 win at S...  


In [10]:
def clean_text(text):
    r"""Collapse every run of non-word characters in *text* into one space.

    Whitespace, punctuation and other special characters are all non-word
    characters (``\W``), so one substitution both squeezes extra spaces and
    strips special characters. Letters, digits and underscores are kept.

    Args:
        text: The raw string to clean. Values that are not strings (for
            example ``None`` or the float NaN pandas uses for a missing
            cell) are treated as empty text.

    Returns:
        The cleaned string without leading or trailing spaces; ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        # re.sub() raises TypeError on non-strings such as NaN; a missing
        # article simply has nothing to clean.
        return ''
    # \W already matches whitespace, so this single pass gives the same result
    # as collapsing spaces first and stripping special characters second.
    return re.sub(r'\W+', ' ', text).strip()

def _split_into_clean_sentences(article):
    """Split one raw article into sentences, then clean each sentence.

    Returns a list of cleaned, non-empty sentence strings.
    """
    # Sentence boundaries are detected on the raw text: clean_text() strips the
    # '.', '?' and '!' that sent_tokenize() relies on, so tokenizing the cleaned
    # article would return the whole article as one single "sentence".
    cleaned = (clean_text(sentence) for sentence in sent_tokenize(article))
    # Drop sentences that consisted only of punctuation/whitespace.
    return [sentence for sentence in cleaned if sentence]


# Read the CSV chunk by chunk, adding a cleaned copy of each article and its
# list of cleaned sentences.
processed_chunks = []
try:
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, on_bad_lines='warn', encoding='utf-8'):
        # Missing articles are read as NaN; treat them as empty text so a
        # single bad row does not abort the whole run.
        articles = chunk['article'].fillna('').astype(str)
        chunk['cleaned_article'] = articles.apply(clean_text)
        chunk['sentences'] = articles.apply(_split_into_clean_sentences)
        processed_chunks.append(chunk)
except Exception as e:
    # Best effort: report the failure and keep the chunks processed so far.
    print(f"Error occurred: {e}")

if not processed_chunks:
    # pd.concat() would fail here anyway; raise with a message that names the cause.
    raise ValueError(f"No chunks were processed from {file_path!r}; nothing to concatenate.")

data = pd.concat(processed_chunks)
pd.set_option('display.max_colwidth', None)  # Show full cell contents instead of truncating
print(data[['article', 'sentences']].head(10))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

# Preparing the scoring foundation for summarization
High sentence scores indicate that the sentence contains words frequently used in the document, making it potentially more important.

In [11]:
# Assign scores to sentences based on word frequency.
# Word frequencies are normalized by the total number of tokens.
from nltk.probability import FreqDist

# Build a table of relative token frequencies
def calculate_word_frequencies(sentences):
    """Return a FreqDist mapping each token to its share of all tokens.

    Args:
        sentences: Iterable of sentence strings; each one is split into
            tokens with ``word_tokenize``.

    Returns:
        A ``FreqDist`` whose values are token counts divided by the total
        number of tokens seen across all sentences.
    """
    counts = FreqDist()
    token_total = 0
    for tokens in map(word_tokenize, sentences):
        counts.update(tokens)
        token_total += len(tokens)

    # Turn raw counts into fractions of the whole token stream. The keys are
    # snapshotted first; only existing values are overwritten.
    for token in list(counts):
        counts[token] = counts[token] / token_total
    return counts

# Score each sentence by adding up the normalized frequencies of its tokens
def assign_sentence_scores(sentences, freq_dist):
    """Return one score per sentence, in the same order as *sentences*.

    Args:
        sentences: Iterable of sentence strings.
        freq_dist: Mapping from token to its frequency; only ``.get`` is used.

    Returns:
        A list with the summed token frequencies of each sentence.
    """
    scores = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        # Tokens missing from freq_dist contribute 0 to the sentence score.
        scores.append(sum(freq_dist.get(token, 0) for token in tokens))
    return scores

# Gather every sentence of every article, then build one frequency table from them
corpus_sentences = []
for article_sentences in data['sentences']:
    corpus_sentences.extend(article_sentences)
freq_dist = calculate_word_frequencies(corpus_sentences)


def _score_article(sentence_list):
    """Score one article's sentences against the shared frequency table."""
    return assign_sentence_scores(sentence_list, freq_dist)


# Store the per-sentence scores next to the sentences they belong to
data['sentence_scores'] = data['sentences'].apply(_score_article)

# Print the first rows of both columns for verification
preview_columns = ['sentences', 'sentence_scores']
print(data[preview_columns].head())


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        