In [1]:
import os
import re
from pathlib import Path
import pandas as pd

# Step 1. Data Loading


In [2]:
# Directory holding the plain-text corpus, relative to the working directory.
folder = Path('data')

# Sorted so the document order (and the row order below) is the same on every run.
text_files = sorted(folder.glob("*.txt"))

if len(text_files) == 0:
    raise FileNotFoundError(f"No .txt files found in folder: {folder.resolve()}")

# For each file keep the raw character count and a copy of the text with
# every run of whitespace collapsed to a single space.
documents, characters = [], []
for file in text_files:
    text = file.read_text(encoding="utf-8")
    characters.append(len(text))
    documents.append(re.sub(r"\s+", " ", text).strip())

# Overview table: one row per file, columns in the order listed here.
data = dict(zip(
    ("Filename", "Character Count", "Preview (first 200 chars)"),
    (
        [f.name for f in text_files],
        characters,
        [text[:200].replace("\n", " ") for text in documents],
    ),
))

pd.DataFrame(data)

Unnamed: 0,Filename,Character Count,Preview (first 200 chars)
0,article1.txt,2814,"1. Introduction In recent years, music plagiar..."
1,article2.txt,4450,"Introduction Recently, the use of various type..."
2,article3.txt,4002,Music is an art whose production is very diffi...
3,article4.txt,3401,"Introduction Aided by the internet, plagiarism..."
4,article5.txt,2009,Rhythm Plagiarism A prominent example for rhyt...


# Step 2. Text Preprocessing

In [3]:
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (only needed once; re-running just reports that the
# packages are already up to date).
# 'punkt_tab' is the tokenizer data that NLTK 3.9+ loads for sent_tokenize /
# word_tokenize; 'punkt' is kept so older NLTK releases keep working too.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize tools
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /home/hacker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hacker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hacker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def clean_text(text: str) -> str:
    """
    Normalizes a sentence or short text chunk for tokenization.

    The text is lowercased, bracketed numeric markers such as [1] are blanked,
    every character that is not a-z or whitespace is replaced by a space, and
    whitespace runs are collapsed. The result has no leading/trailing spaces.
    """
    # Applied in this order; each match is replaced by a single space.
    patterns = (
        r'\[[0-9]*\]',   # bracketed markers: [1], [2], etc.
        r'[^a-z\s]',     # anything that is not a lowercase letter or whitespace
        r'\s+',          # runs of whitespace
    )
    result = text.lower()
    for pattern in patterns:
        result = re.sub(pattern, ' ', result)
    return result.strip()

def tokenize_and_lemmatize(text: str):
    """
    Turns raw text into per-sentence lists of lemmatized tokens.

    The text is split into sentences; each sentence is cleaned with
    clean_text, word-tokenized, filtered against stop_words and lemmatized.
    Sentences that end up with no tokens are dropped.
    Returns: list of lists (one inner list of words per kept sentence)
    """
    result = []
    for sentence in sent_tokenize(text):
        lemmas = [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(clean_text(sentence))
            if token not in stop_words
        ]
        if not lemmas:
            # Nothing left after cleaning and stop-word removal.
            continue
        result.append(lemmas)

    return result


# === Example usage ===
# One entry per document: a list of sentences, each a list of lemmatized tokens.
preprocessed_docs = [tokenize_and_lemmatize(doc) for doc in documents]

# Label each report with the real source filename. text_files and documents
# share the same order, so pairing them keeps labels correct even when the
# files are not named article1.txt, article2.txt, ... in sorted order.
for source_file, doc in zip(text_files, preprocessed_docs):
    print(f"\n--- {source_file.name} ---")
    print(f"Total sentences: {len(doc)}")
    print("First 3 preprocessed sentences:")
    for sent in doc[:3]:
        print(" ", sent)



--- article1.txt ---
Total sentences: 29
First 3 preprocessed sentences:
  ['introduction', 'recent', 'year', 'music', 'plagiarism', 'become', 'serious', 'issue', 'music', 'industry']
  ['growing', 'use', 'world', 'wide', 'web', 'revenue', 'loss', 'due', 'plagiarism', 'pirate', 'copy', 'escalating', 'exponentially']
  ['korea', 'estimated', 'billion', 'dollar']

--- article2.txt ---
Total sentences: 36
First 3 preprocessed sentences:
  ['introduction', 'recently', 'use', 'various', 'type', 'multimedia', 'data', 'image', 'video', 'audio', 'shown', 'explosive', 'growth', 'content', 'based', 'search', 'became', 'great', 'importance']
  ['successful', 'content', 'based', 'search', 'indexing', 'scheme', 'query', 'processing', 'scheme', 'key', 'issue', 'considered']
  ['despite', 'great', 'advance', 'audio', 'search', 'less', 'investigated', 'either', 'image', 'video', 'search']

--- article3.txt ---
Total sentences: 33
First 3 preprocessed sentences:
  ['music', 'art', 'whose', 'production

# Step 3. Extractive summarization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [6]:
# How many sentences to keep per document, and in the final combined summary.
TOP_PER_DOC = 3
SUMMARY_SIZE = 6

# Rebuild one string per preprocessed sentence, grouped by document.
# NOTE: these are the cleaned, stop-word-free, lemmatized forms, so the
# summaries printed below are shown in that form rather than as original text.
sentences_per_doc = [
    [" ".join(tokens) for tokens in sent_list]
    for sent_list in preprocessed_docs
]

all_sentences = [sent for doc in sentences_per_doc for sent in doc]

# Fit TF-IDF across the sentences of all documents at once.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_sentences)

# Compute sentence importance = mean TF-IDF value per sentence
sentence_scores = np.array(tfidf_matrix.mean(axis=1)).ravel()

# Now split scores back by document
split_points = np.cumsum([len(doc) for doc in sentences_per_doc])
doc_scores = np.split(sentence_scores, split_points[:-1])

# --- Extract the top sentences per document ---
# Labels come from the real filenames (same order as the documents) instead of
# a synthesized "article{n}.txt", which is wrong for other naming schemes.
top_sentences_per_doc = []
for source_file, sent_list, scores in zip(text_files, sentences_per_doc, doc_scores):
    top_idx = np.argsort(scores)[-TOP_PER_DOC:][::-1]  # best first
    top_sentences = [sent_list[j] for j in top_idx]
    top_sentences_per_doc.append(top_sentences)
    print(f"\n--- Top {TOP_PER_DOC} sentences from {source_file.name} ---")
    for s in top_sentences:
        print(" ", s)

# --- Combine the per-document picks (up to TOP_PER_DOC each) and rank globally ---
all_top_sentences = [s for doc in top_sentences_per_doc for s in doc]
all_top_matrix = vectorizer.transform(all_top_sentences)
all_top_scores = np.array(all_top_matrix.mean(axis=1)).ravel()

# Pick the overall best sentences for the final summary
top6_idx = np.argsort(all_top_scores)[-SUMMARY_SIZE:][::-1]
final_summary = [all_top_sentences[i] for i in top6_idx]

print(f"\n=== Final {SUMMARY_SIZE}-sentence Summary ===")
for i, s in enumerate(final_summary, start=1):
    print(f"{i}. {s}")



--- Top 3 sentences from article1.txt ---
  system receives input polyphonic music pcm data output information plagiarized music music title time etc
  growing use world wide web revenue loss due plagiarism pirate copy escalating exponentially
  respect melody korea entertainment law society conducted survey find people considered music plagiarism

--- Top 3 sentences from article2.txt ---
  introduction recently use various type multimedia data image video audio shown explosive growth content based search became great importance
  music plagiarism detection using melody database three step query processing provides fast search ability taking three step query processing method consists index searching window stitching post processing
  unlike previous system plagiarism detection system unique characteristic follows novel similarity model solves problem misjudgment supporting alignment well shifting similarity model

--- Top 3 sentences from article3.txt ---
  using mfcc entropy mean e

# Step 4. Abstractive summarization (advanced version)

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model & tokenizer.
# The T5 classes used here expect a T5 checkpoint (e.g. "t5-small", "t5-base").
# A BART checkpoint such as "facebook/bart-large-cnn" would need the matching
# BART (or Auto*) tokenizer/model classes instead.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Combine all documents into one long text
combined_text = "\n".join(documents)

# Truncate to the model's input budget (T5 checkpoints are set up for ~512 tokens).
# NOTE(review): everything past the first 512 tokens of the combined text is
# dropped, so later documents may not influence the summary at all.
max_input_length = 512
inputs = tokenizer(
    "summarize: " + combined_text,
    return_tensors="pt",
    max_length=max_input_length,
    truncation=True
)

# Generate summary. The attention mask produced by the tokenizer is passed
# explicitly so generate() does not have to infer it from the token ids.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=150,        # output length
    min_length=60,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# Decode the single generated sequence back to text without special tokens.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\n=== Abstractive Summary ===\n")
print(summary)

  from .autonotebook import tqdm as notebook_tqdm
2025-10-24 11:32:41.011184: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



=== Abstractive Summary ===

in recent years, music plagiarism has become a serious issue in the music industry . proposed music plagiarism detection system should be a welcome news to the music industry . system receives PCM data as a query and extracts melody from it . calculates melody similarity to the music in the database and retrieves the plagiarized music .


# Step 5. Evaluation of quality

In [8]:
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [9]:
def evaluate_summary(original_texts, extractive_summary, abstractive_summary):
    """
    Prints automatic quality metrics for two summaries of the same sources.

    The source texts are joined with single spaces into one reference string.
    Each summary is scored against it with ROUGE-1 / ROUGE-2 / ROUGE-L
    (stemming enabled) and with TF-IDF cosine similarity.
    Results are printed; nothing is returned.
    """
    reference = " ".join(original_texts)
    corpus = [reference, extractive_summary, abstractive_summary]

    # --- ROUGE scores (each summary vs. the joined original) ---
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    extractive_rouge = rouge.score(reference, extractive_summary)
    abstractive_rouge = rouge.score(reference, abstractive_summary)

    # --- Cosine similarity using TF-IDF ---
    # Row/column 0 of the similarity matrix is the reference text.
    tfidf = TfidfVectorizer().fit(corpus)
    similarity = cosine_similarity(tfidf.transform(corpus))
    extractive_cos, abstractive_cos = similarity[0, 1], similarity[0, 2]

    print("=== Automatic Evaluation ===")
    print("\nROUGE-1 / ROUGE-2 / ROUGE-L")
    print("Extractive:", extractive_rouge)
    print("Abstractive:", abstractive_rouge)
    print("\nCosine similarity with original:")
    print(f"Extractive: {extractive_cos:.3f}")
    print(f"Abstractive: {abstractive_cos:.3f}")

evaluate_summary(documents, " ".join(final_summary), summary)


=== Automatic Evaluation ===

ROUGE-1 / ROUGE-2 / ROUGE-L
Extractive: {'rouge1': Score(precision=0.986013986013986, recall=0.05276946107784431, fmeasure=0.10017761989342805), 'rouge2': Score(precision=0.5, recall=0.026581804567577687, fmeasure=0.05047991468183435), 'rougeL': Score(precision=0.5594405594405595, recall=0.029940119760479042, fmeasure=0.056838365896980464)}
Abstractive: {'rouge1': Score(precision=1.0, recall=0.020209580838323353, fmeasure=0.03961848862802641), 'rouge2': Score(precision=0.9433962264150944, recall=0.018719580681392737, fmeasure=0.03671071953010279), 'rougeL': Score(precision=1.0, recall=0.020209580838323353, fmeasure=0.03961848862802641)}

Cosine similarity with original:
Extractive: 0.175
Abstractive: 0.628
