In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Toy corpus: each list element is one sentence of the text to be summarized.
data = [
    "T-SR is an extractive summarization method based on sentence regression.",
    "It uses sentence length, position, and TF/IDF information as features.",
    "The regression model is trained using a set of reference summaries.",
    "The model selects the most important sentences to form the summary.",
]

In [27]:
# Build the TF-IDF representation of the corpus: the vectorizer is fitted on
# `data` and the same sentences are transformed in one call, giving one row
# per sentence in X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=data)

In [28]:
# Surface features, one value per sentence (one sentence per row of X).
n_sentences = X.shape[0]

# Length feature: number of characters in the raw sentence string, as float.
sentence_length = np.array(
    [len(data[idx]) for idx in range(n_sentences)], dtype=float
)

# Position feature: sentence index divided by the sentence count, so the
# first sentence gets 0.0 and later sentences approach (but never reach) 1.0.
sentence_position = np.arange(n_sentences) / float(n_sentences)

In [29]:
# Assemble the (n_sentences, 3) feature matrix, one column per feature:
#   column 0 - sentence length
#   column 1 - relative sentence position
#   column 2 - largest TF-IDF weight found in the sentence's row of X
max_tfidf = X.max(axis=1).toarray().ravel()
X_features = np.column_stack([sentence_length, sentence_position, max_tfidf])

In [36]:
# Fit a linear regression that maps the sentence features to an importance
# target taken from the reference summaries.
# y holds one binary label per sentence: 1 = important, 0 = not important.
y = np.array([1, 0, 1, 0])
model = LinearRegression()
model.fit(X=X_features, y=y)

In [37]:
# Score every sentence with the fitted model and keep the top-ranked ones
# as the extractive summary.
N_SUMMARY_SENTENCES = 2  # number of sentences the summary keeps

scores = model.predict(X_features)  # one predicted importance value per sentence

# np.argsort sorts ascending, so the result is reversed to rank sentences from
# highest to lowest score; the slice then keeps the first N indices (or all of
# them when there are fewer than N sentences).
summary_idx = np.argsort(scores)[::-1][:N_SUMMARY_SENTENCES]

# Selected sentences, listed in descending score order.
summary = [data[i] for i in summary_idx]

In [38]:
# Show the summary: a header line followed by one "- "-prefixed line per
# selected sentence.
print("Summary:", *("- " + sentence for sentence in summary), sep="\n")

Summary:
- T-SR is an extractive summarization method based on sentence regression.
- The regression model is trained using a set of reference summaries.
