In [1]:
from summa import summarizer
from summa import keywords

def text_rank_summarize(text, ratio=0.2):
    """Summarize ``text`` with summa's ``summarizer.summarize``.

    Args:
        text: The text to summarize.
        ratio: Forwarded unchanged to ``summarizer.summarize`` as the
            ``ratio`` keyword argument (defaults to 0.2).

    Returns:
        Whatever ``summarizer.summarize`` returns for these arguments.
    """
    return summarizer.summarize(text, ratio=ratio)

def text_rank_extract_keywords(text, ratio=0.2):
    """Extract keywords from ``text`` with summa's ``keywords.keywords``.

    Args:
        text: The text to extract keywords from.
        ratio: Forwarded unchanged to ``keywords.keywords`` as the
            ``ratio`` keyword argument (defaults to 0.2).

    Returns:
        list[str]: One entry per non-blank line of the newline-separated
        string returned by ``keywords.keywords``. An empty result yields
        an empty list.
    """
    raw_keywords = keywords.keywords(text, ratio=ratio)
    # ''.split('\n') evaluates to [''], so blank lines are filtered out to
    # avoid reporting a single empty "keyword" when nothing was extracted.
    return [line for line in raw_keywords.split('\n') if line.strip()]

# input from the user without gui
input_text = input("Enter your input text: ")

# Summarization
summary = text_rank_summarize(input_text)
print("Summary:")
print(summary)
print()

# Keyword extraction
# The result is deliberately NOT stored in a variable called `keywords`:
# that name is the module imported via `from summa import keywords`, and
# rebinding it to a list would make every later call to
# text_rank_extract_keywords() fail when it evaluates `keywords.keywords(...)`.
extracted_keywords = text_rank_extract_keywords(input_text)
print("Extracted Keywords:")
print(extracted_keywords)


Summary:
'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding  on planes lead to more serious issues than fighting for space in the overhead lockers, crashing elbows and seat back kicking?
Tests conducted by the FAA use planes with a  31 inch pitch, a standard which on some airlines has decreased .
But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News.

Extracted Keywords:
['inch', 'inches', 'seat', 'plane seats', 'airlines', 'planes', 'air', 'humans', 'humane', 'human', 'said', 'consumer', 'aviation', 'tests', 'lockers crashing', 'offer', 'offers', 'set']


In [None]:
import re
from collections import Counter
import math

def get_cosine_similarity(text1, text2):
    """Return the cosine similarity of two texts' word-count vectors.

    Each text is lowercased, every run of non-word characters (``\\W+``) is
    replaced by a single space, and the result is split on whitespace. The
    per-word occurrence counts of each text form the vectors that are
    compared.

    Args:
        text1: First text (str).
        text2: Second text (str).

    Returns:
        float: ``dot(v1, v2) / (|v1| * |v2|)``. Returns ``0.0`` when either
        text contains no word tokens (e.g. it is empty or punctuation-only),
        because the cosine is undefined for a zero-length vector and the
        division would otherwise raise ``ZeroDivisionError``.
    """
    # Normalize, tokenize and count in one step per text.
    word_freq1 = Counter(re.sub(r'\W+', ' ', text1.lower()).split())
    word_freq2 = Counter(re.sub(r'\W+', ' ', text2.lower()).split())

    # Guard: a text without tokens has a zero-length vector.
    if not word_freq1 or not word_freq2:
        return 0.0

    # Only words present in both texts contribute a non-zero product, so
    # iterating one counter and looking the word up in the other suffices.
    dot_product = sum(
        count * word_freq2.get(word, 0) for word, count in word_freq1.items()
    )

    # Euclidean norm of each count vector.
    length1 = math.sqrt(sum(count * count for count in word_freq1.values()))
    length2 = math.sqrt(sum(count * count for count in word_freq2.values()))

    return dot_product / (length1 * length2)

# Example usage: compare two short sentences that differ in a single word
text1, text2 = "This is the first text.", "This is the second text."
similarity_score = get_cosine_similarity(text1, text2)

print("Similarity score:", similarity_score)


In [2]:
import pandas as pd
!pip install sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from sumy.summarizers.text_rank import TextRankSummarizer
import random 

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 7.5 MB/s  eta 0:00:01
[?25hCollecting requests>=2.7.0
  Downloading requests-2.31.0-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 3.3 MB/s  eta 0:00:01
[?25hCollecting pycountry>=18.2.23
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 47.9 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting docopt<0.7,>=0.6.1
  Downloading docopt-0.6.2.tar.gz (25 kB)
Collecting breadability>=0.1.20
  Downloading breadability-0.1.20.tar.gz (32 kB)
Collecting nltk>=3.0.2
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 9.9 MB/s eta 0:00:01
[?25hCollecting chardet
  Downloading chardet-5.1.0-py3-none-any.whl (199 k



In [None]:
# Read Dataframe
# NOTE(review): the path ends with '/' and therefore looks like a directory
# rather than a CSV file; pd.read_csv needs a file path -- confirm the
# intended file name. Later cells read a 'Text' column from this frame.
df = pd.read_csv('../data/BBC_News_Summary/')

# Show Dataframe
# Last expression of the cell, so the notebook displays the first rows.
df.head()

In [None]:
# Initialize the summarizers
text_rank_summarizer = TextRankSummarizer()
lsa_summarizer = LsaSummarizer()
lex_rank_summarizer = LexRankSummarizer()
# NOTE(review): get_stop_words is imported but not applied to any summarizer
# in the visible cells -- confirm whether stop words were meant to be set.
approaches = [text_rank_summarizer, lsa_summarizer, lex_rank_summarizer]

# Define the compression rates
compression_rates = [0.1, 0.3, 0.5, 0.7]  # Adjust the values as desired

# Dedicated, seeded generator so the random approach/rate assignment is
# reproducible on "Restart & Run All" without touching the global random state.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

# Initialize counter
approach_counts = {type(candidate).__name__: 0 for candidate in approaches}
compression_rate_counts = {rate: 0 for rate in compression_rates}

# The tokenizer does not depend on the row, so build it once outside the loop.
tokenizer = Tokenizer("english")

# Summaries are collected in row order and written to the dataframe in one
# assignment instead of one df.loc write per row.
summaries = []
for text in df['Text']:
    approach = rng.choice(approaches)
    compression_rate = rng.choice(compression_rates)

    # Update counter
    approach_counts[type(approach).__name__] += 1
    compression_rate_counts[compression_rate] += 1

    # Tokenize the text
    sentences = tokenizer.to_sentences(text)

    # Calculate the number of sentences for the chosen compression rate.
    # int() truncates, so short texts would otherwise request 0 sentences
    # and get an empty summary; always ask for at least one.
    num_sentences = max(1, int(len(sentences) * compression_rate))

    # Generate the summary. The chosen summarizer is called through `approach`
    # directly; binding it to the name `summarizer` would overwrite the module
    # imported with `from summa import summarizer` and break
    # text_rank_summarize().
    parser = PlaintextParser.from_string(text, tokenizer)
    summary = approach(parser.document, num_sentences)

    summaries.append(' '.join(str(sentence) for sentence in summary))

# Store the summaries in the "Summary" column
df['Summary'] = summaries

# Calculate total summaries
total_summaries = len(df)

# Calculate the percentages (0.0 for an empty dataframe instead of dividing by zero)
approach_percentages = {
    name: (count / total_summaries) * 100 if total_summaries else 0.0
    for name, count in approach_counts.items()
}
compression_rate_percentages = {
    rate: (count / total_summaries) * 100 if total_summaries else 0.0
    for rate, count in compression_rate_counts.items()
}

# Print the results
for approach_name, percentage in approach_percentages.items():
    print(f"The approach {approach_name} was used for {percentage:.2f}% of the summaries.")

for rate, percentage in compression_rate_percentages.items():
    print(f"The compression rate {rate} was used for {percentage:.2f}% of the summaries.")