## Importing modules 

In [75]:
# Import necessary libraries for text clustering
# Import stopwords to remove common words that may not be useful for clustering
from nltk.corpus import stopwords 
# Import cosine_distance function to calculate cosine distance between vectors
from nltk.cluster.util import cosine_distance  
# Import numpy for numerical operations
import numpy as np  
# Import networkx for graph-based operations
import networkx as nx 


## Text 1

## Text Preprocessing for Japanese Text 

In [76]:
import re

file_path = "C:\\Users\\pbhar\\Downloads\\Japaneese lang.txt"

# Open the file with UTF-8 encoding
with open(file_path, "r", encoding="utf-8") as file:
    # Read the file contents
    filedata = file.read()

# Define the regex pattern to match Japanese sentence terminators
regex_pattern = r"。|？|！"

# Split the text into sentences using the regex pattern
sentences = re.split(regex_pattern, filedata)

# Clean up empty sentences and remove leading/trailing spaces
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Print the sentences
for sentence in sentences:
    print(sentence)


﻿受肉した御言葉の中心は、神の救い主が永遠の御父と全人類を絶え間なく愛する三重の愛の主なしるしであり象徴であると、当然かつ正当に考えられています
それは、神が御父と聖霊と分かち合っている神の愛の象徴ですが、言葉が受肉した神だけが、弱く朽ちる体を通して現されます
なぜなら、「神の完全性が神の内に肉体的に宿っている」からです
さらに、それは彼の魂に注入され、キリストの人間の意志を豊かにし、至福のビジョンと直接注入されたものの両方から得られる最も完全な知識によってその行為を啓発し、統治する、その燃えるような愛の象徴です


## list of sentences

In [77]:
print("Sentences are", sentence)

Sentences are さらに、それは彼の魂に注入され、キリストの人間の意志を豊かにし、至福のビジョンと直接注入されたものの両方から得られる最も完全な知識によってその行為を啓発し、統治する、その燃えるような愛の象徴です


## Function to caluculate similarity

In [78]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Creating the similarity Matrix

In [79]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.61026443 0.50311529 0.60332514]
 [0.61026443 0.         0.46556694 0.48323417]
 [0.50311529 0.46556694 0.         0.5247782 ]
 [0.60332514 0.48323417 0.5247782  0.        ]]


## Ranking sentences in Similarity matrix

In [80]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.2668451007030116, 1: 0.24495359805436812, 2: 0.23597296876557858, 3: 0.2522283324770418}


## Sort Sentences by pagerank

In [81]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.2668451007030116, '\ufeff受肉した御言葉の中心は、神の救い主が永遠の御父と全人類を絶え間なく愛する三重の愛の主なしるしであり象徴であると、当然かつ正当に考えられています'), (0.2522283324770418, 'さらに、それは彼の魂に注入され、キリストの人間の意志を豊かにし、至福のビジョンと直接注入されたものの両方から得られる最も完全な知識によってその行為を啓発し、統治する、その燃えるような愛の象徴です'), (0.24495359805436812, 'それは、神が御父と聖霊と分かち合っている神の愛の象徴ですが、言葉が受肉した神だけが、弱く朽ちる体を通して現されます'), (0.23597296876557858, 'なぜなら、「神の完全性が神の内に肉体的に宿っている」からです')]


## Pick the top 'n' sentences

In [82]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  1


## summarization

In [83]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 ﻿ 受 肉 し た 御 言 葉 の 中 心 は 、 神 の 救 い 主 が 永 遠 の 御 父 と 全 人 類 を 絶 え 間 な く 愛 す る 三 重 の 愛 の 主 な し る し で あ り 象 徴 で あ る と 、 当 然 か つ 正 当 に 考 え ら れ て い ま す


## Text 2

## Text Preprocessing for Polish Text 

In [84]:
import re

file_path = r"C:\Users\pbhar\Jupiter files\Polish.txt"
with open(file_path, "r") as file:
    filedata = file.readline()  # read only the first line

article = filedata.split(". ")  # Split into sentences

sentences = []
for sentence in article:
    print(sentence)
    cleaned_sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # Remove non-alphabetic characters
    sentences.append(cleaned_sentence.split())  # Split into words


ï»¿Tradycyjnie firmy wykorzystywaÅ‚y swojÄ… obecnoÅ›Ä‡ w sklepach stacjonarnych, aby zrozumieÄ‡ swoich klientÃ³w â€“ jak ich przyciÄ…gnÄ…Ä‡, zaangaÅ¼owaÄ‡ i zachwyciÄ‡.



## list of sentences

In [85]:
print("Sentences are ", sentences)

Sentences are  [['Tradycyjnie', 'firmy', 'wykorzystyway', 'swoj', 'obecno', 'w', 'sklepach', 'stacjonarnych', 'aby', 'zrozumie', 'swoich', 'klientw', 'jak', 'ich', 'przycign', 'zaangaowa', 'i', 'zachwyci']]


## Function tom Caluculate the similarity

In [86]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Creating Similarity Matrix 

In [87]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.]]


## Ranking Sentences in Similarity Matrix

In [88]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 1.0}


## Sorting Sentences by PageRank 

In [89]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(1.0, ['Tradycyjnie', 'firmy', 'wykorzystyway', 'swoj', 'obecno', 'w', 'sklepach', 'stacjonarnych', 'aby', 'zrozumie', 'swoich', 'klientw', 'jak', 'ich', 'przycign', 'zaangaowa', 'i', 'zachwyci'])]


## Pick the top sentences

In [90]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []

for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  1


## Summarization

In [91]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 Tradycyjnie firmy wykorzystyway swoj obecno w sklepach stacjonarnych aby zrozumie swoich klientw jak ich przycign zaangaowa i zachwyci


## Text 3

## Text Preprocessing for Slovak Text

In [92]:
file_path = r"C:\Users\pbhar\Jupiter files\Slovak.txt"
with open(file_path, "r", encoding="utf-8") as file:
    filedata = file.readline()  # read only the first line

article = filedata.split(". ")  # Split into sentences

sentences = []
for sentence in article:
    print(sentence)
    cleaned_sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # Remove non-alphabetic characters
    sentences.append(cleaned_sentence.split())  # Split into words


﻿Firmy tradične využívajú svoju prítomnosť v kamenných predajniach, aby pochopili svojich zákazníkov – ako ich prilákať, zaujať a potešiť.



## list of sentences

In [93]:
print("Sentences are ", sentences)

Sentences are  [['Firmy', 'tradine', 'vyuvaj', 'svoju', 'prtomnos', 'v', 'kamennch', 'predajniach', 'aby', 'pochopili', 'svojich', 'zkaznkov', 'ako', 'ich', 'prilka', 'zauja', 'a', 'potei']]


## Function Calculate the similarity

In [94]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
    

## Creating the similarity Matrix

In [95]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.]]


## Ranking Sentences in Similarity Matrix

In [96]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 1.0}


## Sorting Sentences by pageRank

In [97]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(1.0, ['Firmy', 'tradine', 'vyuvaj', 'svoju', 'prtomnos', 'v', 'kamennch', 'predajniach', 'aby', 'pochopili', 'svojich', 'zkaznkov', 'ako', 'ich', 'prilka', 'zauja', 'a', 'potei'])]


## Pick the Top Sentences

In [99]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  1


## Summarization

In [100]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 Firmy tradine vyuvaj svoju prtomnos v kamennch predajniach aby pochopili svojich zkaznkov ako ich prilka zauja a potei
