# DEPENDENCIES

In [34]:
from sqlalchemy import create_engine
from dotenv import load_dotenv
from nltk.corpus import words
from nostril import nonsense
from tqdm.auto import tqdm
from openai import OpenAI
from io import BytesIO
from PIL import Image
import pandas as pd
import pytesseract
import requests
import random
import math
import nltk
import os
import re

# FUNCTIONS

In [120]:
def sort_chapters(data: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    chapters = []
    for item in data['chapter']:
        name = item.split('-')
        if len(name) > 2:
            new_name = name[1] + '.' + name[-1]
        else:
            new_name = name[-1]
        chapters.append(float(new_name))

    for i in range(len(chapters)):
        data['chapter'][i] = chapters[i]

    data = data.sort_values(by='chapter')
    
    return data


def extract_text_from_image(image_url: str) -> str:
    try:
        response = requests.get(image_url)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        text = pytesseract.image_to_string(image)

        return text

    except Exception as e:
        return f"Error: {str(e)}"


def text_formatter(text: str) -> str:
    text = text.split('\n\n')
    new_text = []
    for i in text:
        new_text.append(i.replace('\n', ' '))
    return new_text


def clean_text(txt: list[list[list[str]]]) -> list[list[list[str]]]:
    unwanted_chars = [
    ' ', '  ', '?', '!', '...', '.', ':', ',', '-', '_', '™', '“', '”', '—', "'", '"', '¢'
    ]

    for idx_1, sentences_list in tqdm(enumerate(txt)):
        for idx_2, sentences in enumerate(sentences_list):
            for idx_3, sentence in enumerate(sentences):
                for char in unwanted_chars:
                    sentence = sentence.replace(char, '')
                
                # handle short sentences
                if len(sentence) < 7:
                    chars = len(sentence)
                    difference = 8 - chars
                    new_text = sentence + ('a' * difference)
                    if nonsense(new_text):
                        txt[idx_1][idx_2].pop(idx_3)
                else:
                    if nonsense(sentence):
                        txt[idx_1][idx_2].pop(idx_3)

    return txt


def extract_and_format_text(dataframe: pd.core.frame.DataFrame) -> list[list[list[str]]]:
    webp = []
    for item in dataframe['chapters']:
        webp.append(item)
        
    symbols_to_remove = ['@', '#', '$', 
                        '&', '€', '=', 
                        ':', '(', ')', 
                        '{', '}', '|', 
                        '+', '~', '/', 
                        '`', '‘', '\\', 
                        '>', '<', '™', 
                        '©', '®', '[', 
                        ']', '¥', '*', 
                        '»', '%']
    
    pattern = "[" + re.escape("".join(symbols_to_remove)) + "]"

    text = []
    temp_text = []

    # extractor
    for lists in tqdm(webp):
        # loop through each webpage per chapter
        for img in lists:
            raw_text = extract_text_from_image(img)
            temp_text.append(text_formatter(raw_text))
        text.append(temp_text)
        temp_text = []

    # formatter
    for idx_1, k in enumerate(text):
        for idx_2, i in enumerate(k):
            for idx_3, j in enumerate(i):
                text[idx_1][idx_2][idx_3] = re.sub(pattern, '', text[idx_1][idx_2][idx_3])
                text[idx_1][idx_2][idx_3] = re.sub(r'\d', '', text[idx_1][idx_2][idx_3])

    # cleaner
    for idx_1, lists in tqdm(enumerate(text)):
        for idx_2, sentences in enumerate(lists):
            for idx_3, sentence in enumerate(sentences):
                if 'Asura' in sentence or 'asura' in sentence or 'recruiting' in sentence:
                    text[idx_1][idx_2].pop(idx_3)
                if sentence.isspace() == True or sentence == '':
                    text[idx_1][idx_2].pop(idx_3)

    # eliminator
    for idx_1, k in tqdm(enumerate(text)):
        for idx_2, i in enumerate(k):
            if not i:
                text[idx_1].pop(idx_2)

    text = clean_text(text)

    return text


def insert_text(webp_text: list[list[list[str]]], 
                            dataframe: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    dataframe = dataframe.assign(text=webp_text)

    return dataframe


def is_sensible_string(s: str) -> bool:
    english_words = set(words.words())
    tokens = nltk.word_tokenize(s)
    valid_word_count = sum(1 for token in tokens if token.lower() in english_words)

    return valid_word_count / max(len(tokens), 1) >= 0.4


def filter_sensible_strings(strings: str) -> list[str]:
    return [s for s in strings if is_sensible_string(s)]


def polish_text(sample_text: list[list[list[str]]]) -> list[list[list[str]]]:
    clean_text = []
    temp_clean_text = []

    for lists in tqdm(sample_text):
        for sentences in lists:
            temp_clean_text.append(filter_sensible_strings(sentences))
        clean_text.append(temp_clean_text)
        temp_clean_text = []

    for idx_1, k in tqdm(enumerate(clean_text)):
        for idx_2, i in enumerate(k):
            if not i:
                clean_text[idx_1].pop(idx_2)

    return clean_text


def get_chapter_summary(chapters_text: list[list[list[str]]]) -> list[str]:
    corpus = []
    temp_corpus = ''
    summaries = []

    API_KEY = os.environ.get('OPENAI_APIKEY')
    client = OpenAI(api_key=API_KEY)

    for content in chapters_text:
        for sentences in content:
            for sentence in sentences:
                temp_corpus += sentence
        corpus.append(temp_corpus)
        temp_corpus = ''

    for chapter in tqdm(corpus):
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "you are a professional manga story chapter summarizer, which uses the characters dialogue to summarize each chapter. Use the chapter dialogues provided by the user to summarize the chapter story"},
                {
                    "role": "user",
                    "content": chapter
                }
            ]
        )

        summary = completion.choices[0].message.content
        summary = summary.replace("\"", '')
        summary = summary.replace("\n", '')
        summaries.append(summary.replace("\n\n", ''))

    return summaries


def get_summary_whole(corpus: str, client: OpenAI) -> str:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "you are a professional manga story summarizer, which uses the dialogue in each chapter to summarize the whole manga. Use the chapters dialogues provided by the user to summarize the whole story"},
            {
                "role": "user",
                "content": corpus
            }
        ]
    )

    summary = completion.choices[0].message.content
    summary = summary.replace("\"", '')
    summary = summary.replace("\n", '')
    summary = summary.replace("\n\n", '')

    return summary


def get_summary_partial(corpus: str, client: OpenAI, rep: int, end_point: int) -> str:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"you are a professional manga story summarizer, use the dialogue in each chapter to summarize the whole manga section. This is section {rep + 1} of {end_point} sections."},
            {
                "role": "user",
                "content": corpus
            }
        ]
    )

    summary = completion.choices[0].message.content
    summary = summary.replace("\"", '')
    summary = summary.replace("\n", '')
    summary = summary.replace("\n\n", '')

    return summary


def get_summary_from_partial(corpus: str, client: OpenAI, division: int) -> str:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"you are a professional manga story summarizer, which uses the summary of different manga sections to summarize the whole manga. Use the summary of {division} sections to summarise the whole story"},
            {
                "role": "user",
                "content": corpus
            }
        ]
    )

    summary = completion.choices[0].message.content
    summary = summary.replace("\"", '')
    summary = summary.replace("\n", '')
    summary = summary.replace("\n\n", '')

    return summary


def separator_machine(end: int, count: int, separator: int, elements: list) -> list:
    next_step = (separator * count) + separator

    if next_step > end:
        result = elements[(separator * count):]
    else:
        result = elements[(separator * count):next_step]
    
    return result


def extract_corpus(clean_text: list[list[list[str]]], mode: int) -> str:
    corpus = ''
    if mode == 0:
        for web_p, content in enumerate(clean_text):
            title = f' chapter {web_p + 1} - '
            corpus += title
            for sentences in content:
                for sentence in sentences:
                    corpus += sentence
    else:
        for web_p, content in enumerate(clean_text):
            title = f'new chapter - '
            corpus += title
            for sentences in content:
                for sentence in sentences:
                    corpus += sentence
    
    return corpus


def get_summary(clean_text: list[list[list[str]]]) -> str:
    API_KEY = os.environ.get('OPENAI_APIKEY')
    client = OpenAI(api_key=API_KEY)

    corpus = extract_corpus(clean_text, 0)

    # check for corpus tokens
    token_limit = 100000 # 128000
    corpus_tokens = len(corpus) / 3.06

    if corpus_tokens > token_limit:
        # calculates the number of batches to split the list
        div = (corpus_tokens / token_limit) + 1
        div = math.ceil(div)

        # separates the list of sentences
        separator = len(clean_text) // div

        split_list = []
        divided_corpus = []
        summary_aggregate = ''
        separation_count = 0

        for i in range(div):
            # first element 
            if i == 0:
                split_list.append(clean_text[:separator])
                separation_count += separator

            # last and in between elements
            else:
                split_list.append(separator_machine(len(clean_text), i, separator, clean_text))
                separation_count += separator

        # got the split list, now get each corpus
        for lists in split_list:
            divided_corpus.append(extract_corpus(lists, 1))

        # got the split corpus, now let's make the summary
        for i in tqdm(range(div)):
            summary_aggregate += get_summary_partial(divided_corpus[i], client, i, div)

        # finally
        return get_summary_from_partial(summary_aggregate, client, div)
    else:
        return get_summary_whole(corpus, client)

# TOP 5 MANGAS (with less than 120 chapters)

## START THE SQL ENGINE

This cell was supposed to find 5 popular manga titles, but the database was altered during the development of the code. This cell currently does not return anything, but the titles found with these criterias were:

* Talent-Swallowing Magician
* Return of the Disaster-Class Hero
* The Extra’s Academy Survival Guide
* Dragon-Devouring Mage
* Regressor Instruction Manual

In [125]:
load_dotenv()
url = os.environ.get('URL')
engine = create_engine(url=url)

popular = ''' 
SELECT title, total_views, chapter_number
FROM series
WHERE total_views > 50 AND chapter_number::integer < 120
ORDER BY total_views DESC
LIMIT 5;
'''

top_df = pd.read_sql_query(popular, engine)
top_df

Unnamed: 0,title,total_views,chapter_number


In [102]:
top_5_mangas = ['Talent-Swallowing Magician',
                'Return of the Disaster-Class Hero',
                'The Extra’s Academy Survival Guide',
                'Dragon-Devouring Mage',
                'Regressor Instruction Manual']

## MANGA NO.1

In [126]:
# top = top_df['title'][0]

first = f'''
SELECT id, title, chapter, chapters
FROM chapters 
WHERE title = '{top_5_mangas[0]}';'''

mp1_df = pd.read_sql_query(first, engine)
mp1_df.head()

Unnamed: 0,id,title,chapter,chapters
0,11f2e10a-1b2d-4d72-ad56-0c5132df0844,Talent-Swallowing Magician,chapter-50,[https://cnn.kadkomi.com/komic/asura-scans/tal...
1,02583918-d601-41cc-8501-aea97d1cef0d,Talent-Swallowing Magician,chapter-38,[https://cnn.kadkomi.com/komic/asura-scans/tal...
2,28d6afe0-7702-4f6c-9356-f5d996823efc,Talent-Swallowing Magician,chapter-60,[https://cnn.kadkomi.com/komic/asura-scans/tal...
3,077a2a4c-29f6-474f-bb4a-5c683864e12e,Talent-Swallowing Magician,chapter-62,[https://cnn.kadkomi.com/komic/asura-scans/tal...
4,6ce3998f-b9cf-4ace-a1a8-5204d1d0ffc9,Talent-Swallowing Magician,chapter-63,[https://cnn.kadkomi.com/komic/asura-scans/tal...


In [127]:
mp1_df = sort_chapters(mp1_df)
mp1_df.head()

Unnamed: 0,id,title,chapter,chapters
19,a4948ddb-da17-463b-a0ce-ddca95205e39,Talent-Swallowing Magician,0.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...
37,be6406c0-1aa6-4385-9bde-b9886e91ff90,Talent-Swallowing Magician,1.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...
38,00f61f40-f463-4b8e-8376-b4d761fac0bd,Talent-Swallowing Magician,2.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...
35,e681d7bf-8abd-4d42-b80d-0e84a9e2793c,Talent-Swallowing Magician,3.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...
73,0ad93648-bce3-45bf-8925-6995ba65b51b,Talent-Swallowing Magician,4.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...


In [129]:
text_1 = extract_and_format_text(mp1_df)

100%|██████████| 101/101 [50:06<00:00, 29.77s/it]
101it [00:00, 39315.52it/s]
101it [00:00, 523639.93it/s]
101it [00:00, 275.98it/s]


In [130]:
clean_text_1 = polish_text(text_1)

100%|██████████| 101/101 [10:25<00:00,  6.20s/it]
101it [00:00, 668177.77it/s]


In [131]:
mp1_df = insert_text(clean_text_1, mp1_df)
mp1_df.head()

Unnamed: 0,id,title,chapter,chapters,text
19,a4948ddb-da17-463b-a0ce-ddca95205e39,Talent-Swallowing Magician,0.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [THE MASSIVE CO..."
37,be6406c0-1aa6-4385-9bde-b9886e91ff90,Talent-Swallowing Magician,1.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [THE DISEASE TH..."
38,00f61f40-f463-4b8e-8376-b4d761fac0bd,Talent-Swallowing Magician,2.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [iw Npe a rp ce..."
35,e681d7bf-8abd-4d42-b80d-0e84a9e2793c,Talent-Swallowing Magician,3.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[AREN’T YOU ANGRY?!, MY EAR IS ABOUT TO FALL ..."
73,0ad93648-bce3-45bf-8925-6995ba65b51b,Talent-Swallowing Magician,4.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [], [IT... IT R..."


## MANGA NO.2

In [26]:
top_2 = top_df['title'][1]

second = f'''
SELECT id, title, chapter, chapters
FROM chapters 
WHERE title = '{top_2}';'''

mp2_df = pd.read_sql_query(second, engine)
mp2_df.head()

Unnamed: 0,id,title,chapter,chapters
0,28d160d5-b661-4de5-9619-57fef521f4f2,Return of the Disaster-Class Hero,chapter-82,[https://cnn.kadkomi.com/komic/asura-scans/ret...
1,031202ef-4e93-45f5-897b-9b1e08dc195e,Return of the Disaster-Class Hero,chapter-97,[https://cnn.kadkomi.com/komic/asura-scans/ret...
2,643025c7-228b-4a30-a2e8-7fd82fe770c4,Return of the Disaster-Class Hero,chapter-98,[https://cnn.kadkomi.com/komic/asura-scans/ret...
3,baecff82-c78c-470c-a1b2-c7b0cf2c2bde,Return of the Disaster-Class Hero,chapter-48-5,[https://cnn.kadkomi.com/komic/asura-scans/ret...
4,394c3099-f11e-403b-8b13-ccbb0a32abf9,Return of the Disaster-Class Hero,chapter-99,[https://cnn.kadkomi.com/komic/asura-scans/ret...


In [27]:
mp2_df = sort_chapters(mp2_df)
mp2_df.head()

Unnamed: 0,id,title,chapter,chapters
52,b0b6a9a7-2935-4a56-aba2-fc6686d1db1c,Return of the Disaster-Class Hero,0.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...
100,323caf73-31d1-4298-afbc-1c9e0356a56c,Return of the Disaster-Class Hero,1.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...
101,25ef45e3-32ea-4c9b-80b1-0dea4dee9b75,Return of the Disaster-Class Hero,2.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...
5,a87db559-e89a-4924-ad31-dfe083b482bc,Return of the Disaster-Class Hero,3.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...
70,43282d73-7909-4703-8e65-6a307b5bf65e,Return of the Disaster-Class Hero,4.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...


In [33]:
text_2 = extract_and_format_text(mp2_df)

100%|██████████| 103/103 [1:04:24<00:00, 37.52s/it]
103it [00:00, 35237.63it/s]
103it [00:00, 470090.66it/s]
103it [00:00, 243.66it/s]


In [36]:
clean_text_2 = polish_text(text_2)

100%|██████████| 103/103 [11:45<00:00,  6.85s/it]
103it [00:00, 499437.35it/s]


In [37]:
mp2_df = insert_text(clean_text_2, mp2_df)
mp2_df.head()

Unnamed: 0,id,title,chapter,chapters,text
52,b0b6a9a7-2935-4a56-aba2-fc6686d1db1c,Return of the Disaster-Class Hero,0.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...,"[[ RETURN OF THE CHAPTER DISASTER-CLASS HERO, ..."
100,323caf73-31d1-4298-afbc-1c9e0356a56c,Return of the Disaster-Class Hero,1.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...,"[[ RETURN OF THE CHAPTER DISASTER-CLASS HERO],..."
101,25ef45e3-32ea-4c9b-80b1-0dea4dee9b75,Return of the Disaster-Class Hero,2.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...,"[[ RETURN OF THE CHAPTER DISASTER-CLASS HERO, ..."
5,a87db559-e89a-4924-ad31-dfe083b482bc,Return of the Disaster-Class Hero,3.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...,"[[a RETURN OF THE CHAPTER — CLASS HERO, TYPESE..."
70,43282d73-7909-4703-8e65-6a307b5bf65e,Return of the Disaster-Class Hero,4.0,[https://cnn.kadkomi.com/komic/asura-scans/ret...,"[[ RETURN OF THE CHAPTER DISASTER-CLASS HERO, ..."


## MANGA NO.3

In [40]:
top_3 = top_df['title'][2]

third = f'''
SELECT id, title, chapter, chapters
FROM chapters 
WHERE title = '{top_3}';'''

mp3_df = pd.read_sql_query(third, engine)
mp3_df.head()

Unnamed: 0,id,title,chapter,chapters
0,00692eb1-809e-4d9b-99d6-c83010049856,The Extra’s Academy Survival Guide,chapter-48,[https://cnn.kadkomi.com/komic/asura-scans/the...
1,921d17e7-bc15-4fef-a979-cd270d3ba013,The Extra’s Academy Survival Guide,chapter-50,[https://cnn.kadkomi.com/komic/asura-scans/the...
2,6ae368d9-3a92-4211-9de5-65041a8ed730,The Extra’s Academy Survival Guide,chapter-49,[https://cnn.kadkomi.com/komic/asura-scans/the...
3,351f305a-826a-4bf2-ba6b-be2199d476ac,The Extra’s Academy Survival Guide,chapter-14,[https://cnn.kadkomi.com/komic/asura-scans/the...
4,b551dd5f-61b0-4056-a149-d12d58a347b3,The Extra’s Academy Survival Guide,chapter-9,[https://cnn.kadkomi.com/komic/asura-scans/the...


In [66]:
mp3_df = sort_chapters(mp3_df)
mp3_df.head()

Unnamed: 0,id,title,chapter,chapters
24,c4f58194-f392-4015-941b-ae0e70974bc6,The Extra’s Academy Survival Guide,0.0,[https://cnn.kadkomi.com/komic/asura-scans/the...
14,3bababfc-3be0-487e-82cc-b5a22ded87cf,The Extra’s Academy Survival Guide,1.0,[https://cnn.kadkomi.com/komic/asura-scans/the...
48,c5d994d7-3ade-455c-adca-592170832e2f,The Extra’s Academy Survival Guide,2.0,[https://cnn.kadkomi.com/komic/asura-scans/the...
22,8e096bb0-f762-4af0-9aa8-728836cc3f25,The Extra’s Academy Survival Guide,3.0,[https://cnn.kadkomi.com/komic/asura-scans/the...
31,a7542b6b-6ef4-4747-9f94-97c5caadc38d,The Extra’s Academy Survival Guide,4.0,[https://cnn.kadkomi.com/komic/asura-scans/the...


In [67]:
text_3 = extract_and_format_text(mp3_df)

100%|██████████| 51/51 [41:26<00:00, 48.76s/it]
51it [00:00, 22535.77it/s]
51it [00:00, 383350.37it/s]
51it [00:00, 131.35it/s]


In [68]:
clean_text_3 = polish_text(text_3)

100%|██████████| 51/51 [09:27<00:00, 11.13s/it]
51it [00:00, 486157.96it/s]


In [69]:
mp3_df = insert_text(clean_text_3, mp3_df)
mp3_df.head()

Unnamed: 0,id,title,chapter,chapters,text
24,c4f58194-f392-4015-941b-ae0e70974bc6,The Extra’s Academy Survival Guide,0.0,[https://cnn.kadkomi.com/komic/asura-scans/the...,"[[. THE EXTRA'S ACADEMY - SURVIVAL GUIDE, Se ..."
14,3bababfc-3be0-487e-82cc-b5a22ded87cf,The Extra’s Academy Survival Guide,1.0,[https://cnn.kadkomi.com/komic/asura-scans/the...,"[[THE EXTRA'S ACADEMY SURVIVAL GUIDE], [THIS R..."
48,c5d994d7-3ade-455c-adca-592170832e2f,The Extra’s Academy Survival Guide,2.0,[https://cnn.kadkomi.com/komic/asura-scans/the...,"[[THE EXTRA'S ACADEMY SURVIVAL GUIDE], [TAKE O..."
22,8e096bb0-f762-4af0-9aa8-728836cc3f25,The Extra’s Academy Survival Guide,3.0,[https://cnn.kadkomi.com/komic/asura-scans/the...,"[[HE EXTRA'S ACADEMY MSSURVIVAL guint, ar Koa ..."
31,a7542b6b-6ef4-4747-9f94-97c5caadc38d,The Extra’s Academy Survival Guide,4.0,[https://cnn.kadkomi.com/komic/asura-scans/the...,"[[THE EXTRA'S ACADEMY SURVIVAL GUIDE], [STAAAA..."


## MANGA NO.4

In [41]:
top_4 = top_df['title'][3]

forth = f'''
SELECT id, title, chapter, chapters
FROM chapters 
WHERE title = '{top_4}';'''

mp4_df = pd.read_sql_query(forth, engine)
mp4_df.head()

Unnamed: 0,id,title,chapter,chapters
0,1d17e826-21a8-4aca-a756-2ae6ffae7ac5,Dragon-Devouring Mage,chapter-78,[https://cnn.kadkomi.com/komic/asura-scans/dra...
1,9e522532-e8b9-4b6c-af8d-65da26adf139,Dragon-Devouring Mage,chapter-79,[https://cnn.kadkomi.com/komic/asura-scans/dra...
2,dbb54ee5-9d3b-437a-a8c5-d9f5d3dab9a6,Dragon-Devouring Mage,chapter-80,[https://cnn.kadkomi.com/komic/asura-scans/dra...
3,022c7b33-a341-4368-a804-0f61a2ddb632,Dragon-Devouring Mage,chapter-39,[https://cnn.kadkomi.com/komic/asura-scans/dra...
4,47a7d1a3-ba34-4098-82fe-82c9d1ed5ae9,Dragon-Devouring Mage,chapter-41,[https://cnn.kadkomi.com/komic/asura-scans/dra...


In [70]:
mp4_df = sort_chapters(mp4_df)
mp4_df.head()

Unnamed: 0,id,title,chapter,chapters
73,c070ad1d-1c6e-4970-8998-1fb79fdfb66b,Dragon-Devouring Mage,1.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...
36,4f3c6f8b-a7fb-4f51-b5c7-6e1c3e2de203,Dragon-Devouring Mage,2.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...
34,b3173e5f-5e42-4c2c-a153-db716a706493,Dragon-Devouring Mage,3.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...
75,62052ae8-6a89-49f0-9a0d-aadd7ed3fc14,Dragon-Devouring Mage,4.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...
32,df8647f7-ab95-4f30-9476-0691791e732f,Dragon-Devouring Mage,5.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...


In [71]:
text_4 = extract_and_format_text(mp4_df)

100%|██████████| 80/80 [44:29<00:00, 33.37s/it]
80it [00:00, 33672.28it/s]
80it [00:00, 601333.91it/s]
80it [00:00, 309.12it/s]


In [72]:
clean_text_4 = polish_text(text_4)

100%|██████████| 80/80 [10:10<00:00,  7.63s/it]
80it [00:00, 690420.41it/s]


In [73]:
mp4_df = insert_text(clean_text_4, mp4_df)
mp4_df.head()

Unnamed: 0,id,title,chapter,chapters,text
73,c070ad1d-1c6e-4970-8998-1fb79fdfb66b,Dragon-Devouring Mage,1.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...,"[[ Nr. am BS A DRAGON'S -_ HEART,, “AND A THA..."
36,4f3c6f8b-a7fb-4f51-b5c7-6e1c3e2de203,Dragon-Devouring Mage,2.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...,"[[LOWEST-GRADE MAGIC STONE CONSUMABLE, Ku WA ..."
34,b3173e5f-5e42-4c2c-a153-db716a706493,Dragon-Devouring Mage,3.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...,"[[DRAGON DEVOURING MAGE Oy], [BUT IN MY CURR..."
75,62052ae8-6a89-49f0-9a0d-aadd7ed3fc14,Dragon-Devouring Mage,4.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...,[[I CAN'T BELIEVE YOU HAVE COMPLETED ALL THOSE...
32,df8647f7-ab95-4f30-9476-0691791e732f,Dragon-Devouring Mage,5.0,[https://cnn.kadkomi.com/komic/asura-scans/dra...,"[[F DRAGON DEVOURING MAGE Tein . ay, GRAVITY. ..."


## MANGA NO.5

In [107]:
# top_5 = top_df['title'][4]

fifth = f'''
SELECT id, title, chapter, chapters
FROM chapters 
WHERE title = '{top_5_mangas[-1]}';'''

mp5_df = pd.read_sql_query(fifth, engine)
mp5_df.head()

Unnamed: 0,id,title,chapter,chapters
0,cf05cd04-b62b-4f67-b385-1de31fa39c6b,Regressor Instruction Manual,chapter-41,[https://cnn.kadkomi.com/komic/asura-scans/reg...
1,8a8ef873-35c5-40a6-8a15-7fdb841658ea,Regressor Instruction Manual,chapter-90,[https://cnn.kadkomi.com/komic/asura-scans/reg...
2,14fb9b1f-9693-434c-b844-ceaf6364c483,Regressor Instruction Manual,chapter-50-5,[https://cnn.kadkomi.com/komic/asura-scans/reg...
3,1e549fac-decc-4ef1-a244-71fc7646e9c6,Regressor Instruction Manual,chapter-62-2,[https://cnn.kadkomi.com/komic/asura-scans/reg...
4,773423e4-aef4-4930-8c9c-8f457b5788a2,Regressor Instruction Manual,chapter-50,[https://cnn.kadkomi.com/komic/asura-scans/reg...


In [108]:
mp5_df = sort_chapters(mp5_df)
mp5_df.head()

Unnamed: 0,id,title,chapter,chapters
43,8a19e9b1-1b65-4a27-8644-ba546b6e25d8,Regressor Instruction Manual,1.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...
89,ac5409cf-8221-42a8-95d8-595a31fe48a0,Regressor Instruction Manual,2.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...
42,b2dd020e-6db9-47a7-969d-c5e3f68c83ac,Regressor Instruction Manual,3.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...
88,3e98426e-e711-4609-b76f-e9791312d3fd,Regressor Instruction Manual,4.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...
87,6eb1a343-c686-4063-941d-d3bda2437d5b,Regressor Instruction Manual,5.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...


In [121]:
text_5 = extract_and_format_text(mp5_df)

100%|██████████| 118/118 [1:30:22<00:00, 45.96s/it]
118it [00:00, 23100.48it/s]
118it [00:00, 454062.27it/s]
118it [00:00, 176.06it/s]


In [122]:
clean_text_5 = polish_text(text_5)

100%|██████████| 118/118 [22:47<00:00, 11.59s/it]
118it [00:00, 502975.48it/s]


In [123]:
mp5_df = insert_text(clean_text_5, mp5_df)
mp5_df.head()

Unnamed: 0,id,title,chapter,chapters,text
43,8a19e9b1-1b65-4a27-8644-ba546b6e25d8,Regressor Instruction Manual,1.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...,[[DIDN’T ALL OF YOU SEE THE STATUS WINDOW IN F...
89,ac5409cf-8221-42a8-95d8-595a31fe48a0,Regressor Instruction Manual,2.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...,"[[peer, "" MIDNIGHT STUDIO ay En, aA Ba, h, I..."
42,b2dd020e-6db9-47a7-969d-c5e3f68c83ac,Regressor Instruction Manual,3.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...,"[[CHAPTER, TYPESETTER], [gia MIDNIGHT STUDIO A..."
88,3e98426e-e711-4609-b76f-e9791312d3fd,Regressor Instruction Manual,4.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...,"[[treat], [Keto, sh no Cane, ahs MIDNIGHT STUD..."
87,6eb1a343-c686-4063-941d-d3bda2437d5b,Regressor Instruction Manual,5.0,[https://cnn.kadkomi.com/komic/asura-scans/reg...,"[[CHAPTER], [ahs MIDNIGHT STUDIO A sen, at Bea..."


# GET OPENAI SUMMARY

In [132]:
summary_1 = get_summary(clean_text_1)
summary_2 = get_summary(clean_text_2)
summary_3 = get_summary(clean_text_3)
summary_4 = get_summary(clean_text_4)
summary_5 = get_summary(clean_text_5)

100%|██████████| 3/3 [00:20<00:00,  6.84s/it]
100%|██████████| 3/3 [00:32<00:00, 10.89s/it]
100%|██████████| 3/3 [00:35<00:00, 11.77s/it]
100%|██████████| 4/4 [00:45<00:00, 11.37s/it]


# CREATE DATAFRAME WITH ENTIRE MANGA SUMMARIES

## Creating the datframe

In [133]:
data = {'Title': [name for name in top_5_mangas],
        'Summary': [summary_1, summary_2, summary_3, summary_4, summary_5],
        'Tot_chapters': [len(mp1_df['chapter']), len(mp2_df['chapter']), len(mp3_df['chapter']), len(mp4_df['chapter']), len(mp5_df['chapter'])],
        'Length_summary': [len(summary_1), len(summary_2), len(summary_3), len(summary_4), len(summary_5)]}

In [134]:
summary_df = pd.DataFrame(data)
summary_df

Unnamed: 0,Title,Summary,Tot_chapters,Length_summary
0,Talent-Swallowing Magician,"In *Talent-Swallowing Magician*, Elric Melving...",101,2241
1,Return of the Disaster-Class Hero,**Overall Summary of Return of the Disaster-Cl...,103,1934
2,The Extra’s Academy Survival Guide,*The Extra's Academy - Survival Guide* follows...,51,1920
3,Dragon-Devouring Mage,**Summary of Dragon Devouring Mage:**The story...,80,2454
4,Regressor Instruction Manual,"In the manga, the story follows a group of cha...",118,2122


## Saving the dataframe

In [135]:
summary_df.to_csv('summaries-top5.csv', index=False)

# CREATING DATAFRAME WITH EACH CHAPTER SUMMARY

## Creating the dataframes

In [136]:
data_1 = {'ID': [id for id in mp1_df['id']],
            'Title': [title for title in mp1_df['title']],
            'Chapter': [chap for chap in mp1_df['chapter']],
            'Chapters_webp': [chaps for chaps in mp1_df['chapters']],
            'Text': [txt for txt in mp1_df['text']],
            'Summary': get_chapter_summary(clean_text_1)}

100%|██████████| 101/101 [06:42<00:00,  3.98s/it]


In [137]:
data_2 = {'ID': [id for id in mp2_df['id']],
            'Title': [title for title in mp2_df['title']],
            'Chapter': [chap for chap in mp2_df['chapter']],
            'Chapters_webp': [chaps for chaps in mp2_df['chapters']],
            'Text': [txt for txt in mp2_df['text']],
            'Summary': get_chapter_summary(clean_text_2)}

100%|██████████| 103/103 [07:57<00:00,  4.64s/it]


In [138]:
data_3 = {'ID': [id for id in mp3_df['id']],
            'Title': [title for title in mp3_df['title']],
            'Chapter': [chap for chap in mp3_df['chapter']],
            'Chapters_webp': [chaps for chaps in mp3_df['chapters']],
            'Text': [txt for txt in mp3_df['text']],
            'Summary': get_chapter_summary(clean_text_3)}

100%|██████████| 51/51 [04:48<00:00,  5.67s/it]


In [139]:
data_4 = {'ID': [id for id in mp4_df['id']],
            'Title': [title for title in mp4_df['title']],
            'Chapter': [chap for chap in mp4_df['chapter']],
            'Chapters_webp': [chaps for chaps in mp4_df['chapters']],
            'Text': [txt for txt in mp4_df['text']],
            'Summary': get_chapter_summary(clean_text_4)}

100%|██████████| 80/80 [05:45<00:00,  4.32s/it]


In [140]:
data_5 = {'ID': [id for id in mp5_df['id']],
            'Title': [title for title in mp5_df['title']],
            'Chapter': [chap for chap in mp5_df['chapter']],
            'Chapters_webp': [chaps for chaps in mp5_df['chapters']],
            'Text': [txt for txt in mp5_df['text']],
            'Summary': get_chapter_summary(clean_text_5)}

100%|██████████| 118/118 [10:36<00:00,  5.40s/it]


In [141]:
summary_df_1 = pd.DataFrame(data_1)
summary_df_2 = pd.DataFrame(data_2)
summary_df_3 = pd.DataFrame(data_3)
summary_df_4 = pd.DataFrame(data_4)
summary_df_5 = pd.DataFrame(data_5)

In [142]:
summary_df_1.head()

Unnamed: 0,ID,Title,Chapter,Chapters_webp,Text,Summary
0,a4948ddb-da17-463b-a0ce-ddca95205e39,Talent-Swallowing Magician,0.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [THE MASSIVE CO...","In this chapter of Talent-Swallowing Magician,..."
1,be6406c0-1aa6-4385-9bde-b9886e91ff90,Talent-Swallowing Magician,1.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [THE DISEASE TH...","In this chapter, we delve into the struggles o..."
2,00f61f40-f463-4b8e-8376-b4d761fac0bd,Talent-Swallowing Magician,2.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [iw Npe a rp ce...","In this chapter of Talent-Swallowing Magician,..."
3,e681d7bf-8abd-4d42-b80d-0e84a9e2793c,Talent-Swallowing Magician,3.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[AREN’T YOU ANGRY?!, MY EAR IS ABOUT TO FALL ...","In this chapter, Elric Melvinger finds himself..."
4,0ad93648-bce3-45bf-8925-6995ba65b51b,Talent-Swallowing Magician,4.0,[https://cnn.kadkomi.com/komic/asura-scans/tal...,"[[TALENT-SWALLOWING MAGICIAN], [], [IT... IT R...","In this chapter, the characters express a mix ..."


## Saving the dataframes

In [143]:
summary_df_1.to_csv(f'chapter_summaries/{summary_df_1['Title'][0]}-cs.csv', index=False)
summary_df_2.to_csv(f'chapter_summaries/{summary_df_2['Title'][0]}-cs.csv', index=False)
summary_df_3.to_csv(f'chapter_summaries/{summary_df_3['Title'][0]}-cs.csv', index=False)
summary_df_4.to_csv(f'chapter_summaries/{summary_df_4['Title'][0]}-cs.csv', index=False)
summary_df_5.to_csv(f'chapter_summaries/{summary_df_5['Title'][0]}-cs.csv', index=False)