In [None]:
# !pip install youtube-transcript-api
# !pip install transformers torch
# !pip install langcodes langdetect
# !pip install arabert
# !pip install --upgrade transformers
# !pip install git+https://github.com/huggingface/transformers.git

In [5]:
import regex as re #help in extracting video id pattern from url
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled #handling transcript fetching cases
from urllib.parse import urlparse, parse_qs, unquote #dealing with url

pattern = re.compile(r"(?:v=|/)([0-9A-Za-z_-]{11})(?:[&?\/]|$)")
"""
    (?:v=|/) and (?:[&?\/]|$ --> non-capturing groups
    ([0-9A-Za-z_-]{11}) --> capturing group
"""

def extract_id(url : str) -> str:
    parsed = urlparse(url) #break url into (scheme, netloc, path, params, query, fragment)
    qs = parse_qs(parsed.query) #get url id (help more with attribution_link)

    #check if it`s an attribution link and deal with it if
    if parsed.path.startswith("/attribution_link") and "u" in qs: 
        nested_url = unquote(qs["u"][0]) #unquote -> decode encoded special chars
        
        if nested_url.startswith("/"):
            re_url = "https://www.youtube.com" + nested_url
        return extract_id(re_url)

    #dealing with normal url patterns
    video_id = pattern.search(url) #search pattern in url
    if not video_id:
        raise ValueError(f"Can not extract id form: {url}")
    return video_id.group(1) #group(1) -> return first capturing group

In [6]:
def get_transcript(video_id : str, preferred_languages = ("ar", "en")) -> str:
    try:
        api = YouTubeTranscriptApi()
        transcript_list = api.list(video_id) #return all avalibale transcripts

        for language in preferred_languages: #loop over preferred languages
            try:
                transcript = transcript_list.find_transcript([language]) 
                fetched = transcript.fetch()
                return "\n".join(snippet.text for snippet in fetched if snippet.text.strip())
            except NoTranscriptFound: #if no transcript with this language
                continue

        #if no transcript with preferred languages is available, take the transcript with the original/first language
        transcript = next(iter(transcript_list)) 
        fetched = transcript.fetch()
        return "\n".join(part.text for part in fetched if part.text.strip())

    except TranscriptsDisabled: #if the video does not have a transcript
        return "NO Transcript at this video"

    except Exception as e:
        return f"ERROR: {e}"

In [7]:
stop = re.compile(r"[\p{P}\p{Zs}\n\t]") # help in finding the split position
# def get_chunks(text: str, chunk_size = 700, overlap = 50) -> list[str]:

def get_chunks(text: str, chunk_size = 500) -> list[str]:
    chunks = []

    while len(text) > chunk_size: #loop over transcript untill it ends
        match = list(stop.finditer(text[:chunk_size])) #create list of all "stop" characters in selected part

        if match:
            split_at = match[-1].end() #split position after last match
        else:
            split_at = chunk_size

        #overlap -> to make sure that every part will be understandable
        # chunks.append(text[:split_at + overlap].strip())
        # text = text[split_at - overlap:].lstrip()

        chunks.append(text[:split_at].strip())
        text = text[split_at:].lstrip()

    if text:
        chunks.append(text.strip())
    return chunks    

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langdetect import detect
import langcodes

trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [9]:
def get_lang_code(text : str) -> str:
    lang_ios_code = detect(text) #return the language ISO code
    language = langcodes.Language.get(lang_ios_code) #create a language object from code to get language script
    script = language.maximize().script

    code = f"{lang_ios_code}_{script}"
    return code

In [10]:
def translate(text, src_lang, tgt_lang):
    translator = pipeline(
        "translation",
        model = trans_model,
        tokenizer = trans_tokenizer,
        src_lang = src_lang,
        tgt_lang = tgt_lang,
        device = 0
    )

    result = translator(text,  max_length = 400)
    return result[0]["translation_text"]

In [25]:
import regex as re

def clean_text(text: str) -> str:
    #delete anything but numbers, characters, and some symbolsز
    text = re.sub(r"[^\p{L}\p{N}\s.!?,:;]", " ", text)
    
    #remove repeted spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    #remove duplicate words and phrases, both adjacent and non-adjacent.
    #remove adjacent duplication (quickly).
    for phrase_length in range(20, 0, -1):
        if phrase_length == 1:
            text = re.sub(r'\b(\w+)(\s+\1\b)+', r'\1', text)
        else:
            pattern = r'\b((?:\w+\s+){' + str(phrase_length-1) + r'}\w+)(\s+\1\b)+'
            text = re.sub(pattern, r'\1', text)
    
    #remove non-contiguous repetition of common phrases.
    def remove_non_adjacent_duplicates(text):
        words = text.split()
        if len(words) < 4:
            return text
        
        phrase_counts = {} #search for repeated phrases (2-6 words)
        
        #count the phrases
        for length in range(2, 7):  # 2 to 6 words
            for i in range(len(words) - length + 1):
                phrase = tuple(words[i:i+length])
                if phrase not in phrase_counts:
                    phrase_counts[phrase] = []
                phrase_counts[phrase].append(i)
        
        #find phrases that are repeated more than once
        duplicates_to_remove = []
        for phrase, positions in phrase_counts.items():
            if len(positions) > 1 and len(phrase) >= 2:
                #keep the first position and delete the rest
                duplicates_to_remove.extend(positions[1:])
        
        #arrange the positions for deletion from last to first
        duplicates_to_remove.sort(reverse=True)
        
        #delete duplicate phrases
        result_words = words.copy()
        for pos in duplicates_to_remove:
            #find the length of the phrase in this position
            for phrase, positions in phrase_counts.items():
                if pos in positions[1:]:  #if the position is duplicated
                    phrase_len = len(phrase)
                    #remove phrase
                    for _ in range(phrase_len):
                        if pos < len(result_words):
                            result_words.pop(pos)
                    break
        
        return ' '.join(result_words)
    
    text = remove_non_adjacent_duplicates(text)
    
    #clean spaces again after removing duplicates.
    text = re.sub(r"\s+", " ", text).strip()
    
    #remove duplicate sentences
    #divide by different punctuation marks
    sentences = re.split(r'[.!?\n]+', text)
    seen = set()
    unique_sentences = []
    
    for sentence in sentences:
        sentence = sentence.strip()
        #clean the sentence from extra spaces and marks at the beginning and end
        sentence = re.sub(r'^[^\w]+|[^\w]+$', '', sentence).strip()
        
        if sentence and len(sentence) > 2:  #ignore very short sentences.
            #compare sentences after converting them to lowercase to avoid duplicate sensitivity to letters.
            sentence_lower = sentence.lower()
            if sentence_lower not in seen:
                seen.add(sentence_lower)
                unique_sentences.append(sentence)
    
    #connect sentences with appropriate punctuation.
    if unique_sentences:
        result = ". ".join(unique_sentences)
        #add a dot at the end if it doesn't exist.
        if not result.endswith(('.', '!', '?')):
            result += "."
        return result
    else:
        return text

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [None]:
# if __name__ == "__main__": #if uncomment this line, handel indentation

# url = input("Enter a URL: ")
url = 'https://www.youtube.com/watch?v=ZZF7g6aj0Zc'
ex_id = extract_id(url)
text = get_transcript(ex_id)
chunks = get_chunks(text)

original_language = get_lang_code(chunks[0])
translated_chunks  = [translate(chunk, original_language, "eng_Latn") for chunk in chunks]

summary = [summarizer(chunk, max_length = 130)[0]['summary_text'] for chunk in translated_chunks]

combined_summary = " ".join(summary)
cleaned_final = clean_text(combined_summary)

print(cleaned_final)
print("_________________________________________________")


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Your max_length is set to 130, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)
Your max_length is set to 130, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 130, but your input_length is only 85. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Y

No, I'm not ready because I don't know what you're saying - What are we going to talk about? - Yes - Yes - Good. It's OK. Victoria, how are you? - Good. Nothing new. - Nothing new? - And you? - Me? I'm doing well because now we're recording a new video and this is a new format. - Yes - We've never recorded such a video. so I think it will be interesting and useful for the students.
_________________________________________________
How long have you been studying Russian? - How long have you been studying Russian? - I think five years. Summer? - Years - Years - Five years? - Yes. No. four. - Four years? - I don't know. Four years. - Four or five. - Yes Yes Yes. I forgot. - What do you... How do you learn? - It's okay, but. How do you do it? How do you learn Russian? What do you do when you learn Russian?
_________________________________________________
Usually watch BeFluent on YouTube. because I have you at home. with me. at home, yes. at home with me. you have me you can talk to me a