# Arabic

In [8]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Arabic stopwords from nltk
stop_words = set(stopwords.words('arabic'))

def extract_keywords_with_counts(text, ngram_range=1):
    # Normalize Arabic text (optional: can add stemming/lemmatization here)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = word_tokenize(text)
    
    # Remove stopwords and non-Arabic tokens
    filtered_words = [word for word in words if word not in stop_words and re.match(r'^[\u0600-\u06FF]+$', word)]
    
    # Generate n-grams if ngram_range > 1
    if ngram_range == 1:
        ngram_words = [' '.join(ng) for ng in ngrams(filtered_words, ngram_range)]
        keywords = Counter(ngram_words).most_common(20)  # Top 10 ngrams
    else:
        keywords = Counter(filtered_words).most_common(20)  # Top 10 words
    
    # Format output: keyword and count
    return [(keyword, count) for keyword, count in keywords]

# Example usage
arabic_text = """
Unfold: محرر الفيديو والصو‪ر‬
قوالب صور وفيديو Insta
AI لتعديل الفيديو ب Videoleap
محرر مقاطع فيديو بالفلات‪ر‬
محرّر الفيديو VideoShow
اصنع فيديو بالموسيقى والصو‪ر‬
InStories: قصص للإنستغرا‪م‬
قوالب ملصقة للفيدي‪و‬
CutStory － محرّر الفيدي‪و‬
موسيقى انستقرام صانع الفيدي‪و‬
Video Up! برنامج تصميم فيدي‪و‬
تصميم فيديو & زخرفة النصو‪ص‬
قوالب ل تصميم الفيدي‪و‬
قوالب فيديو لبكرة IG الخاصة ب‪ك‬
AI لتعديل الفيديوانستا قص‪ة‬
محرر الفيديو و القوالب :Muse
قوالب فيديو للبكرات و القص‪ص‬
"""
keywords_with_counts = extract_keywords_with_counts(arabic_text, ngram_range=2)

# Print output in a clean format
for keyword, count in keywords_with_counts:
    print(f"{keyword}: {count}")


الفيديو: 7
فيديو: 6
محرر: 5
قوالب: 5
تصميم: 3
والصور: 2
لتعديل: 2
صور: 1
وفيديو: 1
مقاطع: 1
بالفلاتر: 1
اصنع: 1
بالموسيقى: 1
قصص: 1
للإنستغرام: 1
ملصقة: 1
للفيديو: 1
موسيقى: 1
انستقرام: 1
صانع: 1


# English

In [6]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# English stopwords from nltk
stop_words = set(stopwords.words('english'))

def extract_keywords_with_counts(text, ngram_range=1):
    # Normalize English text
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    
    # Remove stopwords and non-alphabetic tokens
    filtered_words = [word for word in words if word not in stop_words and word.isalpha()]
    
    # Generate n-grams if ngram_range > 1
    if ngram_range == 2:
        ngram_words = [' '.join(ng) for ng in ngrams(filtered_words, ngram_range)]
        keywords = Counter(ngram_words).most_common(20)  # Top 10 ngrams
    else:
        keywords = Counter(filtered_words).most_common(20)  # Top 10 words
    
    # Format output: keyword and count
    return [(keyword, count) for keyword, count in keywords]

# Example usage
english_text = """
Splice - Video Editor & Maker
Editing movie and music videos
Unfold: Video & Photo Editor
Combine videos & add filters
InStories Reels & Story Maker
CutStory－Video & Story Editor
Reel Templates with Music
Lift: Video Reel & Story Maker
Reel Templates for Instagram
Storybeat: Reels & Story Maker
Templates, Filters, Music
Reel Creator・Make Video Clips
Vids AI - Reels Video Editor
Trending Filters & Effects
VEED - Captions for videos
Auto subtitles & translate
Video Editor : Video Maker
video crop & edit video
VITA - Video Editor & Maker
Reelsapp: reel & video editor
Reelsy Reel Maker IG Templates
Storyluxe: Templates & Filters
Vixer: Captions & Video Editor
Reels Templates & Maker
ReelTrends Templates & Sounds
Track, Analyze Social Trends
Muse: Reel Video & Story Maker
Editor Stories & Reels Trends
SlideShow Maker Photo Video ‪·‬
Video Movie Edit.or with music
Reels Templates Maker - Temply
Reel Maker
Template Planner for Instagram
Reel Templates - Recap
Make Viral Videos in seconds
Shox: Reel Maker, Video Editor
Unreels: Reel Video Editor
ReelMaker - Video templates
Stories Editor. Fonts & Music
unavailable
Reel Maker & AI Templates
Reel Maker & Templates ・ Flix
Reel Maker & Templates
Reels Maker for instagram ‪・‬
Reels Templates for Insta
Templify: Reels & Story Maker
Reel Maker - Templates for IG
Reels Templates & Reel Maker
Reel Maker - Insta Story Maker
"""
keywords_with_counts = extract_keywords_with_counts(english_text, ngram_range=2)

# Print output in a clean format
for keyword, count in keywords_with_counts:
    print(f"{keyword}: {count}")


reel maker: 9
video editor: 8
story maker: 6
maker templates: 4
reels templates: 4
reels story: 3
reel templates: 3
maker reel: 3
reel video: 3
editor maker: 2
templates filters: 2
maker video: 2
templates maker: 2
templates reel: 2
splice video: 1
maker editing: 1
editing movie: 1
movie music: 1
music videos: 1
videos unfold: 1


# French

In [12]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# French stopwords from nltk
stop_words = set(stopwords.words('french'))

def extract_keywords_with_counts(text, ngram_range=1):
    # Normalize French text
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    
    # Remove stopwords and non-alphabetic tokens
    filtered_words = [word for word in words if word not in stop_words and word.isalpha()]
    
    # Generate n-grams if ngram_range > 1
    if ngram_range == 1:
        ngram_words = [' '.join(ng) for ng in ngrams(filtered_words, ngram_range)]
        keywords = Counter(ngram_words).most_common(20)  # Top 20 ngrams
    else:
        keywords = Counter(filtered_words).most_common(20)  # Top 20 words
    
    # Format output: keyword and count
    return [(keyword, count) for keyword, count in keywords]

# Example usage
french_text = """
Éditeur de vidéo en musique
Éditeur Vidéo avec AI Captions
Unfold : éditeur vidéo/photo
Modèles photos et vidéos Insta
Videoleap: Éditeur Vidéo IA
Logiciel de Montage Vidéo
VideoShow - Éditeur de vidéo
Créez vidéos avec musique
Histoires Animées Avec Musique
Teleprompteur・Auto Sous Titre
CapCut - Éditeur vidéo & photo
Montage video avec musique
Video Up! Montage Vidéo・Photo
Editeur musique・Collage Vidéos
Reel・Vidéo pour Instagram・BEAT
Montage Photo avec Musique
Vids - Editeur Reels & Stories
Modèles Vidéos et Slideshows
Sous-titres et traduction
Storyluxe: Modèles et Collage
Modèles et Filtres de Story
Éditeur de Vidéos - Vixer
Reel, Diaporama, Musique
Modèles pour vidéos Insta
Muse: Montage Reels pour Insta
Editeur Story pour Instagram
Montage video avec des modèles
Temply — Créateur de vidéos
Modèles IA, musique
Reel Modèles Maker - Recap
Reels pour Vidéo Instagram
Éditeur de Stories Insta
Éditeur vidéo et photo
"""

# Extract keywords with counts for bigrams (2-grams)
keywords_with_counts = extract_keywords_with_counts(french_text, ngram_range=1)

# Print output in a clean format
for keyword, count in keywords_with_counts:
    print(f"{keyword}: {count}")


éditeur: 9
vidéo: 8
modèles: 8
musique: 7
vidéos: 7
montage: 6
insta: 4
photo: 3
video: 3
editeur: 3
reels: 3
vidéophoto: 2
ia: 2
stories: 2
story: 2
reel: 2
instagram: 2
captions: 1
unfold: 1
photos: 1


# Vietnamese

In [4]:
import re
from collections import Counter
from pyvi import ViTokenizer

def generate_stopwords(text, top_n=30):
    """
    Generate a list of candidate stopwords based on token frequency.
    :param text: Input Vietnamese text
    :param top_n: Number of most common words to consider as stopwords
    :return: Set of stopwords
    """
    # Tokenize the text
    tokens = ViTokenizer.tokenize(text).split()
    
    # Count token frequencies
    token_counts = Counter(tokens)
    
    # Get the top_n most common words
    most_common = token_counts.most_common(top_n)
    
    # Extract just the words (not counts) to create stopwords
    stopwords = {word for word, count in most_common}
    
    return stopwords

def extract_ngrams(text, stopwords=None, ngram_range=1, top_n=20):
    """
    Extract n-grams from text and return the most common ones.
    :param text: Input Vietnamese text
    :param stopwords: Set of stopwords to exclude
    :param ngram_range: The size of n-grams (1 for unigrams, 2 for bigrams, etc.)
    :param top_n: Number of most common n-grams to return
    :return: List of n-grams with their counts
    """
    # Normalize text by removing special characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = ViTokenizer.tokenize(text).split()  # Tokenize the text
    
    # Remove stopwords if provided
    if stopwords:
        tokens = [word for word in tokens if word not in stopwords]
    
    # Generate n-grams
    ngrams = [' '.join(tokens[i:i+ngram_range]) for i in range(len(tokens) - ngram_range + 1)]
    
    # Count n-gram frequencies
    ngram_counts = Counter(ngrams)
    
    # Return the top_n most common n-grams
    return ngram_counts.most_common(top_n)

# Example Vietnamese text
vietnamese_text = """
InShot - chỉnh sửa video
ghép nhạc vào video
Mojo: Phụ đề & Reels
Trình chỉnh sửa video
VideoShow trình biên tập video
Tạo video với ảnh và âm nhạc
CapCut - Chỉnh sửa video
Ghép nhạc vào video
Video Up! Chỉnh sửa video
biên tập ghép ảnh và nhạc
làm video từ ảnh và nhạc・BEAT
inMelo - Trình chỉnh sửa video
làm video từ ảnh và nhạc
Reel Maker: Tạo Reel Video Hay
Tạo reel với nhiều templates
"""

# Generate stopwords
stopwords = generate_stopwords(vietnamese_text, top_n=30)
print("Generated Stopwords:")
print(stopwords)

# Extract unigrams
unigrams = extract_ngrams(vietnamese_text, stopwords=stopwords, ngram_range=1, top_n=20)
print("\nTop Unigrams:")
for word, count in unigrams:
    print(f"{word}: {count}")

# Extract bigrams
bigrams = extract_ngrams(vietnamese_text, stopwords=stopwords, ngram_range=2, top_n=20)
print("\nTop Bigrams:")
for word, count in bigrams:
    print(f"{word}: {count}")

# Extract trigrams
trigrams = extract_ngrams(vietnamese_text, stopwords=stopwords, ngram_range=3, top_n=20)
print("\nTop Trigrams:")
for word, count in trigrams:
    print(f"{word}: {count}")


Generated Stopwords:
{'Up', 'video', 'trình', '・', 'từ', 'vào', 'Chỉnh_sửa', 'làm', 'InShot', 'Trình', 'Phụ_đề', 'biên_tập', 'chỉnh_sửa', 'nhạc', '!', 'Tạo', 'ảnh', '-', 'và', ':', 'VideoShow', 'ghép', 'với', 'Video', 'Ghép', 'Reels', 'CapCut', 'âm_nhạc', '&', 'Mojo'}

Top Unigrams:
nhạcBEAT: 1
inMelo: 1
Reel: 1
Maker: 1
Reel_Video: 1
Hay: 1
reel: 1
nhiều: 1
templates: 1

Top Bigrams:
nhạcBEAT inMelo: 1
inMelo Reel: 1
Reel Maker: 1
Maker Reel_Video: 1
Reel_Video Hay: 1
Hay reel: 1
reel nhiều: 1
nhiều templates: 1

Top Trigrams:
nhạcBEAT inMelo Reel: 1
inMelo Reel Maker: 1
Reel Maker Reel_Video: 1
Maker Reel_Video Hay: 1
Reel_Video Hay reel: 1
Hay reel nhiều: 1
reel nhiều templates: 1
