<a href="https://colab.research.google.com/github/ac-26/Automated-Content-Tagging-Provision/blob/main/content_tagging_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import re
from typing import List, Dict, Tuple
from collections import Counter
import spacy

In [8]:
class TextEncoder:
    """Turns a piece of text into a fixed-size vector using a Hugging Face model.

    The vector is the unweighted mean of the model's last hidden state over
    the token axis.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the tokenizer and model for ``model_name`` and switch to eval mode."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        print("Model Loaded Succesfully")

    def encode_text(self, text: str) -> np.ndarray:
        """Return a 1-D NumPy embedding for ``text``.

        Input longer than 512 tokens is truncated by the tokenizer.
        """
        tokenized = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )

        # Inference only: no gradient bookkeeping is needed.
        with torch.no_grad():
            hidden_states = self.model(**tokenized).last_hidden_state

        # Average across the token dimension, then drop the batch dimension.
        pooled = torch.mean(hidden_states, dim=1)
        return pooled.numpy().flatten()

In [9]:
class TagVocabulary:
    """Holds the fixed list of candidate tags used by the vocabulary-based tagger.

    ``get_tags`` and ``add_tag`` are defined at class level so they are real
    methods; previously they were nested inside ``__init__`` and therefore
    unreachable from instances.
    """

    def __init__(self):
        """Populate ``self.tags`` with the built-in tag list and report its size."""
        self.tags = [
            # Content Creation
            "Content Writing", "Copywriting", "Blog Writing", "Article Writing",
            "Creative Writing", "Technical Writing", "Content Strategy",

            # Marketing
            "Social Media Marketing", "Digital Marketing", "Email Marketing",
            "Marketing Strategy", "Brand Marketing", "Influencer Marketing",

            # Social Media
            "Social Media", "Facebook Marketing", "Instagram Marketing",
            "Twitter Marketing", "LinkedIn Marketing", "TikTok Marketing",

            # Analytics & Testing
            "A/B Testing", "Analytics", "Performance Tracking", "Data Analysis",
            "Audience Research", "Market Research",

            # Advertising
            "Online Advertising", "Social Media Ads", "Google Ads",
            "Facebook Ads", "Digital Advertising",

            # Skills & Techniques
            "Communication Skills", "Writing Skills", "Creative Skills",
            "Marketing Skills", "Design Skills",

            # Strategy & Planning
            "Content Planning", "Marketing Planning", "Campaign Strategy",
            "Audience Targeting", "Customer Engagement"
        ]

        print(f"Tag vocabulary initialized with {len(self.tags)} tags")

    def get_tags(self) -> List[str]:
        """Return a copy of the tag list so callers cannot mutate the vocabulary."""
        return self.tags.copy()

    def add_tag(self, new_tag: str):
        """Append ``new_tag`` to the vocabulary unless it is already present.

        Membership is an exact, case-sensitive string comparison. A message is
        printed in both cases.
        """
        if new_tag not in self.tags:
            self.tags.append(new_tag)
            print(f"Added new tag: {new_tag}")
        else:
            print(f"Tag '{new_tag}' already exists")

In [15]:
class BasicTagger:
    """Ranks the fixed tag vocabulary against a text by embedding similarity."""

    def __init__(self):
        """Create the encoder and vocabulary, then embed every tag once up front."""
        self.encoder = TextEncoder()
        self.vocabulary = TagVocabulary()

        self.tag_embeddings = self._encode_all_tags()

    def _encode_all_tags(self) -> Dict[str, np.ndarray]:
        """Return a mapping of tag name to its embedding, in vocabulary order."""
        return {
            tag: self.encoder.encode_text(tag)
            for tag in self.vocabulary.tags
        }

    def find_matching_tags(self, input_text: str, top_k: int = 10) -> List[Tuple[str, float]]:
        """Return the ``top_k`` vocabulary tags most similar to ``input_text``.

        Each entry is ``(tag_name, cosine_similarity)`` and the list is ordered
        from most to least similar.
        """
        # cosine_similarity expects 2-D inputs, so the query becomes one row.
        query_row = self.encoder.encode_text(input_text).reshape(1, -1)

        ranked = sorted(
            (
                (name, float(cosine_similarity(query_row, vector.reshape(1, -1))[0][0]))
                for name, vector in self.tag_embeddings.items()
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )

        return ranked[:top_k]

In [17]:
# Test our basic tagger
def test_basic_tagger():
    """Demo: rank the fixed vocabulary against a sample paragraph and print the top 15."""
    basic_tagger = BasicTagger()

    sample_text = """
    Creating social media posts is a great way to hone your content writing skills.
    Since posts are typically very short, snappy, and quick, you can easily try out
    different styles of writing and see what people respond to. It's easy to change
    direction and adapt if you need to tweak your writing style since social media
    posts are typically fluid and changeable by nature. You can also practice A/B
    testing with your social media ads—try writing two different posts and sending
    it to similar demographics and see which one performs better.
    """

    print("Input text:")
    print(sample_text)

    # Rank every vocabulary tag by similarity and keep the best 15.
    ranked_tags = basic_tagger.find_matching_tags(sample_text, top_k=15)

    print("Top matching tags:")
    rank = 0
    for tag, score in ranked_tags:
        rank += 1
        print(f"{rank:2d}. {tag:<25} (Score: {score:.3f})")


# Only run the demo when executed directly (e.g. as a notebook cell).
if __name__ == "__main__":
    test_basic_tagger()

Model Loaded Succesfully
Tag vocabulary initialized with 40 tags
Input text:

    Creating social media posts is a great way to hone your content writing skills. 
    Since posts are typically very short, snappy, and quick, you can easily try out 
    different styles of writing and see what people respond to. It's easy to change 
    direction and adapt if you need to tweak your writing style since social media 
    posts are typically fluid and changeable by nature. You can also practice A/B 
    testing with your social media ads—try writing two different posts and sending 
    it to similar demographics and see which one performs better.
    
Top matching tags:
 1. Blog Writing              (Score: 0.535)
 2. Social Media Marketing    (Score: 0.493)
 3. Social Media Ads          (Score: 0.490)
 4. Social Media              (Score: 0.483)
 5. Content Writing           (Score: 0.462)
 6. Twitter Marketing         (Score: 0.413)
 7. Article Writing           (Score: 0.380)
 8. Instagr

### **Dynamic Tag Generation**

In [19]:
class KeyPhraseExtractor:
    """Extracts candidate tag phrases by matching part-of-speech (POS) patterns.

    Each sentence is scanned with a sliding window; any run of tokens whose
    POS tags equal one of ``phrase_patterns`` and which passes the token
    filters is counted as a phrase occurrence.
    """

    def __init__(self):
        """Load the spaCy English model and define the POS patterns to match.

        Raises:
            OSError: re-raised from ``spacy.load`` when the model cannot be
                loaded, after printing an installation hint.
        """
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            # Catch only the load failure; a bare ``except`` would also print this
            # misleading hint for KeyboardInterrupt and unrelated errors.
            print("Please install spacy model: python -m spacy download en_core_web_sm")
            raise

        # POS sequences that tend to make good tags.
        self.phrase_patterns = [
            # single noun (e.g., "Python", "Marketing")
            ["NOUN"],
            ["PROPN"],  # proper nouns

            # adjective + noun (e.g., "Machine Learning", "Social Media")
            ["ADJ", "NOUN"],
            ["ADJ", "PROPN"],

            # noun + noun (e.g., "Data Science", "Content Strategy")
            ["NOUN", "NOUN"],
            ["PROPN", "NOUN"],
            ["NOUN", "PROPN"],

            # three-word phrases (e.g., "Natural Language Processing")
            ["ADJ", "NOUN", "NOUN"],
            ["NOUN", "NOUN", "NOUN"],
            ["PROPN", "PROPN", "PROPN"],

            # verb forms that work as tags (e.g., "Programming", "Writing");
            # extract_phrases keeps only verbs ending in "ing"
            ["VERB"],
        ]

        print("KeyPhraseExtractor initialized successfully")

    @staticmethod
    def _is_usable_token(token, single_word: bool) -> bool:
        """Return True when ``token`` may be part of a phrase.

        Rejects stopwords in single-word phrases, verbs that do not end in
        "ing", and tokens shorter than three characters.
        """
        if single_word and token.is_stop:
            return False
        if token.pos_ == "VERB" and not token.text.endswith("ing"):
            return False
        if len(token.text) < 3:
            return False
        return True

    def extract_phrases(self, text: str, min_freq: int = 1) -> List[Tuple[str, int]]:
        """Extract phrases from ``text`` that match the configured POS patterns.

        The text is lowercased before it is passed to spaCy, so every returned
        phrase is lowercase. Overlapping matches are all counted (for example
        both "social media" and "media").

        Args:
            text: Input text to analyse.
            min_freq: Minimum number of occurrences a phrase needs to be kept.

        Returns:
            List of ``(phrase, frequency)`` tuples sorted by frequency,
            highest first; ties keep first-seen order.
        """
        doc = self.nlp(text.lower())
        phrase_counter = Counter()

        for sentence in doc.sents:
            sentence_length = len(sentence)
            for start in range(sentence_length):
                # Try every pattern anchored at the current token.
                for pattern in self.phrase_patterns:
                    end = start + len(pattern)
                    if end > sentence_length:
                        continue

                    tokens = sentence[start:end]
                    if [token.pos_ for token in tokens] != pattern:
                        continue

                    single_word = len(pattern) == 1
                    if not all(self._is_usable_token(token, single_word) for token in tokens):
                        continue

                    # Collapse any stray whitespace inside the joined phrase.
                    phrase = " ".join(token.text for token in tokens)
                    phrase = re.sub(r'\s+', ' ', phrase).strip()
                    if phrase:
                        phrase_counter[phrase] += 1

        # Keep phrases that occur often enough, most frequent first.
        phrases = [(phrase, freq) for phrase, freq in phrase_counter.items()
                   if freq >= min_freq]
        phrases.sort(key=lambda item: item[1], reverse=True)

        return phrases

In [39]:
class PhraseScorer:
    """
    Scores and filters extracted phrases to identify the best tags.

    The quality score is a weighted sum of five factors (frequency 0.3,
    specificity 0.25, length 0.15, domain relevance 0.2, completeness 0.1),
    with a 1.2x multiplier for a small set of exact phrases.
    """

    # Connector words that signal a truncated phrase when they start or end it.
    _INCOMPLETE_MARKERS = frozenset({'of', 'to', 'for', 'with', 'and', 'or', 'but'})

    # Exact phrases that receive the 1.2x bonus.
    _BONUS_PHRASES = frozenset({'a/b testing', 'content writing', 'social media',
                                'email marketing', 'data analysis'})

    # Preference by word count; any other length scores 0.5.
    _LENGTH_PREFERENCE = {1: 0.7, 2: 1.0, 3: 0.9}

    def __init__(self):
        """Set up the generic-word and domain-indicator word lists."""
        # Common/generic words that make poor tags
        self.generic_words = {
            'way', 'ways', 'thing', 'things', 'people', 'person', 'time', 'times',
            'place', 'places', 'day', 'days', 'year', 'years', 'good', 'bad',
            'great', 'nice', 'sure', 'certain', 'different', 'same', 'other',
            'new', 'old', 'high', 'low', 'large', 'small', 'long', 'short',
            'easy', 'hard', 'simple', 'complex', 'nature', 'type', 'types',
            'kind', 'kinds', 'lot', 'lots', 'direction', 'need', 'needs'
        }

        # Words that boost phrase importance
        self.domain_indicators = {
            'analysis', 'strategy', 'marketing', 'development', 'management',
            'design', 'research', 'optimization', 'system', 'process', 'method',
            'technique', 'approach', 'framework', 'model', 'algorithm', 'data',
            'content', 'digital', 'social', 'media', 'online', 'software',
            'testing', 'planning', 'writing', 'creative', 'technical'
        }

        print("PhraseScorer initialized successfully")

    def calculate_phrase_scores(self, phrases: List[Tuple[str, int]],
                               text_length: int) -> List[Tuple[str, float]]:
        """
        Calculate quality scores for each phrase.

        Phrases that contain no words (empty or whitespace-only) are skipped
        instead of raising a division-by-zero error.

        Args:
            phrases: List of (phrase, frequency) tuples
            text_length: Total word count of original text. Currently unused;
                kept so existing callers keep working.

        Returns:
            List of (phrase, score) tuples, sorted by score (highest first)
        """
        scored_phrases = []

        # Highest frequency, used for normalization; fall back to 1 for an
        # empty list or an all-zero list so the division below is always safe.
        max_freq = max((freq for _, freq in phrases), default=1) or 1

        for phrase, freq in phrases:
            words = phrase.lower().split()
            if not words:
                continue
            word_count = len(words)

            # 1. Frequency score: share of the maximum frequency, capped at 1
            frequency = min(freq / max_freq, 1.0) * 0.3

            # 2. Specificity score (penalize generic words)
            generic_count = sum(1 for word in words if word in self.generic_words)
            specificity = (1 - generic_count / word_count) * 0.25

            # 3. Length score (prefer 2-3 word phrases)
            length = self._LENGTH_PREFERENCE.get(word_count, 0.5) * 0.15

            # 4. Domain relevance (share of words that are domain indicators)
            domain_word_count = sum(1 for word in words
                                    if word in self.domain_indicators)
            domain_relevance = min(domain_word_count / word_count, 1.0) * 0.2

            # 5. Completeness score (penalize phrases that start or end with a connector)
            is_complete = (words[0] not in self._INCOMPLETE_MARKERS and
                           words[-1] not in self._INCOMPLETE_MARKERS)
            completeness = (1.0 if is_complete else 0.5) * 0.1

            total_score = sum((frequency, specificity, length,
                               domain_relevance, completeness))

            # Bonus for exact domain matches
            if phrase.lower() in self._BONUS_PHRASES:
                total_score *= 1.2

            scored_phrases.append((phrase, total_score))

        # Sort by score (descending)
        scored_phrases.sort(key=lambda item: item[1], reverse=True)

        return scored_phrases

    def filter_similar_phrases(self, scored_phrases: List[Tuple[str, float]],
                              similarity_threshold: float = 0.5) -> List[Tuple[str, float]]:
        """
        Remove similar/redundant phrases, keeping the first variant encountered.

        With input sorted by score (as returned by calculate_phrase_scores)
        the kept variant is the highest-scoring one. A phrase is redundant
        when, compared with an already kept phrase, either string contains the
        other, or the shared words make up at least ``similarity_threshold``
        of the smaller phrase's distinct words. Phrases with no words are
        dropped; an empty string would otherwise match every other phrase.

        Args:
            scored_phrases: List of (phrase, score) tuples
            similarity_threshold: Word-overlap ratio at or above which two
                phrases are treated as similar

        Returns:
            Filtered list of (phrase, score) tuples, in input order
        """
        filtered = []
        kept = []  # (lowercased phrase, word set) for every phrase kept so far

        for phrase, score in scored_phrases:
            phrase_lower = phrase.lower()
            phrase_words = set(phrase_lower.split())
            if not phrase_words:
                continue

            is_redundant = False
            for kept_lower, kept_words in kept:
                # One phrase textually contains the other
                if phrase_lower in kept_lower or kept_lower in phrase_lower:
                    is_redundant = True
                    break

                # Shared words relative to the smaller phrase
                shared = len(phrase_words & kept_words)
                smaller_set_size = min(len(phrase_words), len(kept_words))
                if shared / smaller_set_size >= similarity_threshold:
                    is_redundant = True
                    break

            if not is_redundant:
                filtered.append((phrase, score))
                kept.append((phrase_lower, phrase_words))

        return filtered

In [42]:
def test_phrase_scorer():
    """Demo: extract phrases from a sample paragraph, score them, and print the results."""
    phrase_extractor = KeyPhraseExtractor()
    sample_text = """
    Creating social media posts is a great way to hone your content writing skills.
    Since posts are typically very short, snappy, and quick, you can easily try out
    different styles of writing and see what people respond to. It's easy to change
    direction and adapt if you need to tweak your writing style since social media
    posts are typically fluid and changeable by nature. You can also practice A/B
    testing with your social media ads—try writing two different posts and sending
    it to similar demographics and see which one performs better.
    """

    candidates = phrase_extractor.extract_phrases(sample_text)

    # Score the candidates, then drop near-duplicates.
    phrase_scorer = PhraseScorer()
    total_words = len(sample_text.split())

    ranked = phrase_scorer.calculate_phrase_scores(candidates, total_words)
    deduplicated = phrase_scorer.filter_similar_phrases(ranked)

    sections = (
        ("\nTop scored phrases (before filtering):", ranked),
        ("\nFiltered phrases (after removing similar ones):", deduplicated),
    )
    for heading, rows in sections:
        print(heading)
        for phrase, score in rows[:10]:
            print(f"  '{phrase}' - Score: {score:.3f}")

    print("\nFinal tags for this text:")
    # Use phrases scoring above 0.6 when there are at least five of them;
    # otherwise fall back to the filtered list. Either way keep at most seven.
    strong = [pair for pair in deduplicated if pair[1] > 0.6]
    pool = strong if len(strong) >= 5 else deduplicated

    final_tags = [phrase for phrase, _ in pool[:7]]
    print(f"  {', '.join(final_tags)}")

In [43]:
# Run the phrase-scoring demo. The guard means the call happens only when this
# code executes as the main module (e.g. a notebook cell), not when imported.
if __name__ == "__main__":
    test_phrase_scorer()

KeyPhraseExtractor initialized successfully
PhraseScorer initialized successfully

Top scored phrases (before filtering):
  'social media' - Score: 1.110
  'writing' - Score: 0.955
  'media' - Score: 0.880
  'posts' - Score: 0.755
  'content' - Score: 0.730
  'testing' - Score: 0.730
  'social media posts' - Score: 0.693
  'social media ads' - Score: 0.693
  'media posts' - Score: 0.675
  'writing style' - Score: 0.675

Filtered phrases (after removing similar ones):
  'social media' - Score: 1.110
  'writing' - Score: 0.955
  'posts' - Score: 0.755
  'content' - Score: 0.730
  'testing' - Score: 0.730
  'similar demographics' - Score: 0.575
  'creating' - Score: 0.530
  'skills' - Score: 0.530
  'styles' - Score: 0.530
  'ads' - Score: 0.530

Final tags for this text:
  social media, writing, posts, content, testing


In [66]:
class DynamicTagger:
    """
    Complete dynamic tagging system that:
    1. Extracts key phrases from any text
    2. Scores them based on quality metrics
    3. Blends in the embedding similarity between each phrase and the text
    4. Returns the highest-scoring phrases as tags
    """

    # Weights used when blending the quality score with the semantic score.
    QUALITY_WEIGHT = 0.7
    SEMANTIC_WEIGHT = 0.3

    def __init__(self, encoder_model="sentence-transformers/all-MiniLM-L6-v2"):
        """Create the encoder, phrase extractor and phrase scorer.

        Args:
            encoder_model: Model name passed to TextEncoder.
        """
        print("Initializing DynamicTagger...")

        # Initialize all components
        self.encoder = TextEncoder(encoder_model)
        self.extractor = KeyPhraseExtractor()
        self.scorer = PhraseScorer()

        print("DynamicTagger ready!")

    def generate_tags(self, text: str, max_tags: int = 10, min_score: float = 0.6,
                      min_tags: int = 5) -> List[Tuple[str, float]]:
        """
        Generate tags dynamically from input text.

        Args:
            text: Input text to generate tags from
            max_tags: Maximum number of tags to return
            min_score: Combined score a tag must exceed to count as high quality
            min_tags: When fewer than this many tags exceed ``min_score``, the
                best ``min_tags`` candidates are returned instead (or all of
                them if fewer exist), so short texts still get tags

        Returns:
            List of (tag, combined_score) tuples, best first, with scores as
            plain Python floats
        """
        # Step 1: Extract key phrases
        phrases = self.extractor.extract_phrases(text)

        if not phrases:
            return []

        # Step 2: Score phrases for quality
        word_count = len(text.split())
        scored_phrases = self.scorer.calculate_phrase_scores(phrases, word_count)

        # Step 3: Filter redundant phrases
        filtered_phrases = self.scorer.filter_similar_phrases(scored_phrases)

        # Step 4: Blend quality with semantic relevance to the whole text.
        # The text embedding is loop-invariant, so reshape it once.
        text_embedding = self.encoder.encode_text(text).reshape(1, -1)

        final_scores = []
        for phrase, quality_score in filtered_phrases:
            phrase_embedding = self.encoder.encode_text(phrase).reshape(1, -1)

            # cosine_similarity yields a NumPy scalar; convert to a Python float
            # so results match the annotated return type.
            semantic_score = float(
                cosine_similarity(text_embedding, phrase_embedding)[0][0]
            )

            combined_score = (quality_score * self.QUALITY_WEIGHT
                              + semantic_score * self.SEMANTIC_WEIGHT)
            final_scores.append((phrase, float(combined_score)))

        # Sort by combined score
        final_scores.sort(key=lambda item: item[1], reverse=True)

        # Apply quality threshold
        quality_tags = [(tag, score) for tag, score in final_scores if score > min_score]

        # Ensure a minimum number of tags. Slicing copes with short candidate
        # lists, so no separate length check on final_scores is needed.
        if len(quality_tags) < min_tags:
            quality_tags = final_scores[:min_tags]

        # Return up to max_tags
        return quality_tags[:max_tags]

    def tag_text(self, text: str, max_tags: int = 7) -> List[str]:
        """
        Simple interface that returns just the tag strings.

        Args:
            text: Input text to tag
            max_tags: Maximum number of tags

        Returns:
            List of tag strings, best first
        """
        return [tag for tag, _ in self.generate_tags(text, max_tags)]

    def tag_text_with_scores(self, text: str, max_tags: int = 7) -> List[Tuple[str, float]]:
        """
        Interface that returns tags with their scores.

        Args:
            text: Input text to tag
            max_tags: Maximum number of tags

        Returns:
            List of (tag, score) tuples, best first
        """
        return self.generate_tags(text, max_tags)

In [77]:
def test_dynamic_tagger():
    """Demo: run the dynamic tagger on four sample texts and print the scored tags."""
    dynamic_tagger = DynamicTagger()
    rule = "="*60

    social_media_text = """
    Creating social media posts is a great way to hone your content writing skills.
    Since posts are typically very short, snappy, and quick, you can easily try out
    different styles of writing and see what people respond to. It's easy to change
    direction and adapt if you need to tweak your writing style since social media
    posts are typically fluid and changeable by nature. You can also practice A/B
    testing with your social media ads—try writing two different posts and sending
    it to similar demographics and see which one performs better.
    """

    technical_text = """
    Machine learning algorithms are transforming how we process bigdata. Python
    libraries like TensorFlow and PyTorch make it easier to build neural networks
    for deep learning applications. Data scientists use these tools for predictive
    analytics and pattern recognition in complex datasets.
    """

    medical_text = """
    The patient presented with acute respiratory symptoms including persistent cough
    and shortness of breath. Blood tests revealed elevated white blood cell count.
    Treatment protocol included antibiotics and respiratory therapy. Follow-up
    examination showed significant improvement in lung function.
    """

    club_text = """
    We are pleased to announce that the Viewfinders Club is now welcoming new volunteers
    who are passionate about photography and eager to contribute to capturing the vibrant
    moments of our school life.
    """

    # (title, text, whether to print the "Final tags with scores:" line)
    demo_cases = [
        ("Test 1: Social Media Marketing Text", social_media_text, False),
        ("Test 2: Technical/Programming Text", technical_text, False),
        ("Test 3: Medical/Healthcare Text", medical_text, True),
        ("Test 4: My Random Example", club_text, True),
    ]

    for position, (title, sample, announce) in enumerate(demo_cases):
        # Every banner after the first is preceded by a blank line.
        print(rule if position == 0 else "\n" + rule)
        print(title)
        print(rule)

        scored_tags = dynamic_tagger.tag_text_with_scores(sample)
        if announce:
            print("\nFinal tags with scores:")
        for tag, score in scored_tags:
            print(f"  '{tag}' - Score: {score:.3f}")

In [78]:
# Run the dynamic-tagger demo. The guard means the call happens only when this
# code executes as the main module (e.g. a notebook cell), not when imported.
if __name__ == "__main__":
    test_dynamic_tagger()

Initializing DynamicTagger...
Model Loaded Succesfully
KeyPhraseExtractor initialized successfully
PhraseScorer initialized successfully
DynamicTagger ready!
Test 1: Social Media Marketing Text
  'social media' - Score: 0.922
  'writing' - Score: 0.766
  'posts' - Score: 0.634
  'content' - Score: 0.587
  'testing' - Score: 0.555

Test 2: Technical/Programming Text
  'data' - Score: 0.660
  'learning' - Score: 0.631
  'tensorflow' - Score: 0.593
  'neural networks' - Score: 0.570
  'pytorch' - Score: 0.553

Test 3: Medical/Healthcare Text

Final tags with scores:
  'persistent cough' - Score: 0.629
  'respiratory symptoms' - Score: 0.613
  'blood' - Score: 0.584
  'lung function' - Score: 0.553
  'breath' - Score: 0.512

Test 4: My Random Example

Final tags with scores:
  'viewfinders club' - Score: 0.766
  'volunteers' - Score: 0.668
  'photography' - Score: 0.653
  'welcoming' - Score: 0.652
  'capturing' - Score: 0.619
  'school life' - Score: 0.610
