<a href="https://colab.research.google.com/github/akilash-gaddam/NLP/blob/main/nlp6_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import re
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np

In [41]:
# 1. LOAD DATA
# Read the tweet corpus, keep only rows that actually have text, and take the
# first 1000 of those as the working sample. Original index labels are kept
# (no reset), so they can run past 999 when earlier rows were discarded.
tweets = pd.read_csv('Twitter_Data.csv')
has_text = tweets["clean_text"].notna()
df = tweets[has_text].iloc[:1000]
print(df)

                                             clean_text  category
0     when modi promised “minimum government maximum...      -1.0
1     talk all the nonsense and continue all the dra...       0.0
2     what did just say vote for modi  welcome bjp t...       1.0
3     asking his supporters prefix chowkidar their n...       1.0
4     answer who among these the most powerful world...       1.0
...                                                 ...       ...
996   there are two reasons for atmosphere hatred cr...       0.0
997   modi has wiped out the small micro industries ...      -1.0
998   bjp struggles find candidates west bengal graf...      -1.0
999   modis opposition trying defame him they not wa...      -1.0
1000      phir modi sarkar get lost yaar more chowkidar       1.0

[1000 rows x 2 columns]


In [42]:
# 2. PREPROCESS & TOKENIZE
# Patterns are compiled once instead of once per tweet/token.
# IGNORECASE matters because the noise filter runs on the raw text, *before*
# lower-casing: without it "HTTP://..." or "WWW..." would survive as tokens.
NOISE_RE = re.compile(r'http\S+|www\S+|@\w+', flags=re.IGNORECASE)
TOKEN_RE = re.compile(r'\w+|[^\w\s]')   # a run of word chars, or one symbol
PUNCT_RE = re.compile(r'[^\w\s]')

# 3. WEAK SUPERVISION (HEURISTIC TAGGING)
# Closed-class words whose tag is fixed. These are checked first, so they win
# over the suffix / digit / punctuation rules below.
CLOSED_CLASS_TAGS = {
    'the': 'DET', 'a': 'DET', 'an': 'DET',
    'is': 'VERB', 'are': 'VERB', 'was': 'VERB', 'were': 'VERB', 'be': 'VERB',
    'and': 'CONJ', 'but': 'CONJ', 'or': 'CONJ',
}


def heuristic_tag(word):
    """Return the rule-based tag for one lower-cased token.

    Rules are tried in order and the first match wins: closed-class lookup,
    '-ing' suffix -> VERB, all digits -> NUM, leading non-word symbol ->
    PUNCT, otherwise NOUN. These are noisy labels, not gold tags (for
    example any word that merely ends in 'ing' is labelled VERB).
    """
    if word in CLOSED_CLASS_TAGS:
        return CLOSED_CLASS_TAGS[word]
    if word.endswith('ing'):
        return 'VERB'
    if word.isdigit():
        return 'NUM'
    if PUNCT_RE.match(word):
        return 'PUNCT'
    return 'NOUN'


def tag_text(text):
    """Strip URLs and @mentions from `text`, tokenize it, and tag each token.

    Returns a list of (token, tag) tuples; tokens are lower-cased.
    """
    cleaned = NOISE_RE.sub('', text)
    return [(token, heuristic_tag(token)) for token in TOKEN_RE.findall(cleaned.lower())]


all_sentences = [tag_text(text) for text in df['clean_text']]

# Show a summary and one example rather than dumping the whole tagged corpus
# into the cell output.
print(f"Tagged {len(all_sentences)} tweets; first tagged tweet:")
print(all_sentences[:1])



In [43]:
# 4. BUILD HMM PARAMETERS (Transitions and Emissions)

from collections import defaultdict, Counter


def _row_normalise(count_table):
    """Turn {key: Counter} counts into {key: {item: relative frequency}}.

    Each row is divided by its own total, so every inner dict sums to 1.
    Key order of the input is preserved.
    """
    probs = {}
    for key, row in count_table.items():
        row_total = sum(row.values())
        probs[key] = {item: n / row_total for item, n in row.items()}
    return probs


# Raw counts: transitions[prev_tag][tag] and emissions[tag][word].
transitions = defaultdict(Counter)
emissions = defaultdict(Counter)

for tagged in all_sentences:
    # Pad the tag sequence with the START / END pseudo-states and count every
    # adjacent pair; an empty sentence contributes a single START -> END.
    tag_path = ['START'] + [tag for _, tag in tagged] + ['END']
    for src, dst in zip(tag_path, tag_path[1:]):
        transitions[src][dst] += 1
    for token, tag in tagged:
        emissions[tag][token] += 1

# Maximum-likelihood estimates (plain relative frequencies, no smoothing).
transition_probs = _row_normalise(transitions)
emission_probs = _row_normalise(emissions)

print("--- HMM Parameter Snapshot ---")
print(f"P(NOUN | START): {transition_probs['START'].get('NOUN', 0):.4f}")
print(f"P('is' | VERB):   {emission_probs['VERB'].get('is', 0):.4f}")

--- HMM Parameter Snapshot ---
P(NOUN | START): 0.9130
P('is' | VERB):   0.0000


In [44]:
# 5. VITERBI DECODING (log-space implementation, demonstrated on one tweet)
def viterbi_decode(words, transition_probs, emission_probs, epsilon=1e-10):
    """Return the most likely tag sequence for `words` under the HMM.

    Parameters
    ----------
    words : list of str
        Tokens to tag.
    transition_probs : dict
        {prev_tag: {tag: P(tag | prev_tag)}}; the initial distribution is
        read from the 'START' row.
    emission_probs : dict
        {tag: {word: P(word | tag)}}; its keys define the candidate states.
    epsilon : float
        Probability substituted for any transition or emission missing from
        the tables. Must be > 0 because scores are computed as logarithms.

    Returns
    -------
    list of str
        One tag per input word; [] when `words` (or the state set) is empty.

    Scores are summed log-probabilities instead of multiplied probabilities.
    A product picks up a factor of `epsilon` for every unseen word, so on a
    tweet of ordinary length it underflows to 0.0 for every state and the
    argmax silently degenerates to "always state 0". Ties are broken the same
    way as a plain argmax: the first state in `emission_probs` order wins.
    """
    states = list(emission_probs.keys())
    if not words or not states:
        return []

    n_words = len(words)
    n_states = len(states)

    # Log-probability tables, built once up front.
    log_start = np.array(
        [np.log(transition_probs.get('START', {}).get(s, epsilon)) for s in states]
    )
    # log_trans[i, j] = log P(states[j] | states[i])
    log_trans = np.array(
        [[np.log(transition_probs.get(prev, {}).get(curr, epsilon)) for curr in states]
         for prev in states]
    )
    # log_emit[i, t] = log P(words[t] | states[i])
    log_emit = np.array(
        [[np.log(emission_probs[s].get(w, epsilon)) for w in words] for s in states]
    )

    score = np.full((n_states, n_words), -np.inf)
    backpointer = np.zeros((n_states, n_words), dtype=int)

    score[:, 0] = log_start + log_emit[:, 0]
    for t in range(1, n_words):
        # candidates[i, j]: best score ending in state i at t-1, then i -> j.
        # The emission term is the same for every predecessor i, so it is
        # added after the max rather than inside it.
        candidates = score[:, t - 1][:, None] + log_trans
        backpointer[:, t] = np.argmax(candidates, axis=0)
        score[:, t] = np.max(candidates, axis=0) + log_emit[:, t]

    # Follow the backpointers from the best final state, then reverse.
    path = [int(np.argmax(score[:, -1]))]
    for t in range(n_words - 1, 0, -1):
        path.append(int(backpointer[path[-1], t]))
    path.reverse()

    return [states[i] for i in path]


test_tweet = ["modi", "is", "promising", "growth", "in", "india"]
states = list(emission_probs.keys())
epsilon = 1e-10
predicted_tags = viterbi_decode(test_tweet, transition_probs, emission_probs, epsilon)

print("\n--- Viterbi Decoding Result ---")
print(f"Words: {test_tweet}")
print(f"Tags:  {predicted_tags}")


--- Viterbi Decoding Result ---
Words: ['modi', 'is', 'promising', 'growth', 'in', 'india']
Tags:  ['NOUN', 'NOUN', 'VERB', 'NOUN', 'NOUN', 'NOUN']


In [45]:
# Flatten the tagged corpus into a single token stream (tags discarded) and
# tally each distinct token once; the tally is reused for both statistics.
vocab = [token for tagged in all_sentences for token, _ in tagged]
token_freq = Counter(vocab)
rare_count = sum(1 for freq in token_freq.values() if freq == 1)

print(f"\n--- Noise Analysis ---")
print(f"Total Unique Words: {len(token_freq)}")
print(f"Rare Tokens (Freq=1): {rare_count} (These cause zero-probability errors without epsilon smoothing)")


--- Noise Analysis ---
Total Unique Words: 5118
Rare Tokens (Freq=1): 3110 (These cause zero-probability errors without epsilon smoothing)
