# Dataset Loading

In [1]:
import pandas as pd

# Sample dataset
data = {
    "text": [
        "Hey there! How are you? <br> Visit https://example.com now!!! 😊",
        "FREE entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121",
        "Hello friend, long time no see. Wanna catch up soon?",
        "WINNER!! You have won a $1000 Walmart gift card. Go to http://bit.ly/12345 to claim now!"
    ]
}
df = pd.DataFrame(data)
df


Unnamed: 0,text
0,Hey there! How are you? <br> Visit https://exa...
1,FREE entry in 2 a wkly comp to win FA Cup fina...
2,"Hello friend, long time no see. Wanna catch up..."
3,WINNER!! You have won a $1000 Walmart gift car...


# 1. Lowercasing

In [2]:
df['text'] = df['text'].str.lower()
df

Unnamed: 0,text
0,hey there! how are you? <br> visit https://exa...
1,free entry in 2 a wkly comp to win fa cup fina...
2,"hello friend, long time no see. wanna catch up..."
3,winner!! you have won a $1000 walmart gift car...


# 2. Remove HTML Tags

In [3]:
df

Unnamed: 0,text
0,hey there! how are you? <br> visit https://exa...
1,free entry in 2 a wkly comp to win fa cup fina...
2,"hello friend, long time no see. wanna catch up..."
3,winner!! you have won a $1000 walmart gift car...


In [4]:
import re

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

df['text'] = df['text'].apply(lambda x: remove_html_tags(x))
df

Unnamed: 0,text
0,hey there! how are you? visit https://example...
1,free entry in 2 a wkly comp to win fa cup fina...
2,"hello friend, long time no see. wanna catch up..."
3,winner!! you have won a $1000 walmart gift car...


# 3. Remove URLs

In [5]:
df.iloc[0,0]

'hey there! how are you?  visit https://example.com now!!! 😊'

In [6]:
df.iloc[0]

Unnamed: 0,0
text,hey there! how are you? visit https://example...


In [7]:
import re

def remove_url(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text)

df['text'] = df['text'].apply(lambda x: remove_url(x))
df

Unnamed: 0,text
0,hey there! how are you? visit now!!! 😊
1,free entry in 2 a wkly comp to win fa cup fina...
2,"hello friend, long time no see. wanna catch up..."
3,winner!! you have won a $1000 walmart gift car...


# 4. Remove Punctuation

In [8]:
import string
exclude = string.punctuation

def remove_punc(text):
  return text.translate(str.maketrans('', '', exclude))

df['text'] = df['text'].apply(remove_punc)
df['text']

Unnamed: 0,text
0,hey there how are you visit now 😊
1,free entry in 2 a wkly comp to win fa cup fina...
2,hello friend long time no see wanna catch up soon
3,winner you have won a 1000 walmart gift card g...


# 5. Chat Word Treatment

In [9]:
chat_words = {
    "u": "you",
    "ur": "your",
    "wkly": "weekly",
    "comp": "competition",
    "fa": "football association",
    "tkts": "tickets",
    "wanna": "want to"
}

In [10]:
def chat_word_treatment(text):
  tokens=text.split()
  return  " ".join([chat_words[word] if word in chat_words else word for word in tokens])

In [11]:
df['text']=df['text'].apply(chat_word_treatment)
df['text']


Unnamed: 0,text
0,hey there how are you visit now 😊
1,free entry in 2 a weekly competition to win fo...
2,hello friend long time no see want to catch up...
3,winner you have won a 1000 walmart gift card g...


# 6. Spelling Correction

In [12]:
import pandas as pd
import random

# Some base sentences with intentional spelling mistakes
sentences = [
    "Helo frend longg time no seee",
    "I amm veryy happi todayy",
    "Thiss is a smple sentence withh erors",
    "Speling corection is importent for NLP pipline",
    "Wee aree lernning data sciense togeter",
    "Pythonn is a powrfull prograaming langage",
    "Toknization and lemmitization aree differnt"
]

# Shuffle for randomness
random.shuffle(sentences)

# Create DataFrame
df_1 = pd.DataFrame(sentences, columns=["text"])
df_1


Unnamed: 0,text
0,I amm veryy happi todayy
1,Wee aree lernning data sciense togeter
2,Helo frend longg time no seee
3,Speling corection is importent for NLP pipline
4,Thiss is a smple sentence withh erors
5,Toknization and lemmitization aree differnt
6,Pythonn is a powrfull prograaming langage


In [13]:
from textblob import TextBlob

def correct_spelling(text):
  return str(TextBlob(text).correct())

df['text']=df['text'].apply(correct_spelling)
df['text']

Unnamed: 0,text
0,hey there how are you visit now 😊
1,free entry in 2 a weekly competition to win fo...
2,hello friend long time no see want to catch up...
3,winner you have won a 1000 palmar gift card go...


# 7. Remove Stop Words

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  if text is not None:
    tokens = word_tokenize(text)
    return " ".join([word for word in tokens if word.lower() not in stop_words])
  else:
    return None

df['text'] = df['text'].apply(remove_stopwords)
df['text']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,text
0,hey visit 😊
1,free entry 2 weekly competition win football a...
2,hello friend long time see want catch soon
3,winner 1000 palmar gift card go claim


# 8. Handling Emojis

In [15]:
!pip install emoji



In [16]:
df.iloc[0,0]

'hey visit 😊'

In [17]:
import emoji

def handle_emojis(text):
  return emoji.demojize(text)

df['text'] = df['text'].apply(handle_emojis)
df['text']

Unnamed: 0,text
0,hey visit :smiling_face_with_smiling_eyes:
1,free entry 2 weekly competition win football a...
2,hello friend long time see want catch soon
3,winner 1000 palmar gift card go claim


# 9. Tokenization

In [18]:
from nltk.tokenize import word_tokenize
# using words tokenization
def tokenize_text(text):
    return word_tokenize(text)

df["tokens"] = df["text"].apply(tokenize_text)
df[["text", "tokens"]]


Unnamed: 0,text,tokens
0,hey visit :smiling_face_with_smiling_eyes:,"[hey, visit, :, smiling_face_with_smiling_eyes..."
1,free entry 2 weekly competition win football a...,"[free, entry, 2, weekly, competition, win, foo..."
2,hello friend long time see want catch soon,"[hello, friend, long, time, see, want, catch, ..."
3,winner 1000 palmar gift card go claim,"[winner, 1000, palmar, gift, card, go, claim]"


In [19]:
from nltk.tokenize import sent_tokenize
# using sentence tokenization
def tokenize_text(text):
    return sent_tokenize(text)

df["tokens"] = df["text"].apply(tokenize_text)
df[["text", "tokens"]]


Unnamed: 0,text,tokens
0,hey visit :smiling_face_with_smiling_eyes:,[hey visit :smiling_face_with_smiling_eyes:]
1,free entry 2 weekly competition win football a...,[free entry 2 weekly competition win football ...
2,hello friend long time see want catch soon,[hello friend long time see want catch soon]
3,winner 1000 palmar gift card go claim,[winner 1000 palmar gift card go claim]


# 10. Stemming

In [20]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_words(tokens):
    return [stemmer.stem(word) for word in tokens]

df["stemmed"] = df["text"].apply(stem_words)
df[["text", "stemmed"]]


Unnamed: 0,text,stemmed
0,hey visit :smiling_face_with_smiling_eyes:,"[h, e, y, , v, i, s, i, t, , :, s, m, i, l, ..."
1,free entry 2 weekly competition win football a...,"[f, r, e, e, , e, n, t, r, y, , 2, , w, e, ..."
2,hello friend long time see want catch soon,"[h, e, l, l, o, , f, r, i, e, n, d, , l, o, ..."
3,winner 1000 palmar gift card go claim,"[w, i, n, n, e, r, , 1, 0, 0, 0, , p, a, l, ..."


# 11. Lemmatization

In [21]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Sample sentence
sentence = "He is running and they were eating apples while dogs barked loudly"

# Tokenize
tokens = word_tokenize(sentence)

# Lemmatize only verbs
lemmatized_verbs = [lemmatizer.lemmatize(word, pos='v') for word in tokens]

print("Original:", tokens)
print("After Lemmatization (verbs only):", lemmatized_verbs)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original: ['He', 'is', 'running', 'and', 'they', 'were', 'eating', 'apples', 'while', 'dogs', 'barked', 'loudly']
After Lemmatization (verbs only): ['He', 'be', 'run', 'and', 'they', 'be', 'eat', 'apples', 'while', 'dog', 'bark', 'loudly']


# Dataset Setup

In [22]:
import pandas as pd

# Sample dataset
data = {
    "text": [
        "I love machine learning",
        "Machine learning is fun",
        "Deep learning and machine learning are related"
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,text
0,I love machine learning
1,Machine learning is fun
2,Deep learning and machine learning are related


# 1. Bag of Words (BoW)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Create BoW model
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df["text"])

# Convert to DataFrame for visualization
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
bow_df


Unnamed: 0,and,are,deep,fun,is,learning,love,machine,related
0,0,0,0,0,0,1,1,1,0
1,0,0,0,1,1,1,0,1,0
2,1,1,1,0,0,2,0,1,1


# 2. TF-IDF (Term Frequency – Inverse Document Frequency)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF model
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df["text"])

# Convert to DataFrame for visualization
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df


Unnamed: 0,and,are,deep,fun,is,learning,love,machine,related
0,0.0,0.0,0.0,0.0,0.0,0.453295,0.767495,0.453295,0.0
1,0.0,0.0,0.0,0.608845,0.608845,0.359594,0.0,0.359594,0.0
2,0.417242,0.417242,0.417242,0.0,0.0,0.492859,0.0,0.24643,0.417242


# 3. N-grams, Uni-grams, and Bi-grams

In [25]:
# Unigrams (default)
vectorizer_uni = CountVectorizer(ngram_range=(1,1))
X_uni = vectorizer_uni.fit_transform(df["text"])
unigram_df = pd.DataFrame(X_uni.toarray(), columns=vectorizer_uni.get_feature_names_out())
print("Unigrams:")
print(unigram_df)

# Bigrams
vectorizer_bi = CountVectorizer(ngram_range=(2,2))
X_bi = vectorizer_bi.fit_transform(df["text"])
bigram_df = pd.DataFrame(X_bi.toarray(), columns=vectorizer_bi.get_feature_names_out())
print("\nBigrams:")
print(bigram_df)

# Trigrams
vectorizer_tri = CountVectorizer(ngram_range=(3,3))
X_tri = vectorizer_tri.fit_transform(df["text"])
trigram_df = pd.DataFrame(X_tri.toarray(), columns=vectorizer_tri.get_feature_names_out())
print("\nTrigrams:")
print(trigram_df)


Unigrams:
   and  are  deep  fun  is  learning  love  machine  related
0    0    0     0    0   0         1     1        1        0
1    0    0     0    1   1         1     0        1        0
2    1    1     1    0   0         2     0        1        1

Bigrams:
   and machine  are related  deep learning  is fun  learning and  \
0            0            0              0       0             0   
1            0            0              0       1             0   
2            1            1              1       0             1   

   learning are  learning is  love machine  machine learning  
0             0            0             1                 1  
1             0            1             0                 1  
2             1            0             0                 1  

Trigrams:
   and machine learning  deep learning and  learning and machine  \
0                     0                  0                     0   
1                     0                  0                     0

# Install & Import Required Libraries

In [26]:
! pip uninstall -y gensim numpy scipy
! pip install gensim

Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scipy 1.13.1
Uninstalling scipy-1.13.1:
  Successfully uninstalled scipy-1.13.1
Collecting gensim
  Using cached gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Using cached scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.ma

In [27]:
import nltk
nltk.download('punkt')

import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Simple Dataset for Word2Vec

In [28]:
# Sample sentences
sentences = [
    "Game of Thrones is a great show",
    "Jon Snow knows nothing",
    "Daenerys Targaryen is the mother of dragons",
    "Winter is coming",
    "The Lannisters always pay their debts"
]

# Tokenize sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
tokenized_sentences


[['game', 'of', 'thrones', 'is', 'a', 'great', 'show'],
 ['jon', 'snow', 'knows', 'nothing'],
 ['daenerys', 'targaryen', 'is', 'the', 'mother', 'of', 'dragons'],
 ['winter', 'is', 'coming'],
 ['the', 'lannisters', 'always', 'pay', 'their', 'debts']]

# Word2Vec with CBOW (Default)

In [29]:
# CBOW model
cbow_model = Word2Vec(sentences=tokenized_sentences, vector_size=50, window=3, min_count=1, sg=0)
cbow_model.save("cbow_model.model")

# Get word vector
vector_jon = cbow_model.wv["jon"]
print("Vector for 'jon':", vector_jon[:5])  # show first 5 values

# Similar words
print("Most similar to 'jon':", cbow_model.wv.most_similar("jon"))


Vector for 'jon': [ 0.00805289  0.00869482  0.01991482 -0.00894614 -0.00277849]
Most similar to 'jon': [('thrones', 0.22442315518856049), ('their', 0.12586486339569092), ('targaryen', 0.11843740940093994), ('is', 0.09985582530498505), ('always', 0.09696212410926819), ('a', 0.08994414657354355), ('game', 0.07390209287405014), ('knows', 0.05837707966566086), ('winter', 0.04891873523592949), ('of', 0.001368667115457356)]


# Word2Vec with Skip-gram

In [30]:
# Skip-gram model
skip_model = Word2Vec(sentences=tokenized_sentences, vector_size=50, window=3, min_count=1, sg=1)
skip_model.save("skip_model.model")

# Similar words
print("Most similar to 'dragon':", skip_model.wv.most_similar("dragons"))


Most similar to 'dragon': [('of', 0.22978255152702332), ('knows', 0.22593113780021667), ('lannisters', 0.16488909721374512), ('nothing', 0.12749089300632477), ('the', 0.10523580759763718), ('mother', 0.09503234922885895), ('is', 0.05954967439174652), ('pay', 0.058207228779792786), ('winter', 0.0548836812376976), ('snow', 0.051361482590436935)]


# Game of Thrones Word2Vec

In [31]:
# Train Word2Vec on full GOT dataset (replace with real text for better results)
got_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=1)

# Find words similar to 'winter'
similar_words = got_model.wv.most_similar("winter")
print("Words similar to 'winter':", similar_words)


Words similar to 'winter': [('targaryen', 0.24666325747966766), ('coming', 0.11935960501432419), ('great', 0.11928388476371765), ('their', 0.1166219711303711), ('snow', 0.09614861011505127), ('of', 0.08546063303947449), ('thrones', 0.07172605395317078), ('knows', 0.059700846672058105), ('debts', 0.04119439423084259), ('show', 0.012430463917553425)]
