<a href="https://colab.research.google.com/github/anuradha-datascience/NLP/blob/main/HandsOnNLPWithConcept_Part2_BOW-TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [173]:
import pandas as pd
import chardet
with open('SMSSpamCollection.csv', 'rb') as f:
    encoding = chardet.detect(f.read())['encoding']


In [174]:
df=pd.read_csv('SMSSpamCollection.csv',sep="\t",names=["label","message"],encoding=encoding)

In [175]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [176]:
df.shape

(5572, 2)

In [177]:
df['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."


**Data Cleaning and Preprocessing**


1.   Text Preprocessing
  *   Tokenization
  *   Stop Words
  *   Stemming
  *   POS Tagging
  *   Lemmetization
  *   NLTK Library

2.   Word Embeddings

Frequency-based Embedding
  *   BOW
  *   TFIDF
  *   Glove

Prediction-based Embedding
  *   Word2Vec
  *   AvgWord2Vec
  *   BERT






In [178]:

import nltk
import string
#download resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [179]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [180]:
#tokenize text
df["message_preprocess"]=df['message'].apply(lambda x: word_tokenize(x))
df["message_preprocess"].head()

0    [Go, until, jurong, point, ,, crazy, .., Avail...
1             [Ok, lar, ..., Joking, wif, u, oni, ...]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, ..., U, c, alrea...
4    [Nah, I, do, n't, think, he, goes, to, usf, ,,...
Name: message_preprocess, dtype: object

In [181]:
df["message_preprocess"].loc[100]



['Please',
 'do',
 "n't",
 'text',
 'me',
 'anymore',
 '.',
 'I',
 'have',
 'nothing',
 'else',
 'to',
 'say',
 '.']

In [182]:
# Get the English stopwords list
stopwords_list = stopwords.words('english')

# Print the stopwords list
print(stopwords_list)

print(string.punctuation)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [183]:
# remove punctuations and stop words
stop_words = stopwords.words('english') + list(string.punctuation)


In [184]:
#word lowering , stop word and punctuation removal, number removal
def filter_tokens(tokens):
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    filtered_tokens=[token for token in filtered_tokens if token.isalpha()]
    return filtered_tokens

In [185]:
df["message_preprocess"]=df['message_preprocess'].apply(lambda x: filter_tokens(x))
df["message_preprocess"].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, wkly, comp, win, fa, cup, final,...
3        [u, dun, say, early, hor, u, c, already, say]
4       [nah, think, goes, usf, lives, around, though]
Name: message_preprocess, dtype: object

In [186]:
df["message_preprocess"].loc[1100]

['ne',
 'thing',
 'interesting',
 'good',
 'birthday',
 'u',
 'wrking',
 'nxt',
 'started',
 'uni',
 'today']

In [187]:
#stemming vs lemmatization
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stemming(tokens):

  tokens = [stemmer.stem(token) for token in tokens]
  return tokens

In [188]:
# df["message_preprocess_new"]=df['message_preprocess'].apply(lambda x: stemming(x))
# df["message_preprocess_new"].head()

In [189]:
tokens =["ok", "lar", "jokinging", "wif", "u", "oni","joining"]
new=stemming(tokens)
print(new)

['ok', 'lar', 'joking', 'wif', 'u', 'oni', 'join']


In [190]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatization(tokens):
    # Lemmatization with POS tagging
    lemmatizer = WordNetLemmatizer()
    lem_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return lem_tokens

# Example usage
tokens = ["running", "cars", "swimming"]
lemmatized_tokens = lemmatization(tokens)
print(lemmatized_tokens)

['run', 'car', 'swim']


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [191]:
df["message_preprocess"]=df['message_preprocess'].apply(lambda x: stemming(x))
df["message_preprocess"].head()

0    [go, jurong, point, crazi, avail, bugi, n, gre...
1                         [ok, lar, joke, wif, u, oni]
2    [free, entri, wkli, comp, win, fa, cup, final,...
3        [u, dun, say, earli, hor, u, c, alreadi, say]
4         [nah, think, goe, usf, live, around, though]
Name: message_preprocess, dtype: object

In [197]:
# let's make a single function for preprocessing using nltk
def get_wordnet_pos(word):
  """Map POS tag to first character lemmatize() accepts"""
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}
  return tag_dict.get(tag, wordnet.NOUN)

def preprocess_data(text):
  #tokenizing
  preprocess_tokens=word_tokenize(text)

  #stop word| lowering | punctuation| only alpha
  stopwords_list = stopwords.words('english')
  preprocess_tokens = [word.lower() for word in preprocess_tokens if word.lower() not in stop_words]
  preprocess_tokens=[token for token in preprocess_tokens if token.isalpha()]

  # Lemmatization with POS tagging
  lemmatizer = WordNetLemmatizer()
  preprocess_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in preprocess_tokens]
  return preprocess_tokens

sample_df=pd.read_csv('SMSSpamCollection.csv',sep="\t",names=["label","message"],encoding=encoding)

sample_df["message_preprocess"]=sample_df['message'].apply(lambda x: preprocess_data(x))

In [198]:
sample_df.head()

Unnamed: 0,label,message,message_preprocess
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, think, go, usf, life, around, though]"


In [199]:
#create corpus of messages
corpus=df["message_preprocess"].tolist()
corpus[2:5]

[['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'c',
  'apply'],
 ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though']]

In [200]:
# create vocab from corpus
def create_vocab(corpus):
    vocab = set()
    for tokens in corpus:
        vocab.update(tokens)
    return vocab

# Create vocabulary
vocab = create_vocab(corpus)

# Print vocabulary
print(vocab)

{'nicky', 'velusamy', 'pace', 'mag', 'phrase', 'prince', 'xx', 'discuss', 'info', 'nag', 'lotsof', 'road', 'uin', 'simpson', 'loko', 'nokia', 'jenne', 'internal', 'uhhhhrmm', 'apology', 'forget', 'lakh', 'late', 'everytime', 'writhe', 'sterm', 'anywhere', 'culture', 'radio', 'milk', 'punish', 'language', 'sac', 'conference', 'curtsey', 'toxic', 'nutter', 'hopeso', 'relationship', 'manageable', 'lor', 'annie', 'rcvd', 'mom', 'oblisingately', 'telephone', 'remains', 'promote', 'ish', 'dancin', 'combination', 'duvet', 'achan', 'superb', 'rgent', 'sporadically', 'biola', 'bout', 'shitinnit', 'spiffing', 'saving', 'prominent', 'jenxxx', 'transaction', 'egg', 'lord', 'cock', 'rudi', 'frndz', 'slap', 'mega', 'fry', 'battle', 'triumphed', 'crisis', 'bye', 'ello', 'pocay', 'announcement', 'lage', 'yifeng', 'arnt', 'espe', 'traffic', 'spoil', 'need', 'rajnikant', 'forgotten', 'j', 'taxi', 'evo', 'libertine', 'necklace', 'sure', 'bet', 'plough', 'chest', 'jide', 'asa', 'ummmmmaah', 'liver', 'wt',

# Frequency based word Embeddings
 - BOW
 - TFIDF

In [2]:
import numpy as np
from collections import Counter
from math import log

# Corpus
corpus = [
    ["This", "movie", "is", "very", "scary", "and", "long"],
    ["This", "movie", "is", "not", "scary", "and", "is", "slow"],
    ["This", "movie", "is", "spooky", "and", "good"]
]

# Function to create Bag-of-Words matrix
def create_bow_matrix(corpus):
    unique_words = set(word for doc in corpus for word in doc)
    word_to_index = {word: i for i, word in enumerate(unique_words)}
    bow_matrix = np.zeros((len(corpus), len(unique_words)))
    for i, doc in enumerate(corpus):
        word_counts = Counter(doc)
        for word, count in word_counts.items():
            bow_matrix[i, word_to_index[word]] = count
    return bow_matrix, list(unique_words)

# Function to create TF-IDF matrix
def create_tfidf_matrix(corpus):
    bow_matrix, unique_words = create_bow_matrix(corpus)
    tfidf_matrix = np.zeros(bow_matrix.shape)
    doc_count = len(corpus)
    for i, doc in enumerate(corpus):
        word_counts = Counter(doc)
        doc_length = sum(word_counts.values())
        for word, count in word_counts.items():
            word_index = unique_words.index(word)
            tf = log(count / doc_length +1)
            idf = log(doc_count / sum(1 for doc in corpus if word in doc))
            tfidf_matrix[i, word_index] = tf * idf
    return tfidf_matrix, unique_words

# Create Bag-of-Words matrix
bow_matrix, bow_feature_names = create_bow_matrix(corpus)

# Create TF-IDF matrix
tfidf_matrix, tfidf_feature_names = create_tfidf_matrix(corpus)

# Print Bag-of-Words matrix and feature names
print("Bag-of-Words Matrix:")
print(bow_matrix)
print("\nFeature Names:")
print(bow_feature_names)

# Print TF-IDF matrix and feature names
print("\nTF-IDF Matrix:")
print(tfidf_matrix)
print("\nFeature Names:")
print(tfidf_feature_names)

Bag-of-Words Matrix:
[[0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1.]
 [0. 1. 1. 0. 1. 1. 0. 1. 2. 1. 0.]
 [1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0.]]

Feature Names:
['spooky', 'This', 'not', 'very', 'movie', 'and', 'good', 'scary', 'is', 'slow', 'long']

TF-IDF Matrix:
[[0.         0.         0.         0.14669923 0.         0.
  0.         0.05414232 0.         0.         0.14669923]
 [0.         0.         0.12939789 0.         0.         0.
  0.         0.04775691 0.         0.12939789 0.        ]
 [0.16935183 0.         0.         0.         0.         0.
  0.16935183 0.         0.         0.         0.        ]]

Feature Names:
['spooky', 'This', 'not', 'very', 'movie', 'and', 'good', 'scary', 'is', 'slow', 'long']
