### Text preprocessing: converting input text to vectors

### bag of words (BOW)

In [2]:
import pandas as pd 

In [None]:
# Read the SMS spam dataset from a tab-separated text file.
# Column names are supplied explicitly, so the first line of the file is read as data:
#   'label'   -> class of the message (spam or ham)
#   'message' -> raw message text
column_names = ['label', 'message']
txt = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=column_names)
txt

Unnamed: 0,label,message
0,ham,Huh y lei...
1,spam,REMINDER FROM O2: To get 2.50 pounds free call...
2,spam,This is the 2nd time we have tried 2 contact u...
3,ham,Will ü b going to esplanade fr home?
4,ham,"Pity, * was in mood for that. So...any other s..."
5,spam,WINNER!! As a valued network customer you have...
6,ham,The guy did some bitching but I acted like i'd...
7,ham,Rofl. Its true to its name
8,spam,URGENT! You have won a 1 week FREE membership ...


In [6]:
import re 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [7]:
porterstem = PorterStemmer()  # Single stemmer instance, reused for every token in the preprocessing loop

In [None]:
# Build the cleaned corpus: one normalized, stemmed string per SMS message.

# Build the stopword set once. It does not change between tokens, so rebuilding it
# inside the comprehension (once per word) is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

corpus = []

# Iterate over the message values directly instead of label-indexing txt['message'][i],
# so the loop does not rely on the DataFrame having a default 0..n-1 index.
for raw_message in txt['message']:
    msg = re.sub('[^a-zA-Z]', ' ', raw_message)  # Replace every non-alphabetic character with a space
    msg = msg.lower()  # Convert to lowercase
    msg = msg.split()  # Split on whitespace into tokens
    msg = [porterstem.stem(w) for w in msg if w not in english_stopwords]  # Drop stopwords, stem the rest
    msg = ' '.join(msg)  # Join the tokens back into a single string
    corpus.append(msg)  # One cleaned string per message
    

In [9]:
corpus  # Display the cleaned messages: lowercased, stopwords removed, remaining words stemmed

['huh lei',
 'remind get pound free call credit detail great offer pl repli text valid name hous postcod',
 'nd time tri contact u u pound prize claim easi call p per minut bt nation rate',
 'b go esplanad fr home',
 'piti mood suggest',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'guy bitch act like interest buy someth els next week gave us free',
 'rofl true name',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw']

In [10]:
# Create the bag-of-words (BoW) representation

In [None]:
from  sklearn.feature_extraction.text import CountVectorizer # CountVectorizer converts text into a bag-of-words matrix
# binary=True: each entry records only the presence (1) or absence (0) of a word, not how often it occurs
# max_features=100: keep at most the 100 most frequent words across the corpus as features
cv = CountVectorizer(max_features=100, binary=True) 

In [None]:
# Learn the vocabulary from the corpus and encode every message as a dense row vector
x = cv.fit_transform(corpus).toarray()
# NOTE(review): the labels (ham/spam) are NOT converted to 0/1 here; this cell only vectorizes the messages

In [None]:
import numpy as np

# Make printed arrays easier to read: show up to 10 items at each edge, allow lines of
# up to 1000 characters, and render floats with 3 significant digits.
np.set_printoptions(
    edgeitems=10,
    linewidth=1000,
    formatter={"float": lambda value: "%.3g" % value},
)
x  # Display the bag-of-words matrix built from the corpus (messages only, not labels)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
cv.vocabulary_  # Shows the dictionary mapping each feature (here: single words) to its column index in the feature matrix

{'huh': np.int64(25),
 'lei': np.int64(31),
 'remind': np.int64(52),
 'get': np.int64(18),
 'pound': np.int64(48),
 'free': np.int64(16),
 'call': np.int64(4),
 'credit': np.int64(8),
 'detail': np.int64(11),
 'great': np.int64(20),
 'offer': np.int64(42),
 'pl': np.int64(45),
 'repli': np.int64(53),
 'text': np.int64(60),
 'valid': np.int64(67),
 'name': np.int64(36),
 'hous': np.int64(24),
 'postcod': np.int64(47),
 'nd': np.int64(38),
 'time': np.int64(61),
 'tri': np.int64(62),
 'contact': np.int64(7),
 'prize': np.int64(49),
 'claim': np.int64(5),
 'easi': np.int64(12),
 'per': np.int64(43),
 'minut': np.int64(34),
 'bt': np.int64(2),
 'nation': np.int64(37),
 'rate': np.int64(50),
 'go': np.int64(19),
 'esplanad': np.int64(14),
 'fr': np.int64(15),
 'home': np.int64(22),
 'piti': np.int64(44),
 'mood': np.int64(35),
 'suggest': np.int64(59),
 'winner': np.int64(70),
 'valu': np.int64(68),
 'network': np.int64(40),
 'custom': np.int64(9),
 'select': np.int64(57),
 'receivea': np.i

In [None]:
# Bag of Words built from n-grams instead of single words
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 3): every feature is a bigram or a trigram (no single words)
# binary=True / max_features=100: presence-absence encoding, capped at the 100 most frequent n-grams
# Note: this rebinds `cv` and `x`, replacing the single-word vectorizer and matrix from earlier cells.
cv = CountVectorizer(
    ngram_range=(2, 3),
    binary=True,
    max_features=100,
)
# Fit on the corpus and materialize the (message x n-gram) matrix as a dense array
x = cv.fit_transform(corpus).toarray()

In [None]:
cv.vocabulary_ # Display the vocabulary learned by the CountVectorizer; it contains bigram and trigram features because ngram_range=(2, 3)

{'huh lei': np.int64(56),
 'get pound': np.int64(47),
 'pound free': np.int64(94),
 'free call': np.int64(41),
 'call credit': np.int64(10),
 'credit detail': np.int64(26),
 'detail great': np.int64(32),
 'great offer': np.int64(51),
 'offer pl': np.int64(84),
 'pl repli': np.int64(90),
 'name hous': np.int64(73),
 'hous postcod': np.int64(55),
 'get pound free': np.int64(48),
 'pound free call': np.int64(95),
 'free call credit': np.int64(42),
 'call credit detail': np.int64(11),
 'credit detail great': np.int64(27),
 'detail great offer': np.int64(33),
 'great offer pl': np.int64(52),
 'offer pl repli': np.int64(85),
 'pl repli text': np.int64(91),
 'name hous postcod': np.int64(74),
 'nd time': np.int64(76),
 'contact pound': np.int64(24),
 'pound prize': np.int64(96),
 'prize claim': np.int64(98),
 'claim easi': np.int64(18),
 'easi call': np.int64(34),
 'call per': np.int64(12),
 'per minut': np.int64(86),
 'minut bt': np.int64(70),
 'bt nation': np.int64(4),
 'nation rate': np.in

In [None]:
x  # Display the binary bigram/trigram matrix (one row per message, one column per n-gram)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### TF-IDF Vectorization

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
doc = ['i like nlp', 'nlp is fun', 'i like python']  # Toy corpus: three short documents used to illustrate TF-IDF

In [None]:
vectorizer = TfidfVectorizer() # TF-IDF vectorizer with default settings
tfidf_matrix = vectorizer.fit_transform(doc) # Learn the vocabulary and IDF weights, then return a 2D (documents x vocabulary) TF-IDF matrix

In [9]:
vectorizer.get_feature_names_out() # Vocabulary terms in column order; 'i' is absent because the default token pattern only keeps tokens of 2+ characters

array(['fun', 'is', 'like', 'nlp', 'python'], dtype=object)

In [None]:
tfidf_matrix.toarray() # Convert the TF-IDF matrix to a dense NumPy array for display
# The result has shape (3, 5): 3 documents (rows) by 5 vocabulary terms (columns)

array([[0.        , 0.        , 0.70710678, 0.70710678, 0.        ],
       [0.62276601, 0.62276601, 0.        , 0.4736296 , 0.        ],
       [0.        , 0.        , 0.60534851, 0.        , 0.79596054]])

### TF-IDF with n-grams

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(ngram_range=(2,2)) # Bigrams only: every feature is exactly 2 consecutive words (min n = max n = 2)
x = vectorizer.fit_transform(doc) # Fit on the documents and build the bigram TF-IDF matrix (this rebinds `x` and `vectorizer`)

In [13]:
vectorizer.get_feature_names_out() # Get the feature names (here: bigrams) in the column order of the TF-IDF matrix

array(['is fun', 'like nlp', 'like python', 'nlp is'], dtype=object)

In [None]:
x.toarray() # Display the bigram TF-IDF matrix as a dense NumPy array (one row per document, one column per bigram)

array([[0.        , 1.        , 0.        , 0.        ],
       [0.70710678, 0.        , 0.        , 0.70710678],
       [0.        , 0.        , 1.        , 0.        ]])