# One-Hot-encoding

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder


#example sentence 3 document
sentences = [
    "money gets money",
    "people earn money",
    "money make money"
]


#Reshape the data for OneHotEncoder
sentences = np.array(sentences).reshape(-1,1)
sentences

array([['money gets money'],
       ['people earn money'],
       ['money make money']], dtype='<U17')

In [2]:
#initialize Onehotencoder

#parameters Explanined
#handle_unknown = 'ignore';Ensures that unseen during encoding won't raise an error
#sparse=False:Converts the result to a dense  numpy array (default is a sparse matrix)
#get_feature_names_out(): Return the names of the one-hot-encoded feature 



#Key differentce:
#Dense = all value store 
#sparse = only non - zero value stored


encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

#fit and transform the data 
one_hot_encoded = encoder.fit_transform(sentences)
one_hot_encoded


array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [3]:
#feature names (column names after encoding )
# Correct feature names extraction
features = encoder.get_feature_names_out(['sentences'])

features



array(['sentences_money gets money', 'sentences_money make money',
       'sentences_people earn money'], dtype=object)

# Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

#sample text data (documents)
sentences = [
    "money gets money",
    "people earn money",
    "money make money"
]

sentences


['money gets money', 'people earn money', 'money make money']

In [5]:
#Initialize CountVectorizer with multiple parameters
#stop_words='english'  #Remove common English stop words

#ngram_range=(1,2)  #use unigram and bigrams
#max_df=0.85, #Ignore words that appear in more than 85% of the documents
#min_df=2, #Ignore words that appear in fewer then 2 documents
#max_features = 5 #only keep the top 5 feature 



vectorizer = CountVectorizer(stop_words='english')


#fit teh model and transform the document into word vectors
X = vectorizer.fit_transform(sentences)

X.toarray()

array([[0, 1, 0, 2, 0],
       [1, 0, 0, 1, 1],
       [0, 0, 1, 2, 0]])

In [6]:
#get feature names (vocabulary )
print("\nVocabulary with index")
print(vectorizer.vocabulary_)


Vocabulary with index
{'money': 3, 'gets': 1, 'people': 4, 'earn': 0, 'make': 2}


In [7]:
print("\nFeature Names(Vocabulary):  ")
print(vectorizer.get_feature_names_out())


Feature Names(Vocabulary):  
['earn' 'gets' 'make' 'money' 'people']


# Bi-Gram

In [8]:
import pandas as pd 
import numpy as np


df = pd.DataFrame({
    'text':['money get money',
           'People earn money'],
    'label':[0,1]
})

In [9]:
df

Unnamed: 0,text,label
0,money get money,0
1,People earn money,1


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))
X = cv.fit_transform(df['text'])
X

<2x4 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [11]:
cv.vocabulary_

{'money get': 2, 'get money': 1, 'people earn': 3, 'earn money': 0}

In [12]:
X.toarray()

array([[0, 1, 1, 0],
       [1, 0, 0, 1]])

# Tri Gram

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(3,3))
X = cv.fit_transform(df['text'])
X.toarray()

array([[1, 0],
       [0, 1]])

In [14]:
cv.vocabulary_

{'money get money': 0, 'people earn money': 1}

# TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

df = pd.DataFrame({
    'text':['people are here','people are not here','people are lovely'],
    'label':[0,1,1]
})

In [16]:
df

Unnamed: 0,text,label
0,people are here,0
1,people are not here,1
2,people are lovely,1


In [17]:
tfidf = TfidfVectorizer()
tfidf.fit_transform(df['text']).toarray()

array([[0.52284231, 0.67325467, 0.        , 0.        , 0.52284231],
       [0.39148397, 0.50410689, 0.        , 0.66283998, 0.39148397],
       [0.45329466, 0.        , 0.76749457, 0.        , 0.45329466]])

In [18]:
#show the IDF values (optional)
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.         1.28768207 1.69314718 1.69314718 1.        ]
['are' 'here' 'lovely' 'not' 'people']


# countvectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()

X = count_vectorizer.fit_transform(df['text']).toarray()
X

array([[1, 1, 0, 0, 1],
       [1, 1, 0, 1, 1],
       [1, 0, 1, 0, 1]])

In [20]:
X.shape

(3, 5)

In [21]:
count_vectorizer.get_feature_names_out()

array(['are', 'here', 'lovely', 'not', 'people'], dtype=object)

# Hashing Vectorizer / hashing trick

In [22]:
from sklearn.feature_extraction.text import HashingVectorizer

# Sample sentences
sentences = [
    "I love cats",
    "You love dogs",
    "We hate cats"]

# Instantiate HashingVectorizer with correct parameters
hash_vectorizer = HashingVectorizer(
    n_features=10,           # Fixed-size feature space
    alternate_sign=False,    # Only positive values
    dtype='float32',         # Save memory
    norm='l2',               # ✅ Corrected normalization ('l2')
    binary=False             # Use frequency counts instead of binary
)


# binary=True → word আছে কি নেই (1/0)

# binary=False → word কয়বার আছে সেটার উপর ভিত্তি করে weight হবে



# Transform the documents into feature vectors
X = hash_vectorizer.transform(sentences)

# Show as array
print(X.toarray())


[[0.70710677 0.70710677 0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.57735026 0.         0.57735026 0.         0.
  0.         0.         0.57735026 0.        ]
 [0.57735026 0.57735026 0.         0.         0.         0.
  0.         0.57735026 0.         0.        ]]


In [23]:
x_array = X.toarray()

#display the hashed matrix
import pandas as pd
hashed_df = pd.DataFrame(x_array)
hashed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0
2,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0


# word2Vec

In [24]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/catpc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/catpc/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [25]:
#step1: preprocessing the sentence
sentence = "The cat sat on the mat alst night. Dog was barking. We love elephant"
#tokenize the sentence into words
tokens = word_tokenize(sentence.lower())

tokens

['the',
 'cat',
 'sat',
 'on',
 'the',
 'mat',
 'alst',
 'night',
 '.',
 'dog',
 'was',
 'barking',
 '.',
 'we',
 'love',
 'elephant']

In [28]:
#step 2: prepare data for word2vec
#word2vec expects a list of tokenized sentence in a list to make it a list of sentences
data = [tokens] #wrapping the tokenized sentence in a list to make it a list of sentences

data

[['the',
  'cat',
  'sat',
  'on',
  'the',
  'mat',
  'alst',
  'night',
  '.',
  'dog',
  'was',
  'barking',
  '.',
  'we',
  'love',
  'elephant']]

In [29]:
model = Word2Vec(

    sentences=data,
    vector_size=100, #size of the word vectors
    window=3, #context window size
    min_count=1, #minimum frequency of words to be include
    sg=0, #use skip-gram model 1:skip-gram 0:cbow
    workers=4 ,#Number of threads for training
    epochs=10 #number of training epochs	১০ বার পুরো ডেটার উপর ট্রেইন চালানো হবে
    
)

In [30]:
#step 4: Analyze the trained word2vec model
#get the vector for a specific word

word_vector = model.wv['cat'] #get the vector representation for 'cat'
word_vector

array([-0.00950068,  0.00956214, -0.00777185, -0.00264673, -0.00490651,
       -0.00496661, -0.00802442, -0.00778391, -0.00455399, -0.00127607,
       -0.00510379,  0.00613985, -0.00951582, -0.00530847,  0.00943814,
        0.00699138,  0.00767634,  0.00423415,  0.00050704, -0.00598122,
        0.00601702,  0.00263518,  0.00769886,  0.00639328,  0.00794293,
        0.0086571 , -0.00989551, -0.00675694,  0.00133846,  0.00644132,
        0.00737569,  0.00551627,  0.00766028, -0.00512456,  0.00658289,
       -0.00410673, -0.00905557,  0.00914286,  0.00133203, -0.0027597 ,
       -0.00247625, -0.00422078,  0.00481319,  0.00439984, -0.00265365,
       -0.00734112, -0.00356601, -0.00033689,  0.00609497, -0.00283757,
       -0.00012009,  0.00087842, -0.00709647,  0.00206604, -0.00143448,
        0.00280224,  0.00484309, -0.00135222, -0.00278072,  0.00773736,
        0.00504629,  0.00671411,  0.00451808,  0.00866735,  0.00747494,
       -0.00108131,  0.00874718,  0.00460043,  0.0054402 , -0.00

In [33]:
#find most similar words to a given word
similar_words = model.wv.most_similar('cat',topn=5)
similar_words

[('elephant', 0.25284403562545776),
 ('.', 0.13719037175178528),
 ('love', 0.0441255047917366),
 ('dog', 0.012817534618079662),
 ('barking', 0.006613576784729958)]

In [37]:
model.wv.doesnt_match(['cat','dog','elephant'])

'dog'

In [38]:
#save the model 
model.save("word2vec_model.model")

#load the model
loaded_model = Word2Vec.load("word2vec_model.model")
loaded_model

<gensim.models.word2vec.Word2Vec at 0x707d3f77a950>