<a href="https://colab.research.google.com/github/am88tech/gen-ai-ml/blob/main/notebook/Word2Vec_v12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import tensorflow.keras as keras
import gensim
import gzip
import logging




# Bag of Words

## Sample Data

In [2]:
# Toy corpus for the bag-of-words demo: thirteen short sentences.
corpus = [
    'king is a strong man',
    'queen is a wise woman',
    'boy is a young man',
    'girl is a young woman',
    'prince is a young',
    'prince will be strong',
    'princess is young',
    'man is strong',
    'woman is pretty',
    'prince is a boy',
    'prince will be king',
    'princess is a girl',
    'princess will be queen',
]
print(corpus)

['king is a strong man', 'queen is a wise woman', 'boy is a young man', 'girl is a young woman', 'prince is a young', 'prince will be strong', 'princess is young', 'man is strong', 'woman is pretty', 'prince is a boy', 'prince will be king', 'princess is a girl', 'princess will be queen']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix (DTM) for `corpus`: the vectorizer's count
# matrix is made dense and labelled with the vocabulary words as column names.
cv = CountVectorizer()
DTM = pd.DataFrame(
    cv.fit_transform(corpus).toarray(),
    columns=cv.get_feature_names_out(),
)
DTM

## DTM on Review Data

In [None]:
# Ten short movie reviews, each paired with its sentiment label.
labelled_reviews = [
    ('I loved this movie!', 'positive'),
    ('It was okay.', 'neutral'),
    ('I hated it.', 'negative'),
    ('It was amazing!', 'positive'),
    ('I was disappointed.', 'negative'),
    ('It was a great experience.', 'positive'),
    ('I fell asleep during the movie.', 'negative'),
    ('It was a total waste of time.', 'negative'),
    ('I highly recommend this movie.', 'positive'),
    ('I would not recommend this movie.', 'negative'),
]

# Split the pairs back into the column-oriented dict the DataFrame is built from.
data = {
    'review': [text for text, _ in labelled_reviews],
    'sentiment': [label for _, label in labelled_reviews],
}
df = pd.DataFrame(data)
df

In [None]:
# Turn the review text into a document-term matrix (one column per vocabulary word)
cv = CountVectorizer()
dtm = pd.DataFrame(
    cv.fit_transform(df['review']).toarray(),
    columns=cv.get_feature_names_out(),
)

# Attach the sentiment label as the target column
dtm["y_value"] = df["sentiment"]

# Display the DTM
dtm

# Word Embeddings

In [6]:
# Two-word statements: a subject ("Trees" or "computers") followed by one trait.
tree_traits = ["tall", "green", "majestic", "essential", "diverse", "oxygen-giving"]
computer_traits = ["fast", "smart", "useful", "powerful", "everywhere", "changing"]

statements = (
    [f"Trees {trait}" for trait in tree_traits]
    + [f"computers {trait}" for trait in computer_traits]
)

In [None]:
from gensim.parsing.preprocessing import STOPWORDS

# Split every statement on whitespace into a list of tokens
statements_list = [statement.split() for statement in statements]
print(statements_list)

# Keep only the tokens that are not in gensim's STOPWORDS set
documents = [
    [token for token in tokens if token not in STOPWORDS]
    for tokens in statements_list
]

In [8]:
from gensim.models import Word2Vec

# Train a tiny Word2Vec model on the tokenized statements.
#   vector_size: size of each word vector (the hidden layer)
#   min_count:   discard words that appear fewer than this many times
#   window:      context window size
#   seed/workers: a fixed seed together with a single worker thread keeps the
#                 learned vectors repeatable between runs (multi-threaded
#                 training introduces ordering jitter)
model = Word2Vec(
    documents,
    min_count=1,
    vector_size=3,
    window=3,
    seed=1,
    workers=1,
)

## Hyperparameters

### vector_size
The size of the hidden layer, i.e. the length of the dense vector used to represent each token or word (this parameter was called `size` before gensim 4.0). If you have very limited data, use a much smaller value. If you have lots of data, it's good to experiment with various sizes.

### window
Context window size: the maximum distance between the target word and a neighboring word. Neighbors that lie farther than this distance to the left or right of the target word are not treated as related to it. In theory, a smaller window should give you terms that are more closely related. If you have lots of data, the window size should not matter too much, as long as it's a decently sized window.

### min_count
Minimum frequency count of words. The model ignores words that do not satisfy the min_count. Extremely infrequent words are usually unimportant, so it's best to get rid of those. Unless your dataset is really tiny, this does not really affect the model.

## Checking the word2vec output

In [None]:
# Print every vocabulary word next to its learned vector
word_vectors = model.wv
for position, token in enumerate(word_vectors.index_to_key):
    print(token, word_vectors.vectors[position])

In [None]:
import matplotlib.pyplot as plt

# Visualize the word vectors in 3D space. The first three components of each
# vector are plotted directly; no dimensionality reduction is applied.
vectors = model.wv.vectors
names = model.wv.index_to_key

fig = plt.figure(figsize=(15, 10))
# Create the 3D axes exactly once; a second plt.axes(...) call would stack
# another axes on top of the first.
ax = plt.axes(projection='3d')

xdata = vectors[:, 0]
ydata = vectors[:, 1]
zdata = vectors[:, 2]

# Colour each point by its first component
ax.scatter3D(xdata, ydata, zdata, s=200, c=xdata)

# Label each point with its word; the loop variable is deliberately not called
# `names`, so the list being iterated is not overwritten.
for name, x, y, z in zip(names, xdata, ydata, zdata):
    ax.text(x, y, z, name)

ax.set_title("Word2Vec word vectors")
ax.set_xlabel("component 0")
ax.set_ylabel("component 1")
ax.set_zlabel("component 2")
plt.show()


# Word2Vec Example-2

In [11]:
import urllib.request

# Local file name under which the downloaded review data is stored
data_file = "Review_Data.csv"
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/am88tech/gen-ai-ml/refs/heads/main/data/Amazon_Yelp_Reviews/Review_Data.csv",
    data_file,
)

In [None]:
def read_input(input_file):
    """Yield one list of words per line of ``input_file``.

    The file is opened in binary mode and each raw line is passed through
    ``gensim.utils.simple_preprocess``, which does some pre-processing and
    returns a list of words for that line. Once the last line has been
    consumed, a completion message is printed.

    Args:
        input_file: Path of the file to read.

    Yields:
        The list of words produced for each line, in file order.
    """
    with open(input_file, 'rb') as f:
        for line in f:
            yield gensim.utils.simple_preprocess(line)
    print("File reading done !!")


# Read the tokenized reviews into a list: each line becomes a list of words,
# so `documents` is a list of lists.
documents = list(read_input(data_file))

In [13]:
# Show the corpus size and a short preview rather than dumping every tokenized line
print(len(documents), "documents")
print(documents[:5])



In [14]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized review lines.
#   vector_size: size of each word vector (the hidden layer)
#   min_count:   discard words that appear fewer than this many times
#   window:      context window size (not passed here, so gensim's default applies)
#   seed/workers: a fixed seed together with a single worker thread keeps the
#                 learned vectors repeatable between runs (multi-threaded
#                 training introduces ordering jitter)
model = Word2Vec(
    documents,
    min_count=1,
    vector_size=10,
    seed=1,
    workers=1,
)

In [None]:
# Print the vectors for a few words of interest, in vocabulary order
words_of_interest = ["good", "bad", "money"]

for token, embedding in zip(model.wv.index_to_key, model.wv.vectors):
    if token not in words_of_interest:
        continue
    print(token, embedding)

# Word2Vec Credit Reporting Case Study


Detailed Code explanation on [GitHub](https://github.com/venkatareddykonasani/Codes/blob/main/Word2Vec_Document_Classification.md)

In [None]:
import pandas as pd
import numpy as np
import requests
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

## Step 1: Load the Dataset

In [None]:
!wget https://raw.githubusercontent.com/am88tech/gen-ai-ml/refs/heads/main/data/Bank_Customer_Complaints/complaints_v2.zip
!unzip -o complaints_v2.zip
complaints_data = pd.read_csv("/content/complaints_v2.csv")
complaints_data.head()

## Step 2: Pre-processing

In [None]:
# Work on a reproducible 50% sample so the model builds quickly
sampled_complaints = complaints_data.sample(frac=0.5, random_state=42)
print("Shape", sampled_complaints.shape)
print(sampled_complaints['product'].value_counts())

# Add a text column with every value cast to str
data = sampled_complaints.assign(
    processed_text=sampled_complaints['text'].astype(str)
)

## Step 3: Prepare the Data for TensorFlow

In [None]:
# --- Labels: encode each product name as an integer class id ---
y = data['product']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- Features: fit the tokenizer on the text, then map each text to a sequence of word ids ---
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_text'])
sequences = tokenizer.texts_to_sequences(data['processed_text'])

max_length = 100  # Maximum length of a complaint narrative
X = pad_sequences(sequences, maxlen=max_length)

# --- Hold out 30% of the rows for testing (fixed seed for a repeatable split) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

## Step 4: Configure the model

In [None]:
# Vocabulary size: number of words the tokenizer learned, plus one
vocab_size = len(tokenizer.word_index) + 1

# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(max_length,)),
    # Learn a 32-dimensional vector for every word id
    Embedding(input_dim=vocab_size, output_dim=32),
    # Average the word vectors of a complaint into one fixed-size vector
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    # One softmax output per product class
    Dense(len(label_encoder.classes_), activation='softmax'),
])

## Step 5: Train the Model

In [None]:
# Targets are integer class ids, hence the sparse categorical cross-entropy loss
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs, reporting metrics on the held-out split after each epoch
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=5,
)

In [None]:
# Keras 3 only accepts weight files whose name ends in ".weights.h5";
# this name is also valid for older Keras versions.
weights_path = 'complaints_model.weights.h5'

# Save the model weights
model.save_weights(weights_path)

# Load the saved weights back into the model
model.load_weights(weights_path)

## Step 6: Evaluate the Model

In [None]:
# Predicted class id = position of the largest model output in each row
y_pred = model.predict(X_test).argmax(axis=1)

# Confusion Matrix
cm = tf.math.confusion_matrix(y_test, y_pred)
print(cm)

# Overall accuracy on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class metrics, labelled with the original product names
print("Classification Report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)

In [None]:
# Making a prediction on a new narration

new_complaints = [
    "payment history missing credit report made mistake put account forbearance without authorization ",
]

# Apply the same tokenizer and padding that were used for the training data
new_sequences = tokenizer.texts_to_sequences(new_complaints)
new_X = pad_sequences(new_sequences, maxlen=max_length)

new_predictions = model.predict(new_X)
pred_class = np.argmax(new_predictions, axis=1)
print(pred_class)

# Decode the class ids with the fitted encoder instead of a hand-written
# legend: the ids are 0-based and follow the encoder's own class ordering,
# so a fixed "1/2/3" legend can point at the wrong product.
print(label_encoder.inverse_transform(pred_class))