<a href="https://colab.research.google.com/github/am88tech/gen-ai-ml/blob/main/notebook/assignment/Word2Vec_Assignment_Solution-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [None]:
# prompt: load data from google drive https://drive.google.com/file/d/1vZ4S0dtiUk5LqeeccqWs9IAAG8qH1GWv/view?usp=sharing

!gdown --id 1vZ4S0dtiUk5LqeeccqWs9IAAG8qH1GWv
!unzip News_Category_Dataset.zip

In [None]:
# Step 1: Load the Dataset
# The file is read as newline-delimited JSON (lines=True).
# The path is relative because the download cell unzips the archive into the
# current working directory; this keeps the notebook runnable outside Colab too.
data = pd.read_json('News_Category_Dataset_v3.json', lines=True)

# We'll use the 'headline' as the text data and 'category' as the label.
# Rows missing either field are dropped.
data = data[['headline', 'category']].dropna()

# No extra cleaning is applied here: the text fed to the tokenizer is the
# headline itself, cast to str.
data['processed_text'] = data['headline'].astype(str)

# Class distribution across all categories, before filtering.
print(data['category'].value_counts())

# Restrict the task to four hand-picked categories (a fixed list, not computed
# from the counts printed above).
top_categories = ['POLITICS', 'ENTERTAINMENT', 'BUSINESS', 'SPORTS']
data = data[data['category'].isin(top_categories)]
print(data["headline"].head())

# Class distribution of the rows that will actually be used for training.
print(data['category'].value_counts())

In [None]:
## Step 3: Prepare the Data for TensorFlow

# Initialize the Tokenizer to convert text into sequences of integers.
tokenizer = Tokenizer()

# Fit the tokenizer on the 'processed_text' column to learn the vocabulary.
tokenizer.fit_on_texts(data['processed_text'])

# Convert the text in 'processed_text' column to sequences of integers.
sequences = tokenizer.texts_to_sequences(data['processed_text'])

# Fixed sequence length fed to the model. Longer sequences are truncated and
# shorter ones are padded.
max_length = 100  # Number of tokens per headline after padding/truncation

# Pad or truncate the sequences so that they all have the same length of max_length.
X = pad_sequences(sequences, maxlen=max_length)

# Convert the 'category' column to integer class ids using LabelEncoder.
# label_encoder.classes_ keeps the id -> category-name mapping for later cells.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['category'])

# Split the data into training and testing sets.
# stratify=y keeps the category proportions the same in both splits, so the
# per-class metrics computed on the test split are not skewed by sampling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)


In [None]:
## Step 4: Configure the model

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable from run to run
# (same seed value as the random_state used for the train/test split).
set_random_seed(42)

# Vocabulary size is the number of distinct words the tokenizer learned plus one:
# the tokenizer never assigns index 0 to a word, and that index is the padding value.
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
print(vocab_size)

# Initialize a Sequential model.
model = Sequential()

# Add an Embedding layer. This layer will learn word embeddings for the input sequences.
# - input_dim: Size of the vocabulary.
# - output_dim: Dimension of the dense embedding.
model.add(Embedding(input_dim=vocab_size, output_dim=32))

# Add a GlobalAveragePooling1D layer. It averages the embeddings over the sequence
# axis, producing one fixed-size vector per input.
# NOTE(review): the Embedding is not masked, so padding positions take part in the
# average. mask_zero=True would exclude them, but only if no input is entirely
# padding - confirm before changing.
model.add(GlobalAveragePooling1D())

# Add a Dense layer with 64 units and ReLU activation. This layer acts as a hidden layer in the neural network.
model.add(Dense(64, activation='relu'))

# Add a Dense output layer with a number of units equal to the number of unique labels.
# The softmax activation function is used to output a probability distribution over the classes.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Build the model for batches of max_length-long sequences before summarising it.
# Without an input shape the layers have no weights yet, so summary() cannot report
# output shapes or parameter counts.
model.build(input_shape=(None, max_length))
model.summary()

In [None]:
## Step 5: Train the Model

# Compile: choose the optimizer, the loss and the metric reported while training.
# - 'adam' optimizer.
# - 'sparse_categorical_crossentropy' because the labels are integer class ids
#   rather than one-hot vectors.
# - 'accuracy' is tracked on both the training and the validation data.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 5 passes over the training data, 128 samples per gradient update.
# The held-out test split doubles as validation data here, so the per-epoch
# validation numbers are computed on the same rows as the final evaluation.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_test, y_test),
)

# Persist the trained weights to disk ...
model.save_weights('complaints_model.weights.h5')

# ... and read them straight back, demonstrating how the model can be restored
# later for evaluation or inference.
model.load_weights('complaints_model.weights.h5')

In [None]:
## Step 6: Evaluate the Model

# Predict the classes for the test data.
# model.predict returns one probability per class for every sample; np.argmax picks
# the index of the most probable class.
y_pred = np.argmax(model.predict(X_test), axis=1)

# Integer ids of every class known to the label encoder, in the same order as
# label_encoder.classes_. Passing them explicitly keeps the matrix rows/columns and
# the report rows aligned with the class names even if some class does not occur
# in y_test or y_pred.
class_ids = np.arange(len(label_encoder.classes_))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print(cm)

# Calculate the accuracy score by comparing the true labels with the predicted labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the classification report, which includes precision, recall, f1-score, and support for each class.
print("Classification Report:")
report = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
)
print(report)

In [None]:
# Making a prediction on new narrations

# Two unseen news snippets to classify: a baseball game recap and a political
# fundraising story.
news_new = [
    """
    LOS ANGELES -- With the bases loaded and two outs against one of baseball’s
    nastiest relievers, MJ Melendez fought off pitch after pitch … after pitch after pitch … to
    keep the at-bat alive in hopes of coming through in the Royals’
    best scoring opportunity on Saturday night.
    """,
    """
    Biden campaign rakes in $28 million for star-studded Los Angeles fundraiser
    The massive haul was announced just hours before President Joe Biden appeared
    alongside former President Barack Obama, George Clooney and others.
    """
]

# Encode the snippets with the tokenizer that was fitted on the training headlines,
# so word ids match the ones the model was trained with.
new_sequences = tokenizer.texts_to_sequences(news_new)

# Bring every sequence to the same max_length used for the training data.
new_X = pad_sequences(new_sequences, maxlen=max_length)

# One row of class probabilities per snippet.
new_predictions = model.predict(new_X)

# Index of the most probable class for each snippet.
pred_class = new_predictions.argmax(axis=1)

# Show the raw class ids.
print(pred_class)

# Map the class ids back to category names with the fitted label encoder.
pred_labels = label_encoder.inverse_transform(pred_class)

# Show the predicted category names.
print(pred_labels)