In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- Model explainability
- Feature importance
- Data bias and model bias
- Model risk

In [2]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import joblib
import pandas as pd
import numpy as np

In [3]:
import joblib
import tensorflow as tf

# Load Word2Vec data
X_train_w2v = joblib.load('/content/drive/MyDrive/aap/X_train_w2v.joblib')
X_val_w2v = joblib.load('/content/drive/MyDrive/aap/X_val_w2v.joblib')
X_test_w2v = joblib.load('/content/drive/MyDrive/aap/X_test_w2v.joblib')


y_train_w2v = joblib.load('/content/drive/MyDrive/aap/y_train_w2v.joblib')
y_val_w2v = joblib.load('/content/drive/MyDrive/aap/y_val_w2v.joblib')


y_test = joblib.load('/content/drive/MyDrive/aap/y_test.joblib')


In [4]:
%%time
xgb_clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=400,
        max_depth=8,
        objective='multi:softmax',
        use_label_encoder=False,
        eval_metric='logloss',
        random_state = 42
    )

    # Train the model


xgb_clf.fit(X_train_w2v, y_train_w2v)

# Make predictions on validation and test sets
y_val_pred = xgb_clf.predict(X_val_w2v)
y_pred_w2v = xgb_clf.predict(X_test_w2v)

# Calculate validation accuracy
val_accuracy = accuracy_score(y_val_w2v, y_val_pred)
# Print the validation accuracy
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

accuracy_cv = accuracy_score(y_test, y_pred_w2v)
report_cv = classification_report(y_test, y_pred_w2v)

print(f"Test Accuracy for word2vec: {accuracy_cv * 100:.2f}%")

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 76.66%
Test Accuracy for word2vec: 90.20%
CPU times: user 7min 24s, sys: 1.51 s, total: 7min 26s
Wall time: 58.8 s


In [6]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
train_path = '/content/drive/My Drive/aap/train_clean.csv'
test_path = '/content/drive/My Drive/aap/test_clean.csv'
train_data = pd.read_csv(train_path,sep = ",", header=0)
test_data = pd.read_csv(test_path, sep = ",", header=0)

In [None]:
train = train_data["preprocessed_text"]
train_target = train_data["type"]

test = test_data["preprocessed_text"]
test_target = test_data["type"]

# Create a new DataFrame for the train data with preprocessed text and target
train_combined = pd.DataFrame({
    'preprocessed_text': train_data["preprocessed_text"],
    'target': train_data["type"]
})

# Create a new DataFrame for the test data with preprocessed text and target
test_combined = pd.DataFrame({
    'preprocessed_text': test_data["preprocessed_text"],
    'target': test_data["type"]
})

In [None]:
# deal with missing data and show the missing data rate

def remove_missing_data_and_calculate_rate(df):
    # Step 1: Calculate the missing data rate
    missing_data_rate = (df.isnull().sum() / len(df)) * 100

    # Step 2: Drop rows with missing data
    df_cleaned = df.dropna()

    return df_cleaned, missing_data_rate

train_cleaned, missing_data_rate = remove_missing_data_and_calculate_rate(train_combined)


In [None]:
# Function to train Word2Vec and transform data into embeddings

def get_sentence_vector(sentence, model, vector_size):
    sentence_vector = np.zeros(vector_size)  # Initialize an empty vector
    count = 0
    for word in sentence.split():
        if word in model.wv:
            sentence_vector += model.wv[word]
            count += 1
    if count != 0:
        sentence_vector /= count  # Average the vectors
    return sentence_vector

def word2vec(train, test=None):
    # Step 1: Train Word2Vec model on training data
    word2vec_model = Word2Vec(sentences=[i.split() for i in train], vector_size=100, window=5, min_count=1, sg=0)

    # Step 2: Function to generate sentence vectors by averaging word vectors

    # Step 3: Convert train dataset to sentence vectors
    train_vectors = np.array([get_sentence_vector(sentence, word2vec_model, 100) for sentence in train])

    # If test data is provided, apply the same transformation
    if test is not None:
        test_vectors = np.array([get_sentence_vector(sentence, word2vec_model, 100) for sentence in test])
        return train_vectors, test_vectors, word2vec_model
    else:
        return train_vectors, word2vec_model


In [None]:
train_word2vec, test_word2vec, word2vec_model = word2vec(train, test)

# save the model

In [5]:
## Save the trained model
model_path = '/content/drive/MyDrive/aap/xgb_w2v_model.joblib'
joblib.dump(xgb_clf, model_path)
print(f"Model saved at {model_path}")


Model saved at /content/drive/MyDrive/aap/xgb_w2v_model.joblib


# save the embedding model

In [None]:
# Save the trained Word2Vec model
word2vec_model.save('/content/drive/MyDrive/aap/word2vec_model.model')


# for future use

In [None]:
# New test string or list of strings
new_test_data = ["This is a new test sentence", "Another example sentence for testing"]

# Use the get_sentence_vector function to convert the new data to vectors
def transform_new_data(new_data, model, vector_size):
    """
    Transform new text data into Word2Vec sentence vectors using the trained model.
    Args:
        new_data (list of str): List of sentences to be transformed.
        model (Word2Vec): Trained Word2Vec model.
        vector_size (int): Dimension of the Word2Vec embeddings.
    Returns:
        numpy.ndarray: Transformed sentence vectors.
    """
    return np.array([get_sentence_vector(sentence, model, vector_size) for sentence in new_data])

# Convert new test data to vectors


In [None]:
from gensim.models import Word2Vec

# Load the saved Word2Vec model
loaded_model = Word2Vec.load('word2vec_model.model')  # or 'word2vec_model_binary.model' for binary
model_path = '/content/drive/MyDrive/aap/xgb_w2v_model.joblib'

# Load the trained XGBoost model
xgb_clf = joblib.load(model_path)

# Now you can use the model to make predictions or further evaluate it


In [None]:
new_test_data = ["......anything you like"]

In [None]:
new_test_vectors = transform_new_data(new_test_data, loaded_model, 100)

In [None]:
y_pred_w2v = xgb_clf.predict(new_test_vectors)