# Versiunea GPU

# Environment Sanity Check #

Click the _Runtime_ dropdown at the top of the page, then _Change Runtime Type_ and confirm the instance type is _GPU_.

Check the output of `!nvidia-smi` to make sure you've been allocated a Tesla T4, P4, or P100.

In [None]:
!nvidia-smi

In [None]:
# This gets the RAPIDS-Colab install files and checks your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
import cudf
cudf.__version__

In [None]:
import cuml
cuml.__version__

In [None]:
import cugraph
cugraph.__version__

# Mount Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/Master/AI
!ls

# Import Libraries

In [None]:
!pip install transformers
!pip install gpustat==1.0.0

In [None]:
import time
import psutil
import cProfile
from gpustat import GPUStatCollection
import cudf
import cuml
from cuml.metrics import confusion_matrix
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, f1_score, precision_score, recall_score
import gensim.downloader as api
import itertools
from transformers import BertTokenizer, TFBertModel
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Choose dataset and embedding method

In [None]:
#@title Dataset

dataset_name = 'Hateval2019' #@param ["Hateval2019", "Davidson"] {allow-input: true}

print(dataset_name)

In [None]:
#@title Embedding Method

embedding_method_name = 'BERT' #@param ["ELMo", "FastText", "Word2Vec", "GloVe", "BERT"] {allow-input: true}

print(embedding_method_name)

# Define Function to Print GPU Usage

In [None]:
# Define your GPU usage function
def print_gpu_usage():
    """
    Prints one line per GPU with its index, utilization and used memory.

    A fresh gpustat query is taken on every call. The memory figure read
    from gpustat is multiplied by 1024 and printed with a KB suffix.
    """
    for device in GPUStatCollection.new_query().gpus:
        used_kb = device.memory_used * 1024
        print(f'GPU {device.index} - utilization: {device.utilization}% - memory used: {used_kb}KB')

# Load Datasets

In [None]:
def load_hateval2019_dataset(file_path: str) -> cudf.DataFrame:
    """
    Reads the Hateval2019 CSV at ``file_path`` into a cuDF DataFrame.

    The columns are named ``id``, ``text``, ``HS``, ``TR`` and ``AG``; the
    file is read with ``header=0`` together with these explicit names.

    Args:
        file_path (str): The path to the CSV file containing the Hateval2019 dataset.

    Returns:
        cudf.DataFrame: The loaded data with the five columns listed above.
    """
    return cudf.read_csv(
        file_path,
        names=["id", "text", "HS", "TR", "AG"],
        header=0,
    )

In [None]:
# def load_davidson_dataset(file_path: str) -> cudf.DataFrame:
#     """
#     Loads the Davidson dataset from a CSV file located at the specified file path
#     and returns a cuDF DataFrame containing the loaded data.

#     Args:
#         file_path (str): The path to the CSV file containing the Davidson dataset.

#     Returns:
#         cudf.DataFrame: A cuDF DataFrame containing the loaded data.
#     """
#     column_names = ["count", "hate_speech", "offensive_language", "neither", "class", "tweet"]
#     df = cudf.read_csv(file_path, names=column_names, header=0)
#     return df

In [None]:
# Load the Hateval2019 dataset and report GPU stats, host-memory delta and wall time.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before the load

print("GPU usage before loading Hateeval-2019 dataset:")
print_gpu_usage()

hateval2019_dataset = load_hateval2019_dataset("/content/drive/MyDrive/Master/AI/DATASETS/HATEVAL/hateval2019_en.csv")

print("GPU usage after loading Hateeval-2019 dataset:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used loading Hateeval-2019 dataset: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the GPU/memory reporting above, so the printed
# time includes the cost of those calls as well as the load itself.
end = time.time()
print(f"Execution time of loading Hateeval-2019 dataset: {end-start:.2f} seconds")

In [None]:
# # Load Davidson dataset and print execution time
# start = time.time()
# memory_before = psutil.virtual_memory().used

# print("GPU usage before loading Davidson dataset:")
# print_gpu_usage()

# davidson_dataset = load_davidson_dataset("/content/drive/MyDrive/Master/AI/DATASETS/TWITTER/labeled_data.csv")

# print("GPU usage after loading Davidson dataset:")
# print_gpu_usage()
# print()

# memory_after = psutil.virtual_memory().used
# print(f"Memory used loading Davidson dataset: {(memory_after - memory_before)/1024:.2f} KB\n")

# end = time.time()
# print(f"Execution time of loading Davidson dataset: {end-start:.2f} seconds")

In [None]:
# Pick the text column and the label column for the dataset chosen in the form cell.
if dataset_name == 'Hateval2019':
  data = hateval2019_dataset['text']
  labels = hateval2019_dataset['HS']
# elif dataset_name == 'Davidson':
#   data = davidson_dataset['tweet']
#   labels = davidson_dataset['hate_speech']
else:
  # Fail loudly: otherwise a re-run with another name would silently keep the
  # previous `data`/`labels` (or hit a NameError further down the notebook).
  raise ValueError(f"Unsupported dataset: {dataset_name!r} (only 'Hateval2019' is currently enabled)")

# Preprocess Data

In [None]:
def preprocess_text(text):
    """
    Cleans a single text and returns it as one space-joined string.

    Steps, in order:
    - strip HTML-like tags, then URLs starting with ``http``
    - tokenize with NLTK's ``word_tokenize``
    - lowercase every token
    - drop English stopwords
    - lemmatize the remaining tokens with ``WordNetLemmatizer``

    Args:
        text (str): The text data to preprocess

    Returns:
        str: The preprocessed tokens joined by single spaces
    """
    # Tags are removed before URLs, matching the order of the two patterns.
    without_tags = re.sub('<[^<]+?>', '', text)
    without_urls = re.sub(r'http\S+', '', without_tags)

    raw_tokens = word_tokenize(without_urls)

    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    kept = []
    for raw_token in raw_tokens:
        lowered = raw_token.lower()
        if lowered in english_stopwords:
            continue
        kept.append(lemmatizer.lemmatize(lowered))

    return " ".join(kept)

In [None]:
# data = data.apply(preprocess_text)

# Load Embeddings

In [None]:
def load_glove_model(file_path):
    """
    Loads GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components. Blank lines (e.g. trailing newlines at the end of the file)
    are skipped instead of raising ``IndexError``.

    Args:
        file_path (str): The path to the file containing the GloVe model.

    Returns:
        dict: Maps each word to a ``float32`` numpy array holding its vector.
        If a word appears more than once, the last occurrence wins.
    """
    glove_model = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line: nothing to parse.
                continue
            glove_model[values[0]] = np.asarray(values[1:], dtype='float32')
    return glove_model

In [None]:
def elmo_vectors(text, batch_size=8):
    """
    Embeds ``text`` with the module-level ``elmo`` layer, batch by batch.

    Args:
        text: A sliceable sequence of strings whose slices expose
        ``.tolist()`` (e.g. a numpy array or pandas Series).
        batch_size (int, optional): Number of texts sent to the layer per
        call. Defaults to 8.

    Returns:
        numpy.ndarray: The per-batch outputs of the layer, concatenated
        along the first axis in input order.
    """
    total = len(text)
    # Ceiling division: one extra batch when the last one is only partly full.
    batch_count = -(-total // batch_size)

    chunks = []
    for batch_index in range(batch_count):
        lo = batch_index * batch_size
        hi = min(lo + batch_size, total)
        sentences = text[lo:hi].tolist()
        batch_output = elmo(tf.convert_to_tensor(sentences), training=False)
        chunks.append(batch_output.numpy())

    return np.concatenate(chunks, axis=0)

In [None]:
def fasttext_word2vec_vectors(text, model):
    """
    Builds one vector per sentence by averaging the vectors of its known words.

    Sentences are split on whitespace. Words that are not in ``model`` are
    ignored; a sentence with no known word gets an all-zero vector of
    length ``model.vector_size``.

    Args:
        text: An iterable of strings to embed.
        model: A word-vector lookup supporting ``word in model``,
        ``model[word]`` and a ``vector_size`` attribute (used here for both
        the FastText and the Word2Vec vectors).

    Returns:
        numpy.ndarray: One row per input sentence, in input order.
    """
    fallback = np.zeros(model.vector_size)

    def _sentence_vector(sentence):
        # Average over in-vocabulary words only; fall back to zeros if none.
        known = [model[token] for token in sentence.split() if token in model]
        return np.mean(known if known else [fallback], axis=0)

    return np.array([_sentence_vector(sentence) for sentence in text])

In [None]:
def glove_vectors(text, model=None):
    """
    Builds one vector per sentence by averaging the GloVe vectors of its known words.

    Sentences are split on whitespace. Words missing from the vector table
    are ignored; a sentence with no known word gets an all-zero vector.

    Args:
        text: An iterable of strings to embed.
        model (dict, optional): Maps words to numpy vectors. When omitted,
        the module-level ``glove_model`` is used, so existing single-argument
        calls behave as before.

    Returns:
        numpy.ndarray: One row per input sentence, in input order.

    Raises:
        ValueError: If the vector table is empty, because the vector size
        cannot be determined.
    """
    if model is None:
        model = glove_model
    if not model:
        raise ValueError("GloVe model is empty; cannot determine the vector size")

    # The vector size is taken from the first entry in the table.
    vector_size = len(next(iter(model.values())))
    zero_vector = np.zeros(vector_size)

    embeddings = []
    for sentence in text:
        sentence_embeddings = [model[word] for word in sentence.split() if word in model]

        if not sentence_embeddings:
            sentence_embeddings = [zero_vector]

        embeddings.append(np.mean(sentence_embeddings, axis=0))

    return np.array(embeddings)

In [None]:
def bert_vectors(text, batch_size=32):
    """
    Computes BERT sentence embeddings with the ``bert-base-uncased`` model.

    The tokenizer and model are loaded on every call. Texts are processed in
    batches; for each text, the last-layer hidden state at position 0 is
    used as its embedding.

    Args:
        text: A sliceable sequence of strings: a plain list, a numpy array
        or a pandas Series.
        batch_size (int, optional): Number of texts per model call.
        Defaults to 32.

    Returns:
        numpy.ndarray: A 2-dimensional array with one row per input text,
        in input order.
    """
    # Load the BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = TFBertModel.from_pretrained("bert-base-uncased")

    total = len(text)
    # Ceiling division: one extra batch when the last one is only partly full.
    num_batches = -(-total // batch_size)
    embeddings = []

    for i in range(num_batches):
        start_index = i * batch_size
        end_index = min(start_index + batch_size, total)
        batch_text = text[start_index:end_index]

        # Series/ndarray slices expose .tolist(); plain lists (as the docstring
        # allows) do not, so fall back to list() for them.
        batch_list = batch_text.tolist() if hasattr(batch_text, "tolist") else list(batch_text)

        # Pad to the longest text in the batch and truncate at 512 tokens.
        inputs = tokenizer(batch_list, return_tensors="tf", padding=True, truncation=True, max_length=512)

        # Keep only the hidden state at position 0 of each sequence.
        outputs = model(inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(batch_embeddings.numpy())

    return np.concatenate(embeddings, axis=0)

In [None]:
# Load the resources for the chosen embedding method and bind `embeddings_func`
# to a one-argument callable that maps texts to an embedding matrix.
if embedding_method_name == "ELMo":
  elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", trainable=False)
  embeddings_func = elmo_vectors
elif embedding_method_name == "FastText":
  fasttext_model = api.load("fasttext-wiki-news-subwords-300")
  embeddings_func = lambda text: fasttext_word2vec_vectors(text, fasttext_model)
elif embedding_method_name == "Word2Vec":
  word2vec_model = api.load("word2vec-google-news-300")
  embeddings_func = lambda text: fasttext_word2vec_vectors(text, word2vec_model)
elif embedding_method_name == "GloVe":
  glove_model = load_glove_model('/content/drive/MyDrive/Master/AI/glove.6B.50d.txt')
  embeddings_func = glove_vectors
elif embedding_method_name == "BERT":
  # NOTE(review): bert_vectors loads its own model via transformers; this TF-Hub
  # layer looks unused in the visible notebook — confirm before removing it.
  bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3", trainable=False)
  embeddings_func = bert_vectors
else:
  # The form allows free input; fail loudly rather than silently reusing an
  # `embeddings_func` left over from a previous run (or hitting a NameError).
  raise ValueError(f"Unsupported embedding method: {embedding_method_name!r}")

In [None]:
# The embedding functions work on host data, so the cuDF text column is
# converted to pandas before being passed to the selected function.
embeddings = embeddings_func(data.to_pandas())
print(f"{embedding_method_name} embeddings shape:", embeddings.shape)

# Split Data

In [None]:
# Put the embedding matrix on the GPU as a cuDF DataFrame; `labels` comes from the cuDF dataset.
X = cudf.DataFrame.from_pandas(pd.DataFrame(embeddings))
y = labels

# scikit-learn's train_test_split is used below, so both are copied back to host arrays.
# NOTE(review): X_np round-trips `embeddings` host -> device -> host; presumably it
# equals `embeddings` itself — confirm before simplifying.
X_np = X.to_pandas().values
y_np = y.to_pandas().values

# 80/20 split with a fixed seed, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=42, stratify=y_np)

# Convert the train and test arrays back to cuDF DataFrames
X_train = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
X_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
y_train = cudf.Series(y_train.flatten())
y_test = cudf.Series(y_test.flatten())

# Machine Learning Algorithms

## Gaussian Naive Bayes

In [None]:
# Benchmark: fit Gaussian Naive Bayes on the training split and report
# GPU stats, host-memory delta and wall time.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before the fit

print("GPU usage before running Gaussian Naive Bayes:")
print_gpu_usage()

gnb = cuml.naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)

print(f"GPU usage after running Gaussian Naive Bayes:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Gaussian Naive Bayes: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time of Gaussian Naive Bayes: {end-start:.2f} seconds")

In [None]:
# Benchmark: score the fitted Gaussian Naive Bayes model on the test split;
# the result is kept in `gnb_accuracy` for the summary cells.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before scoring

print("GPU usage before running Gaussian Naive Bayes:")
print_gpu_usage()

gnb_accuracy = gnb.score(X_test, y_test)
# NOTE: here `end` is taken right after scoring, before the reporting below.
end = time.time()

print("GPU usage after running Gaussian Naive Bayes:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Gaussian Naive Bayes: {(memory_after - memory_before)/1024:.2f} KB\n")

print(f"Execution time of Gaussian Naive Bayes: {end-start:.2f} seconds")

## Bernoulli Naive Bayes

In [None]:
# Benchmark: fit Bernoulli Naive Bayes on the training split and report
# GPU stats, host-memory delta and wall time.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before the fit

print("GPU usage before running Bernoulli Naive Bayes:")
print_gpu_usage()

bnb = cuml.naive_bayes.BernoulliNB()
bnb.fit(X_train, y_train)

print("GPU usage after running Bernoulli Naive Bayes:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Bernoulli Naive Bayes: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time of Bernoulli Naive Bayes: {end-start:.2f} seconds")

In [None]:
# Benchmark: score the fitted Bernoulli Naive Bayes model on the test split;
# the result is kept in `bnb_accuracy` for the summary cells.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before scoring

print("GPU usage before running Bernoulli Naive Bayes:")
print_gpu_usage()

# Score the Bernoulli model itself (this previously scored `gnb`, so the
# Bernoulli figure merely duplicated the Gaussian Naive Bayes result).
bnb_accuracy = bnb.score(X_test, y_test)

print("GPU usage after running Bernoulli Naive Bayes:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Bernoulli Naive Bayes: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time of Bernoulli Naive Bayes: {end-start:.2f} seconds")

## Logistic Regression

In [None]:
# Benchmark: fit Logistic Regression on the training split and report
# GPU stats, host-memory delta and wall time.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before the fit

print("GPU usage before running Logistic Regression:")
print_gpu_usage()

lr = cuml.LogisticRegression()
lr.fit(X_train, y_train)

print("GPU usage after running Logistic Regression:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Logistic Regression: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time of Logistic Regression: {end-start:.2f} seconds")

In [None]:
# Benchmark: score the fitted Logistic Regression model on the test split;
# the result is kept in `lr_accuracy` for the summary cells.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before scoring

print("GPU usage before running Logistic Regression:")
print_gpu_usage()

lr_accuracy = lr.score(X_test, y_test)

print("GPU usage after running Logistic Regression:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Logistic Regression: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time: {end-start:.2f} seconds")

## Support Vector Machines

In [None]:
# Benchmark: fit a support vector classifier (default settings) on the
# training split and report GPU stats, host-memory delta and wall time.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before the fit

print("GPU usage before running SVM:")
print_gpu_usage()

svm = cuml.SVC()
svm.fit(X_train, y_train)

print("GPU usage after running SVM:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for SVM: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time: {end-start:.2f} seconds")

In [None]:
# Benchmark: score the fitted SVM on the test split; the result is kept in
# `svm_accuracy` for the summary cells.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before scoring

print("GPU usage before running SVM:")
print_gpu_usage()

svm_accuracy = svm.score(X_test, y_test)

print("GPU usage after running SVM:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for SVM: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time: {end-start:.2f} seconds")

## Random Forests

In [None]:
# Benchmark: fit a Random Forest (100 trees, max depth 5) on the training
# split and report GPU stats, host-memory delta and wall time.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before the fit

print("GPU usage before running Random Forest:")
print_gpu_usage()

rf = cuml.ensemble.RandomForestClassifier(n_estimators=100, max_depth=5)
rf.fit(X_train, y_train)

print("GPU usage after running Random Forest:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Random Forest: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time of Random Forest: {end-start:.2f} seconds")

In [None]:
# Benchmark: score the fitted Random Forest on the test split; the result is
# kept in `rf_accuracy` for the summary cells.
start = time.time()
memory_before = psutil.virtual_memory().used  # host memory reading taken before scoring

print("GPU usage before running Random Forest:")
print_gpu_usage()

rf_accuracy = rf.score(X_test, y_test)

print("GPU usage after running Random Forest:")
print_gpu_usage()
print()

# Host memory delta, divided by 1024 and reported as KB.
memory_after = psutil.virtual_memory().used
print(f"Memory used for Random Forest: {(memory_after - memory_before)/1024:.2f} KB\n")

# NOTE: `end` is taken after the reporting above, so the printed time includes it.
end = time.time()
print(f"Execution time of Random Forest: {end-start:.2f} seconds")

# Evaluation Metrics

## Train Accuracy

In [None]:
# Report each fitted model's score on the training split, scaled to a percentage.
print(f"Gaussian Naive Bayes Train Accuracy: {gnb.score(X_train, y_train) * 100:.2f}")
print(f"Bernoulli Naive Bayes Train Accuracy: {bnb.score(X_train, y_train) * 100:.2f}")
print(f"Logistic Regression Train Accuracy: {lr.score(X_train, y_train) * 100:.2f}")
print(f"SVM Train Accuracy: {svm.score(X_train, y_train) * 100:.2f}")
# Label fixed: it previously read "Random Train Forest Accuracy".
print(f"Random Forest Train Accuracy: {rf.score(X_train, y_train) * 100:.2f}")

## Test Accuracy

In [None]:
# Report the test-split scores stored by the scoring cells, scaled to a percentage.
print(f"Gaussian Naive Bayes Test Accuracy: {gnb_accuracy * 100:.2f}")
print(f"Bernoulli Naive Bayes Test Accuracy: {bnb_accuracy * 100:.2f}")
print(f"Logistic Regression Test Accuracy: {lr_accuracy * 100:.2f}")
print(f"SVM Test Accuracy: {svm_accuracy * 100:.2f}")
# Label fixed: it previously read "Random Test Forest Accuracy".
print(f"Random Forest Test Accuracy: {rf_accuracy * 100:.2f}")