# **Clasificación de texto con XLNet para detectar phishing**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel
from tensorflow.keras.optimizers import Adam


In [None]:
pip install transformers

In [None]:
# Load the phishing dataset from CSV into a DataFrame
dataset_path = 'C:/Users/Bryan/codigo/Transformer_Phishing_Tesis/Data/phishing_dataset.csv'
df = pd.read_csv(dataset_path)
# Leave the frame as the cell's last expression so the notebook displays it
df

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model
xlnet_model = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

def encode_reviews(tokenizer, reviews, max_length=120):
    """Tokenize ``reviews`` into fixed-length TensorFlow tensors.

    Args:
        tokenizer: Callable tokenizer invoked as ``tokenizer(reviews, **options)``.
        reviews: The texts to encode, passed through to the tokenizer unchanged.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for the given options.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    encoded = tokenizer(reviews, **tokenizer_options)
    return encoded

# Tokenize the 'contexts' column of each split with the default max_length
train_texts = train['contexts'].tolist()
test_texts = test['contexts'].tolist()
train_encodings = encode_reviews(tokenizer, train_texts)
test_encodings = encode_reviews(tokenizer, test_texts)


In [None]:
def create_model_xlnet(xlnet_model, max_length=120):
    """Build and compile a binary classifier on top of a pretrained XLNet encoder.

    Args:
        xlnet_model: Checkpoint name or path handed to
            ``TFXLNetModel.from_pretrained``.
        max_length: Fixed length of the token-id sequences fed to the model.
            It must equal the length the inputs were padded/truncated to
            (120 by default, as before).

    Returns:
        A compiled ``tf.keras.Model`` with a single int32 input named
        ``word_inputs`` and a single sigmoid output named ``outputs``,
        trained with binary cross-entropy and Adam (learning rate 2e-5).
    """
    word_inputs = tf.keras.Input(shape=(max_length,), dtype='int32', name='word_inputs')
    xlnet = TFXLNetModel.from_pretrained(xlnet_model)

    # The inputs are padded to a fixed length, so build an attention mask from
    # the token ids; without it the encoder attends to padding as if it were
    # real text. The mask is derived here so callers keep feeding ids only.
    pad_token_id = xlnet.config.pad_token_id
    if pad_token_id is None:
        attention_mask = None
    else:
        attention_mask = tf.cast(tf.not_equal(word_inputs, pad_token_id), tf.int32)

    xlnet_encodings = xlnet(word_inputs, attention_mask=attention_mask)[0]

    # Use the hidden state of the last position as the document representation.
    # NOTE(review): presumably this is the <cls> slot, since XLNet's tokenizer
    # places it at the end of the sequence and pads on the left — confirm.
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    doc_encoding = tf.keras.layers.Dropout(0.1)(doc_encoding)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(doc_encoding)

    model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    model.compile(optimizer=Adam(learning_rate=2e-5), loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# Pair the token ids (keyed by the model's input name) with the 'phishing'
# labels and serve them in batches of 32.
train_features = {'word_inputs': train_encodings['input_ids']}
train_labels = train['phishing'].values
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels)).batch(32)

test_features = {'word_inputs': test_encodings['input_ids']}
test_labels = test['phishing'].values
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels)).batch(32)


In [None]:
# Build the classifier and train it for 5 epochs, scoring on the test
# dataset after every epoch.
model = create_model_xlnet(xlnet_model)
history = model.fit(
    train_dataset,
    epochs=5,
    validation_data=test_dataset,
)


In [None]:
# Plot the per-epoch training and validation accuracy recorded by model.fit
import matplotlib.pyplot as plt

accuracy_train = history.history['accuracy']
accuracy_val = history.history['val_accuracy']
# Epochs are numbered from 1 on the x axis
epochs = range(1, len(accuracy_train) + 1)

plt.figure(figsize=(10, 6))
for series, legend_text in (
    (accuracy_train, 'Training Accuracy'),
    (accuracy_val, 'Validation Accuracy'),
):
    plt.plot(epochs, series, label=legend_text)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('XLnet Model Accuracy')
plt.legend()
plt.show()

In [None]:
# Evaluate the trained model on the test dataset; as the cell's last
# expression, the returned metrics are displayed by the notebook.
model.evaluate(test_dataset)


In [None]:
# Run the model over the test dataset and keep its outputs (sigmoid
# probabilities) for the metric cells below.
test_predictions = model.predict(test_dataset)


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Ground-truth labels taken from the held-out split
y_verdadero = test['phishing'].values

# ROC operating points and the area under the resulting curve (AUC)
fpr, tpr, umbrales = roc_curve(y_verdadero, test_predictions)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the diagonal reference line
etiqueta_roc = 'Curva ROC (AUC = %0.2f)' % roc_auc
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=etiqueta_roc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.title('Curva ROC')
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()


In [None]:
# Print the unrounded AUC value computed for the ROC curve.
print(roc_auc)

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Collect the true labels by walking the batched test dataset, so they come
# out in the same order the predictions were produced in.
label_batches = [labels for _, labels in test_dataset]
y_true = np.concatenate(label_batches, axis=0)

# Round the predicted probabilities to hard 0/1 labels
y_pred = np.round(test_predictions).flatten().astype(int)

# Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(y_true, y_pred)

print("Matriz de Confusión:")
print(conf_matrix)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# True labels come straight from the held-out split
true_labels = test['phishing'].values
# A probability strictly above 0.5 counts as the positive class
predicted_labels = (test_predictions > 0.5).astype(int).flatten()

precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


In [None]:
# Hard-coded 2x2 confusion matrices, one per row of the list.
# NOTE(review): where these counts come from and their cell layout are not
# shown in this notebook — confirm against the layout assumed by
# calcular_tpr_fpr.
matrices = [
    [[164, 80], [47, 1662]],
    [[13, 915], [583, 6297]],
    [[130, 104], [55, 1664]],
    [[186, 58], [72, 1637]]
]

# Compute the true-positive and false-positive rates of a confusion matrix
def calcular_tpr_fpr(matrix):
    """Return ``(tpr, fpr)`` for a 2x2 confusion matrix.

    The matrix is read as ``[[TP, FP], [FN, TN]]``.

    Args:
        matrix: Indexable 2x2 structure of counts (nested lists or an array).

    Returns:
        Tuple ``(tpr, fpr)`` with ``tpr = TP / (TP + FN)`` and
        ``fpr = FP / (FP + TN)``. A rate whose denominator is zero is
        undefined and is returned as NaN instead of raising
        ``ZeroDivisionError``.
    """
    # NOTE(review): sklearn.metrics.confusion_matrix lays a binary matrix out
    # as [[TN, FP], [FN, TP]]. If the inputs were copied from its output,
    # these indices need remapping — confirm the source of the matrices.
    tp = matrix[0][0]
    fp = matrix[0][1]
    fn = matrix[1][0]
    tn = matrix[1][1]

    positives = tp + fn
    negatives = fp + tn

    # Guard the divisions so an empty class yields NaN rather than a crash.
    tpr = tp / positives if positives else float('nan')
    fpr = fp / negatives if negatives else float('nan')

    return tpr, fpr

# Evaluate (TPR, FPR) for every matrix, keeping the original order
resultados = list(map(calcular_tpr_fpr, matrices))

resultados


In [None]:
# Save the trained model to 'XLNet_model.h5' (relative path).
# NOTE(review): the model wraps a TFXLNetModel layer; reloading it from HDF5
# will presumably need that class passed via custom_objects — confirm.
model.save('XLNet_model.h5')