# Tokenizer

In [2]:
import os
import random 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, Flatten, Dropout, Input, GlobalMaxPooling1D, concatenate

In [3]:
df = pd.read_csv('data.csv')

In [4]:
def quit_space(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    return text

In [5]:
from sklearn.model_selection import train_test_split

df['Code1'] = df['Code1'].apply(quit_space)
df['Code2'] = df['Code2'].apply(quit_space)

x_features = df[['Code1', 'Code2']]
y_labels = df['Plagio']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_features, y_labels, test_size=0.2, random_state=42)

# Tokenización y padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['Code1'] + X_train['Code2'])

X_train_code1_sequences = tokenizer.texts_to_sequences(X_train['Code1'])
X_train_code2_sequences = tokenizer.texts_to_sequences(X_train['Code2'])
X_test_code1_sequences = tokenizer.texts_to_sequences(X_test['Code1'])
X_test_code2_sequences = tokenizer.texts_to_sequences(X_test['Code2'])

max_length = max(max(len(seq) for seq in X_train_code1_sequences), max(len(seq) for seq in X_train_code2_sequences))

X_train_code1_padded = pad_sequences(X_train_code1_sequences, maxlen=max_length, padding='post', truncating='post')
X_train_code2_padded = pad_sequences(X_train_code2_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_code1_padded = pad_sequences(X_test_code1_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_code2_padded = pad_sequences(X_test_code2_sequences, maxlen=max_length, padding='post', truncating='post')

print(max_length)


5417


In [6]:
from tensorflow.keras.models import Model

embedding_dim = 100
num_filters = 64
filter_sizes = [3, 4, 5]
dropout_rate = 0.5


input1 = Input(shape=(max_length,))
input2 = Input(shape=(max_length,))

embedding = Embedding(len(tokenizer.word_index) + 1, embedding_dim)

conv_blocks = []
for filter_size in filter_sizes:
    conv = Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu')
    conv_block1 = conv(embedding(input1))
    conv_block2 = conv(embedding(input2))
    pool1 = GlobalMaxPooling1D()(conv_block1)
    pool2 = GlobalMaxPooling1D()(conv_block2)
    conv_blocks.extend([pool1, pool2])


if len(filter_sizes) > 1:
    merged = concatenate(conv_blocks)
else:
    merged = conv_blocks[0]

dropout = Dropout(0.5)(merged)
dense1 = Dense(64, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(dense1)

model = Model(inputs=[input1, input2], outputs=output)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit([X_train_code1_padded, X_train_code2_padded], y_train, epochs=10, batch_size=64, validation_data=([X_test_code1_padded, X_test_code2_padded], y_test))

Epoch 1/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2s/step - acc: 0.7644 - loss: 0.4928 - val_acc: 0.9563 - val_loss: 0.1838
Epoch 2/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2s/step - acc: 0.9056 - loss: 0.3524 - val_acc: 0.9563 - val_loss: 0.2024
Epoch 3/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - acc: 0.9015 - loss: 0.3124 - val_acc: 0.9563 - val_loss: 0.1892
Epoch 4/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2s/step - acc: 0.9120 - loss: 0.2739 - val_acc: 0.9563 - val_loss: 0.1798
Epoch 5/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - acc: 0.9007 - loss: 0.3054 - val_acc: 0.9563 - val_loss: 0.1911
Epoch 6/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - acc: 0.9162 - loss: 0.2549 - val_acc: 0.9563 - val_loss: 0.1787
Epoch 7/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - acc: 0.9053 - 

In [7]:
# Evaluat the model
loss, accuracy = model.evaluate([X_test_code1_padded, X_test_code2_padded], y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 172ms/step - acc: 0.9612 - loss: 0.1696
Loss: 0.17584112286567688
Accuracy: 0.9617486596107483


In [8]:
from sklearn.metrics import confusion_matrix

# Predicciones en los datos de prueba
y_pred = model.predict([X_test_code1_padded, X_test_code2_padded])
y_pred_classes = np.round(y_pred)

# Crear la matriz de confusión
conf_matrix = confusion_matrix(y_test, y_pred_classes)

print("Matriz de Confusión:")
print(conf_matrix)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 217ms/step
Matriz de Confusión:
[[175   0]
 [  7   1]]


In [14]:
import numpy as np
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, accuracy_score

# Convertir las probabilidades en clases predichas (ajusta según tu problema de clasificación)
y_pred_test = (y_pred > 0.5).astype("int32")  # Para problemas binarios

# Calcular F1 y Recall
f1 = f1_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)

# Para obtener un reporte completo
report = classification_report(y_test, y_pred_test)
print("Classification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       175
           1       1.00      0.12      0.22         8

    accuracy                           0.96       183
   macro avg       0.98      0.56      0.60       183
weighted avg       0.96      0.96      0.95       183



In [17]:
# Calcular F1 y Recall para el conjunto de prueba con promedio 'macro'
f1 = f1_score(y_test, y_pred_test, average='macro')
recall = recall_score(y_test, y_pred_test, average='macro')

# Imprimir las métricas
print("Macro F1 Score:", f1)
print("Macro Recall:", recall)


Macro F1 Score: 0.6013071895424836
Macro Recall: 0.5625
