<a href="https://colab.research.google.com/github/ariadna-guzman/E3-Deteccion-de-plagio-TC3002B.301/blob/main/Evidencia_2_Modelo_Mejorado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evidencia 2: Modelo Mejorado

- Ariadna Jocelyn Guzmán Jiménez A01749373
- Jorge Chávez Badillo A01749448
- Amy Murakami Tsutsumi A01750185


## Instalación de Librerías

In [23]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
!pip -q install sentencepiece

In [25]:
!pip -q install langdetect

In [26]:
!pip install Keras-Preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Importación de Librerías

In [27]:
# Kit de herramientas de lenguaje natural
import nltk 
from nltk.stem import PorterStemmer
from nltk.util import ngrams
import torch
# Para el cálculo de la distancia entre párrafos
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Para la manipulación y lectura de archivos
import os
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
# Para la visualización, análisis y manipulación de los datos 
import pandas as pd
from tqdm import tqdm 
# Para la creación de vectores y matrices
import numpy as np
from keras_preprocessing.sequence import pad_sequences
# Implementación de BERT
from transformers import BertTokenizer, AutoModelForSequenceClassification
# Para la traducción de textos
from transformers import MarianMTModel, MarianTokenizer
from langdetect import detect, DetectorFactory

## Importación de Textos

In [28]:
# Mount Google Drive inside the Colab runtime so the dataset folders under
# /content/drive can be read like local directories.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# Drive folder holding the genuine (original) documents; note the trailing '/'.
genuinos = '/content/drive/Shareddrives/Desarrollo de Aplicaciones Avanzadas de Ciencias Computacionales/Reto/Evidencia 2/2023-Datasets/Etapa1/documentos-genuinos/'
# Drive folder holding the suspicious documents; note the trailing '/'.
# NOTE(review): 'docmentos-sospechosos' looks like a typo of 'documentos-sospechosos' —
# confirm against the real folder name on Drive before changing the string.
sospechosos = '/content/drive/Shareddrives/Desarrollo de Aplicaciones Avanzadas de Ciencias Computacionales/Reto/Evidencia 2/2023-Datasets/Etapa1/docmentos-sospechosos/'

### Creación de Data Frame

In [30]:
def create_frame(path):
  """Read every regular file of a directory into a DataFrame.

  Files are visited in sorted name order. Trailing whitespace (including the
  final newline) is stripped from each file's text.

  Args:
    path: Directory containing the text files. A trailing path separator is
      optional.

  Returns:
    A pandas DataFrame with one row per file and two columns: 'name' (the
    file name) and 'content' (the file text).

  Raises:
    OSError: If the directory cannot be listed or a file cannot be opened.
    UnicodeDecodeError: If a file is not valid UTF-8.
  """
  names = []
  content = []
  for file_name in sorted(os.listdir(path)):
    # os.path.join works whether or not `path` ends with a separator.
    file_path = os.path.join(path, file_name)
    # Skip sub-directories (e.g. '.ipynb_checkpoints'); open() would fail on them.
    if not os.path.isfile(file_path):
      continue
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
      content.append(file.read().rstrip())
    names.append(file_name)
  return pd.DataFrame({'name': names, 'content': content})

In [31]:
# Load every genuine document into a DataFrame ('name', 'content') and
# preview its first five rows.
df_genuinos = create_frame(genuinos)
df_genuinos.head(5)

Unnamed: 0,name,content
0,org-300.txt,Automatic software plagiarism detection tools ...
1,org-301.txt,A casual comment by a student alerted the auth...
2,org-302.txt,Paraphrase types have been proposed by researc...
3,org-303.txt,This paper addresses the issue of text matchin...
4,org-304.txt,This work presents a Sentence Hashing Algorith...


## Implementación de BERT
***(Bidirectional Encoder Representations from Transformers)***

In [32]:
# Pretrained checkpoint shared by the tokenizer and the model.
model_name = 'bert-base-uncased'

# Lower-casing tokenizer matching the uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case = True)

# output_hidden_states=True makes the model return the per-layer hidden states
# that vectorize_text reads its embedding from; attention weights are not needed.
# NOTE(review): the loading warning below reports that some weights of the
# sequence-classification model are not initialized from this checkpoint, so the
# returned logits should not be relied on — only the hidden states are used here.
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           output_attentions = False,
                                                           output_hidden_states = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### Vectorización

#### Por texto

In [46]:
def vectorize_text(tokenizer, model, abstract, MAX_LEN = 510):
  """Embed a text as the first-token vector of the model's last hidden layer.

  Args:
    tokenizer: Tokenizer exposing `encode(text, add_special_tokens=...,
      max_length=..., truncation=...)` and returning a list of token ids.
    model: Model that, called with `return_dict=False`, returns a
      `(logits, hidden_states)` pair, where `hidden_states` is a sequence of
      per-layer tensors indexed as [batch][token]. It must therefore have
      been created with hidden-state output enabled.
    abstract: Text to embed.
    MAX_LEN: Fixed length (in token ids, special tokens included) the input is
      truncated or right-padded to.

  Returns:
    A 1-D numpy array: the last layer's hidden state for the first token of
    the (single) sequence.
  """
  # Request truncation explicitly; passing only max_length relies on a
  # deprecated implicit fallback and emits a warning on every call.
  token_ids = tokenizer.encode(abstract,
                               add_special_tokens = True,
                               max_length = MAX_LEN,
                               truncation = True)

  # Cut on the right and right-pad with 0 up to exactly MAX_LEN ids.
  token_ids = list(token_ids)[:MAX_LEN]
  token_ids = token_ids + [0] * (MAX_LEN - len(token_ids))

  # Attention mask: 1 for real tokens, 0 for padding.
  # Assumes the padding id is 0 (as in the bert-base-uncased vocabulary).
  attention = [int(token_id > 0) for token_id in token_ids]

  # Tensors with an extra leading batch dimension (batch size 1).
  inputs = torch.tensor(token_ids, dtype = torch.long).unsqueeze(0)
  mask = torch.tensor(attention, dtype = torch.long).unsqueeze(0)

  # Inference mode: no dropout, no gradient tracking.
  model.eval()
  with torch.no_grad():
    _, encoded_layers = model(input_ids = inputs,
                              token_type_ids = None,
                              attention_mask = mask,
                              return_dict = False)

  # Last hidden layer, single batch entry, first token. Indexing with -1
  # (rather than a hard-coded 12) keeps "last layer" true for any model depth.
  vector = encoded_layers[-1][0][0]

  return vector.detach().cpu().numpy()

#### Base de Datos

In [47]:
def vectorize_database(data):
  """Add a 'vectors' column holding one embedding per row of `data`.

  Every value of `data.content` is embedded with `vectorize_text`, using the
  module-level `tokenizer` and `model`. Each embedding is stored as a
  single-row array of shape (1, n).

  Args:
    data: DataFrame with a `content` column of texts. It is modified in place.

  Returns:
    The same DataFrame object, now carrying the 'vectors' column.
  """
  # One embedding per document, with a progress bar over the whole corpus.
  embeddings = [vectorize_text(tokenizer, model, text)
                for text in tqdm(data.content.values)]

  data['vectors'] = embeddings
  # Normalise each entry to a numpy array and reshape it into a (1, n) row.
  data['vectors'] = data['vectors'].apply(lambda emb: np.array(emb).reshape(1, -1))
  return data

### Creación de Vector para los Archivos Genuinos

In [48]:
# Embed every genuine document. vectorize_database adds the 'vectors' column to
# df_genuinos in place and returns that same DataFrame, so vector_genuinos and
# df_genuinos refer to one object.
vector_genuinos = vectorize_database(df_genuinos)

100%|██████████| 120/120 [04:00<00:00,  2.00s/it]


## Funciones para el Análisis de Similitud

### Lemmatizer

In [None]:
# Download the WordNet corpora and import the lemmatizer used by lemm_parrafo.
# NOTE(review): nltk is already imported in the main import cell; 'omw-1.4' is
# presumably only required by some NLTK versions — confirm before removing it.
import nltk
nltk.download("wordnet")
nltk.download("omw-1.4")
from nltk.stem import WordNetLemmatizer

In [None]:
# Apply lemmatization to the words of a paragraph
def lemm_parrafo(parrafo, lemmatizer=None):
  """Lemmatize every word of a paragraph and return them joined by spaces.

  Args:
    parrafo: Either the paragraph as a string (split on whitespace, like
      stemm_parrafo does) or an iterable of already-tokenized words.
    lemmatizer: Optional object exposing `lemmatize(word)`. Defaults to a new
      WordNetLemmatizer; pass one in to reuse it across many paragraphs.

  Returns:
    A string with the lemmatized words separated by single spaces.
  """
  if lemmatizer is None:
    lemmatizer = WordNetLemmatizer()

  # Iterating a str yields characters, so a string paragraph must be split
  # into words first; token sequences are used as given.
  palabras = parrafo.split() if isinstance(parrafo, str) else parrafo

  return ' '.join(lemmatizer.lemmatize(palabra) for palabra in palabras)

### Stemming

In [None]:
# Apply Porter stemming to the words of a paragraph
def stemm_parrafo(parrafo):
  """Stem every whitespace-separated word of a paragraph.

  Args:
    parrafo: Paragraph text as a string.

  Returns:
    A string with the stemmed words separated by single spaces.
  """
  stemmer = PorterStemmer()
  raices = [stemmer.stem(palabra) for palabra in parrafo.split()]
  return ' '.join(raices)

## Conclusiones