# Preprocess

In [1]:
import pandas as pd
df = pd.read_csv('ofertas.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ISCO,major_job,job,position,location,description
0,0,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Accelerator Physicist id54315,"Villigen PSI, Aargau",[' You have an academic degree in physics or e...
1,1,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Applied Physicist (Computing) (EP-LBC-2021-125...,Geneva,[' Be in charge of the development of applicat...
2,2,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Accelerator Physicist (BE-ABP-LNO-2021-122-LD)...,Geneva,[' Contribute to the maintenance and developme...
3,3,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Medical Devices Physicist,"Newton, Cambridgeshire",[' Agency: Newton Colmore Consulting Reference...
4,4,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Fluidics Physicist,Cambridge,[' Agency: Newton Colmore Consulting Reference...


In [2]:
df['advice'] = "major_job: "+ df['major_job'] + " job: " + df['job'] + "position: " + df['position'] + " description: " + df['description'] + " location: " + df['location']

In [3]:
df = df[['advice']]
df.head()

Unnamed: 0,advice
0,major_job: SCIENCE AND ENGINEERING PROFESSIONA...
1,major_job: SCIENCE AND ENGINEERING PROFESSIONA...
2,major_job: SCIENCE AND ENGINEERING PROFESSIONA...
3,major_job: SCIENCE AND ENGINEERING PROFESSIONA...
4,major_job: SCIENCE AND ENGINEERING PROFESSIONA...


In [4]:
df.to_csv('ofertas_procesadas.csv', index=False)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
import os
import regex
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS
import matplotlib.pyplot as plt
import joblib
import es_core_news_sm
from string import punctuation
import spacy

In [16]:
nlp = spacy.load('en_core_web_sm')


In [17]:
def ConvertirAcentos(texto):
    texto=texto.replace("\xc3\xa1","á")
    texto=texto.replace("\xc3\xa9","é")   
    texto=texto.replace("\xc3\xad","í")
    texto=texto.replace("\xc3\xb3","ó")
    texto=texto.replace("\xc3\xba","ú")
    texto=texto.replace("\xc3\x81","Á")
    texto=texto.replace("\xc3\x89","É")
    texto=texto.replace("\xc3\x8d","Í")
    texto=texto.replace("\xc3\x93","Ó")
    texto=texto.replace("\xc3\x9a","Ú")
    texto=texto.replace("\xc3±","ñ")
    return(texto)

def CrearCorpusDesdeCSV(path_csv):
    df = pd.read_csv(path_csv)
    corpus = []
    doc_id = []

    for idx, row in df.iterrows():
        texto = row['advice'] 
        texto = ConvertirAcentos(texto)
        corpus.append(texto)
        doc_id.append(f"doc_{idx}") 
    
    return corpus, doc_id
     
def PreProcesar(textos):
    texto_limpio = []
    for texto in textos:  
        texto = EliminarStopwords(texto.lower())    
        texto = Lematizar(texto)     
        texto = EliminaNumeroYPuntuacion(texto)      
        if len(texto)!=0:
          texto = regex.sub(' +', ' ', texto)
          texto_limpio.append(texto)
    return(texto_limpio)

def Lematizar(oracion):
   doc = nlp(oracion)
   lemas = [token.lemma_ for token in doc]
   return(Lista_a_Oracion(lemas))  

def Lista_a_Oracion(Lista):
   return(" ".join(Lista))          

def EliminarStopwords(oracion):
    Tokens = Tokenizar(oracion)
    oracion_filtrada =[] 
    for palabra in Tokens:
       if palabra not in STOP_WORDS:
           palabra_limpia = palabra.rstrip()
           if len(palabra_limpia)!=0:
              oracion_filtrada.append(palabra_limpia) 
    return(Lista_a_Oracion(oracion_filtrada))

def Tokenizar(oracion):
    doc = nlp(oracion)
    tokens = [palabra.text for palabra in doc]
    return(tokens)

def EliminaNumeroYPuntuacion(oracion):
    string_numeros = regex.sub(r'[\”\“\¿\°\d+]','', oracion)
    return ''.join(c for c in string_numeros if c not in punctuation)



In [8]:
path_csv = "ofertas_procesadas.csv"  
corpus, doc_id = CrearCorpusDesdeCSV(path_csv)

In [18]:
textos  = PreProcesar(corpus)

In [19]:

def CrearVSM(textos,nombre_modelo,modelo_idf=True,modelo_binario=False):
  modelo  = TfidfVectorizer(use_idf=modelo_idf, 
                            norm=None, binary=modelo_binario)
  matriz_features  = modelo.fit_transform(textos)
  vocabulario      = modelo.vocabulary_
  dtm              = matriz_features.toarray()
  if (modelo_idf):
        idf = modelo.idf_
  else: 
        idf = []   
  GrabarModelo(nombre_modelo,dtm,idf,vocabulario)

def GrabarModelo(NombreModelo,modelo,idf,vocab):
   existe = os.path.isdir(NombreModelo)
   if not existe:
       os.mkdir(NombreModelo)
   joblib.dump(modelo,   NombreModelo +"/"+'tfidf.pkl') 
   joblib.dump(idf,   NombreModelo +"/"+'idf.pkl') 
   joblib.dump(vocab, NombreModelo +"/"+'vocab.pkl') 

def CargarModelo(NombreModelo):
    modelo = joblib.load(NombreModelo+"/"+'tfidf.pkl')
    idf   = joblib.load(NombreModelo+"/"+'idf.pkl')
    vocab  = joblib.load(NombreModelo+"/"+'vocab.pkl')
    return(modelo,idf,vocab)    
def crearQuery(terms,idf,vocabulario):
    query = np.zeros(len(vocabulario))
    listaTerminos = Tokenizar(Lematizar(terms))
    for t in listaTerminos:      
       try:
           indice = vocabulario[t]
           query[indice] = 1
       except KeyError:
           indice = -1
    if (np.count_nonzero(query) != 0):
              query = query * idf
              return(query)
    return([])
def RecuperarDocumentosRelevantes(query, modelo, doc_id):
    RelDocs = []
    for ind_doc in range(len(doc_id)):
        filename = doc_id[ind_doc]  
        similitud = 1 - cosine(query, modelo[ind_doc, :])
        RelDocs.append((similitud, filename))  
    
    RelDocs = sorted(RelDocs, reverse=True)
    
    return RelDocs[:10]
def MostrarDocumentos(Docs):
    print("Lista de documentos relevantes a la query:\n")
    for (sim,d) in Docs:
        print("Doc: "+d+" ("+str(sim)+")\n")

In [20]:
CrearVSM(textos,"Linkedin 2.0")


In [21]:
(tfidf, idf, vocabulario) = CargarModelo("Linkedin 2.0")


In [22]:
print("*********************************************")
print("        Bienvenido al Linkedin 2.0!")
print("*********************************************")

terms = input("Ingrese query: ")
vector_query = crearQuery(terms,idf,vocabulario)

if len(vector_query)==0:
    print("ERROR en vector de consulta, no se pueden recuperar documentos!..")
else:
    DocsRelevantes = RecuperarDocumentosRelevantes(vector_query,tfidf,doc_id)
    MostrarDocumentos(DocsRelevantes)

*********************************************
        Bienvenido al Linkedin 2.0!
*********************************************
Lista de documentos relevantes a la query:

Doc: doc_855 (0.3695868811191495)

Doc: doc_858 (0.2713244252385192)

Doc: doc_791 (0.2691165209284495)

Doc: doc_1494 (0.22826890423891955)

Doc: doc_1503 (0.22554330106802523)

Doc: doc_867 (0.2160271760722533)

Doc: doc_730 (0.2147765993455112)

Doc: doc_775 (0.20622795405800676)

Doc: doc_630 (0.19851277056298533)

Doc: doc_676 (0.1962362537563629)

