In [None]:
import os
# Resolve the parent directory only once per kernel session. A bare
# os.chdir("..") is relative to the *current* directory, so re-running this
# cell would climb one more level on every execution and break the relative
# paths/imports used by later cells.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)

In [None]:
# Point Django at the project's settings module; this is assigned before
# django.setup() is called below, which reads it during initialization.
os.environ['DJANGO_SETTINGS_MODULE'] = 'my_project.settings'
import django
# Initialize Django so the model import in the next cell
# (job_matcher.models) can be used from this notebook.
django.setup()

In [None]:
from job_matcher.models import Document 
from asgiref.sync import sync_to_async
import pandas as pd
@sync_to_async
def fetch_documents():
    """Return all ``Document`` rows as a pandas DataFrame.

    The function is wrapped with ``sync_to_async`` so that callers obtain
    the result with ``await``.

    Returns:
        pandas.DataFrame: one row per ``Document``, built from the dicts
        produced by ``QuerySet.values()``.
    """
    # list() makes the queryset run the query now, before pandas sees it.
    rows = list(Document.objects.all().values())
    return pd.DataFrame(rows)

# Now, inside an async cell or function, use await to get the results
documents = await fetch_documents()
documents["id"] = documents["id"].apply(str)
def add_url(data):
    """Wrap ``data`` in an HTML anchor that uses it as both href and label.

    Args:
        data (str): text used for the link target and the visible text.

    Returns:
        str: ``<a href='DATA'>DATA</a>``.

    Raises:
        TypeError: if ``data`` is not a string.
    """
    pieces = ("<a href='", data, "'>", data, "</a>")
    return "".join(pieces)

# Replace each (string) id with an HTML anchor built by add_url, then preview.
linked_ids = documents['id'].apply(add_url)
documents['id'] = linked_ids
documents.head()

In [None]:
### Smart part

import pickle
import nltk

# Load the pickled jobs table. The context manager guarantees the file handle
# is closed as soon as loading finishes (the bare open() call left it open).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open("job_matcher/static/data/puestos.pickle", "rb") as jobs_file:
    jobs = pickle.load(jobs_file)
jobs.head()

In [None]:
import re
from unidecode import unidecode

# Sample Spanish string with accents and punctuation. It is not referenced
# again in the visible cells; clean_text's parameter of the same name shadows it.
texto = "atención, ¿cómo estás?"
def clean_text(texto):
    """Normalize a string to lowercase ASCII letters, digits and spaces.

    Args:
        texto (str): raw text.

    Returns:
        str: ``texto`` transliterated with ``unidecode``, stripped of every
        character other than A-Z, a-z, 0-9 and space, then lowercased.
    """
    # Removed characters are deleted outright, not replaced by a space.
    stripped = re.sub(r'[^A-Za-z0-9 ]', '', unidecode(texto))
    return stripped.lower()

# Overwrite the job-title column with its normalized form.
cleaned_titles = jobs["PUESTO"].apply(clean_text)
jobs["PUESTO"] = cleaned_titles



In [None]:
### Tokenize

from nltk.tokenize import word_tokenize
# Split every cleaned title into a list of word tokens and preview the result.
title_tokens = jobs["PUESTO"].apply(word_tokenize)
jobs["tokens"] = title_tokens
jobs.head()

In [None]:
### Stopwords

from nltk.corpus import stopwords
# The titles were already passed through clean_text (accents stripped,
# lowercased), so the stop-word list has to go through the same normalization.
# Otherwise accented entries such as "también" or "más" can never match a
# cleaned token and those words survive the filter.
stop_words = {clean_text(word) for word in stopwords.words('spanish')}
jobs["tokens"] = jobs["tokens"].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
jobs.head()

In [None]:
# Show the token list stored under index label 2.
jobs.loc[2, "tokens"]

In [None]:
### Lemmatization

import spacy

# Loaded spaCy pipelines, keyed by model name, so each model is read from
# disk at most once per definition of this cell.
_SPACY_MODELS = {}


def _get_spacy_model(name="es_core_news_sm"):
    """Return the spaCy pipeline called ``name``, loading it on first use only."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def lemmatize(text):
    """Return the lemma of the first token spaCy finds in ``text``.

    The pipeline is loaded once and reused; previously ``spacy.load`` ran on
    every call, i.e. once per token when applied over the whole table.

    Args:
        text (str): a word (or text; only its first token is used).

    Returns:
        str: ``lemma_`` of the first token, or ``text`` unchanged when the
        pipeline produces no tokens (e.g. an empty string).
    """
    doc = _get_spacy_model()(text)
    if len(doc) == 0:
        # Nothing to lemmatize; avoid IndexError on doc[0].
        return text
    return doc[0].lemma_

# Lemmatize each token list element-wise; lemmatize() is called once per token.
jobs["lemmas"] = jobs["tokens"].apply(lambda tokens: list(map(lemmatize, tokens)))

In [None]:
# Draw a small subset to iterate on. A fixed random_state makes the subset
# identical on every run, and capping n avoids the ValueError pandas raises
# when asked for more rows than the frame contains.
sample = jobs.sample(n=min(20, len(jobs)), random_state=42)
sample["lemmas"] = sample["tokens"].apply(
    lambda tokens: [lemmatize(token) for token in tokens]
)

In [None]:
# Rebuild one space-separated string per row from its list of lemmas.
sample["final_text"] = sample["lemmas"].apply(" ".join)


In [None]:
### Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with default settings and fit it on the sampled,
# lemmatized titles. The transformed matrix is only displayed as the cell's
# output; it is not stored in a variable.
vect = TfidfVectorizer()
vect.fit_transform(sample["final_text"])