In [2]:
import os
import sys
import django
from asgiref.sync import sync_to_async

# Directory that is appended to sys.path so `lab2.settings` and `job_finder`
# can be imported. Set the JOB_FINDER_PROJECT_ROOT environment variable to
# point at another checkout; the original location remains the fallback so
# existing setups keep working unchanged.
PROJECT_ROOT = os.environ.get(
    'JOB_FINDER_PROJECT_ROOT',
    '/Users/adrianalarcon/Library/CloudStorage/GoogleDrive-alarcon.adrianc@gmail.com/My Drive/cibert_sab_dom_14_sep_2025/artificial_intelligence/repos/job_finder/lab2',
)

# Guarded so that re-running this cell does not keep appending duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Set the Django settings module (a value already in the environment wins)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'lab2.settings')

# Setup Django
django.setup()

# Now import Django components
from django.shortcuts import render, redirect
from job_finder.forms import DocumentForm
from job_finder.models import Document
import pandas as pd

print("Django setup complete!")

# Create async wrapper functions for Django ORM operations
@sync_to_async
def get_document_count():
    """Return the number of ``Document`` rows.

    The function is wrapped with ``sync_to_async``, so notebook cells call it
    as ``await get_document_count()``.
    """
    total = Document.objects.count()
    return total

@sync_to_async
def get_all_documents():
    """Return every ``Document`` row as a list built from ``.values()``.

    The function is wrapped with ``sync_to_async``, so notebook cells call it
    as ``await get_all_documents()``.
    """
    rows = Document.objects.all().values()
    # Convert to a plain list before handing the result back to the caller.
    return list(rows)

@sync_to_async
def create_document(document_file):
    """Create a ``Document`` with ``document=document_file`` and return it.

    The function is wrapped with ``sync_to_async``, so notebook cells call it
    as ``await create_document(...)``.
    """
    new_document = Document.objects.create(document=document_file)
    return new_document

print("Async Django functions created. Use await get_document_count() to query the database.")

Django setup complete!
Async Django functions created. Use await get_document_count() to query the database.


In [3]:
# Test the async Django functions.
# Top-level `await` relies on IPython's autoawait support; it is not valid in
# a plain Python script.
try:
    # Row count, fetched through the sync_to_async wrapper.
    count = await get_document_count()
    print(f"Number of documents in database: {count}")
    
    # Get all documents
    documents = await get_all_documents()
    print(f"Documents: {documents}")
    
except Exception as e:
    # Failures are printed rather than re-raised. The migration hint is shown
    # for every exception type, not only for missing-table errors.
    print(f"Error: {e}")
    print("Make sure to run migrations if you haven't: python manage.py migrate")

Number of documents in database: 1
Documents: [{'id': 1, 'document': 'documents/Minimum_qualifications.docx', 'uploaded_at': datetime.datetime(2025, 9, 28, 15, 10, 31, 251174, tzinfo=datetime.timezone.utc)}]


In [4]:
def add_url(data):
    """Wrap ``data`` in an HTML anchor that links to itself.

    The value is used both as the ``href`` target and as the visible link
    text. It is HTML-escaped (quotes included) before being inserted, so a
    value containing ``'``, ``<``, ``>`` or ``&`` cannot close the attribute
    early or inject markup.

    Args:
        data: URL to link to. Non-string values are converted with ``str()``.

    Returns:
        str: Markup of the form ``<a href='...'>...</a>``.
    """
    # Function-scope import keeps the cell self-contained.
    from html import escape

    safe = escape(str(data), quote=True)
    return f"<a href='{safe}'>{safe}</a>"

add_url("1")

"<a href='1'>1</a>"

In [5]:
# Install nest_asyncio if needed
!pip install nest_asyncio


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle as soon as loading finishes.
with open("static/data/puestos.pickle", "rb") as puestos_file:
    data = pickle.load(puestos_file)
data

Unnamed: 0,PUESTO
0,Ejecutivo Ventas CALL CENTER C/EXP PRESENCIAL ...
1,Jefe de Selección del Talento
2,Administrador de Cuenta Comercial - Villa El S...
3,Analista Sr. Comunicación Organizacional
4,Mecanico Aeronautico - Latam Airlines Chile
...,...
12654,Asesor comercial - Campo
12655,Atención al Cliente Sin Experiencia Turno Tard...
12656,ANALISTA DE COSTOS
12657,Asistente Contable - Tributaciones


In [15]:
import re
def limpiar_texto(texto):
    """Lower-case ``texto`` and delete every character outside the allowed set.

    Characters that are kept: ASCII letters, digits, Spanish accented vowels,
    ``ü``, ``ñ``, the space and ``/``. Removed characters are not replaced, so
    neighbouring spaces are preserved as they are.

    Args:
        texto: String to clean.

    Returns:
        str: The cleaned, lower-cased text.
    """
    en_minusculas = texto.lower()
    no_permitidos = re.compile(r'[^a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ /]')
    return no_permitidos.sub('', en_minusculas)

# Add a `puesto_limpio` column: every PUESTO value passed through limpiar_texto.
data["puesto_limpio"] = data["PUESTO"].apply(limpiar_texto)

In [16]:
import nltk

# NLTK resources used in this cell: `punkt` (word_tokenize), `stopwords`
# (Spanish stop-word list) and `wordnet` (needed by WordNetLemmatizer; without
# it the first lemmatize() call raises LookupError on a machine that lacks the
# corpus).
# NOTE(review): recent NLTK releases load `punkt_tab` instead of `punkt` —
# confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Spanish stop words, held in a set for membership checks.
stop_words = set(stopwords.words('spanish'))
# NOTE(review): WordNetLemmatizer is backed by the English WordNet corpus —
# confirm it is the intended lemmatizer for Spanish job titles.
lemmatizer = WordNetLemmatizer()

# Tokenize the cleaned titles.
data["puesto_tokens"] = data["puesto_limpio"].apply(word_tokenize)

# Drop stop words (the column is overwritten with the filtered tokens).
data["puesto_tokens"] = data["puesto_tokens"].apply(
    lambda palabras: [palabra for palabra in palabras if palabra not in stop_words]
)

# Lemmatize every remaining token, treating it as a verb (pos = "v").
data["puesto_lemmas"] = data["puesto_tokens"].apply(
    lambda palabras: [lemmatizer.lemmatize(palabra, pos = "v") for palabra in palabras]
)

# Re-join the lemmas into one space-separated string per row.
data["puesto_final"] = data["puesto_lemmas"].apply(
    lambda palabras: ' '.join(palabras)
)

data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adrianalarcon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianalarcon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,PUESTO,puesto_limpio,puesto_tokens,puesto_lemmas,puesto_final
0,Ejecutivo Ventas CALL CENTER C/EXP PRESENCIAL ...,ejecutivo ventas call center c/exp presencial ...,"[ejecutivo, ventas, call, center, c/exp, prese...","[ejecutivo, ventas, call, center, c/exp, prese...",ejecutivo ventas call center c/exp presencial ...
1,Jefe de Selección del Talento,jefe de selección del talento,"[jefe, selección, talento]","[jefe, selección, talento]",jefe selección talento
2,Administrador de Cuenta Comercial - Villa El S...,administrador de cuenta comercial villa el sa...,"[administrador, cuenta, comercial, villa, salv...","[administrador, cuenta, comercial, villa, salv...",administrador cuenta comercial villa salvador
3,Analista Sr. Comunicación Organizacional,analista sr comunicación organizacional,"[analista, sr, comunicación, organizacional]","[analista, sr, comunicación, organizacional]",analista sr comunicación organizacional
4,Mecanico Aeronautico - Latam Airlines Chile,mecanico aeronautico latam airlines chile,"[mecanico, aeronautico, latam, airlines, chile]","[mecanico, aeronautico, latam, airlines, chile]",mecanico aeronautico latam airlines chile


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (default settings) on the processed titles and
# transform them in the same call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["puesto_final"])
# Display the resulting matrix summary.
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 52894 stored elements and shape (12659, 4881)>

In [19]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed once the dump finishes, instead of leaving that to
# garbage collection.
with open("static/data/vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [20]:
from preprocessing import preprocess_text

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adrianalarcon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianalarcon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Reload the raw postings and run them through the imported preprocess_text.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle as soon as loading finishes.
with open("static/data/puestos.pickle", "rb") as puestos_file:
    data = pickle.load(puestos_file)
# NOTE(review): the three returned values look like (vectorized text,
# vectorizer, processed frame) — confirm against preprocessing.preprocess_text.
job_vect, vect, job_processed = preprocess_text(data, "PUESTO")
job_vect

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 52369 stored elements and shape (12659, 4881)>

In [32]:
import docx
def word_to_text(filename):
    """Read a Word document and return its paragraph texts joined by newlines.

    Only the paragraphs exposed by ``Document.paragraphs`` are read.

    Args:
        filename: Path of the ``.docx`` file to open.

    Returns:
        str: One line per paragraph, separated by ``\\n``.
    """
    documento = docx.Document(filename)
    return '\n'.join(parrafo.text for parrafo in documento.paragraphs)

# Relative path to the CV; it is resolved against the kernel's working directory.
filename = "../documents/Carlos Adrián Alarcón Delgado alarcon.docx"
cv_text = word_to_text(filename)
# Single-row frame holding the whole CV text in a "cv" column, the shape
# passed to preprocess_text below.
cv_df = pd.DataFrame([cv_text], columns=["cv"])

# NOTE: `vect` is rebound here, replacing whatever an earlier preprocess_text
# call in this kernel assigned to it.
cv_vect, vect, df_processed = preprocess_text(cv_df, "cv")
cv_vect

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 108 stored elements and shape (1, 4881)>

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the CV vector and every job vector.
similarity = cosine_similarity(cv_vect, job_vect)
# Ten highest scores; the Series index is the position within the flattened
# similarity array.
a = pd.Series(similarity.flatten()).sort_values(ascending=False).head(10)
# Positional lookup — assumes the rows of `data` are in the same order as the
# rows of job_vect (TODO confirm in preprocess_text).
data.iloc[a.index,:]

Unnamed: 0,PUESTO,PUESTO_limpio,PUESTO_limpio_tokens,PUESTO_limpio_sin_stopwords,PUESTO_limpio_lematizado,PUESTO_limpio_final
7128,COORDINADOR ANALYTICS Y BIG DATA,coordinador analytics y big data,"[coordinador, analytics, y, big, data]","[coordinador, analytics, big, data]","[coordinador, analytics, big, data]",coordinador analytics big data
11302,ANALISTA BI DATA ANALYTICS COMERCIAL- SQL AVAN...,analista bi data analytics comercial sql avanzado,"[analista, bi, data, analytics, comercial, sql...","[analista, bi, data, analytics, comercial, sql...","[analista, bi, data, analytics, comercial, sql...",analista bi data analytics comercial sql avanzado
5479,ANALISTA BI DATA ANALYTICS COMERCIAL- SQL AVAN...,analista bi data analytics comercial sql avanzado,"[analista, bi, data, analytics, comercial, sql...","[analista, bi, data, analytics, comercial, sql...","[analista, bi, data, analytics, comercial, sql...",analista bi data analytics comercial sql avanzado
3185,Senior Data Engineer (Big Data),senior data engineer big data,"[senior, data, engineer, big, data]","[senior, data, engineer, big, data]","[senior, data, engineer, big, data]",senior data engineer big data
11631,Data Management,data management,"[data, management]","[data, management]","[data, management]",data management
7001,Analista de de Business Analytics,analista de de business analytics,"[analista, de, de, business, analytics]","[analista, business, analytics]","[analista, business, analytics]",analista business analytics
5395,Data Scientist,data scientist,"[data, scientist]","[data, scientist]","[data, scientist]",data scientist
11498,Data Scientist,data scientist,"[data, scientist]","[data, scientist]","[data, scientist]",data scientist
7332,Gestor de cultura de Datos- experiencia en ven...,gestor de cultura de datos experiencia en vent...,"[gestor, de, cultura, de, datos, experiencia, ...","[gestor, cultura, datos, experiencia, ventas, ...","[gestor, cultura, datos, experiencia, ventas, ...",gestor cultura datos experiencia ventas soluci...
5397,Business Analytics Arequipa,business analytics arequipa,"[business, analytics, arequipa]","[business, analytics, arequipa]","[business, analytics, arequipa]",business analytics arequipa
