In [None]:
!pip install -U spacy

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd
import json
# Notebook display limits: allow up to 400 rows and 400 characters per column
# before pandas truncates the rendered output.
_DISPLAY_LIMITS = {
    "display.max_rows": 400,
    "display.max_colwidth": 400,
}
for _option_name, _option_value in _DISPLAY_LIMITS.items():
    pd.set_option(_option_name, _option_value)

In [None]:
# Download the spaCy pipeline package `es_core_news_md`, which this notebook
# later loads with spacy.load().
# NOTE(review): `!python` resolves through the shell PATH, so it may not be the
# interpreter the kernel runs on — confirm the model lands in the right environment.
!python -m spacy download es_core_news_md

In [None]:
# Name of the spaCy pipeline package used for tokenisation and POS tagging.
SPACY_MODEL = "es_core_news_md"

# Shared pipeline object; the analysis cells call it on each text.
nlp = spacy.load(SPACY_MODEL)

In [None]:
# Parse the UTF-8 encoded training_ids.json into `data1`.
with open("training_ids.json", encoding="utf-8") as training_file:
    data1 = json.loads(training_file.read())

In [None]:
# Parse the UTF-8 encoded dev.json into `data`. The analysis cells iterate over
# data["data"] and read the "context" and "question" fields of every entry.
with open("dev.json", encoding="utf-8") as dev_file:
    data = json.loads(dev_file.read())

In [None]:
# CONTEXTS: word and part-of-speech statistics for every item["context"].

# Output column label -> value of token.pos_ that is counted under that label.
etiquetas_pos = {
    "sustantivos (NOUN)": "NOUN",
    "adjetivos (ADJ)": "ADJ",
    "verbos (VERB)": "VERB",
    "adverbios (ADV)": "ADV",
}

resultados = []
for item in data["data"]:
    doc = nlp(item["context"])
    # One pass over the document: tally POS tags of every token that is
    # neither punctuation nor whitespace.
    pos_counts = Counter(t.pos_ for t in doc if not (t.is_punct or t.is_space))
    # The tally covers exactly the kept tokens, so its total is the word count.
    fila = {"total_palabras": sum(pos_counts.values())}
    for etiqueta, pos in etiquetas_pos.items():
        fila[etiqueta] = pos_counts[pos]  # Counter yields 0 for absent tags
    resultados.append(fila)

# Persist the per-context rows as pretty-printed, non-ASCII-escaped JSON.
with open("tokens_context_dev.json", "w", encoding="utf-8") as salida:
    salida.write(json.dumps(resultados, indent=4, ensure_ascii=False))

df = pd.DataFrame(resultados)

# Column means, in the same order as the columns of `df`.
(
    media_palabras,
    media_sustantivos,
    media_adjetivos,
    media_verbos,
    media_adverbios,
) = (df[columna].mean() for columna in ("total_palabras", *etiquetas_pos))

print("\n--- Medias ---")
# The word-count mean is printed unrounded; the POS means use two decimals.
print(f"Media de total de palabras: {media_palabras}")
medias_pos = (media_sustantivos, media_adjetivos, media_verbos, media_adverbios)
for etiqueta, media in zip(etiquetas_pos, medias_pos):
    print(f"Media de {etiqueta}: {media:.2f}")

# Row labels of the longest and shortest contexts. `df` has the default
# RangeIndex, so the labels also index data["data"] directly.
palabras = df["total_palabras"]
max_idx, min_idx = palabras.idxmax(), palabras.idxmin()

for extremo, idx in (("más", max_idx), ("menos", min_idx)):
    print(f"\n--- Contexto con {extremo} palabras ---")
    print(f"Índice: {idx}, Total de palabras: {df.loc[idx, 'total_palabras']}")
    print("Contexto:", data["data"][idx]["context"])

In [None]:
# QUESTIONS: word and part-of-speech statistics for every item["question"].
resultadosQuest = []
for item in data["data"]:
  pregunta = item["question"]
  doc = nlp(pregunta)
  # Keep only tokens that are neither punctuation nor whitespace.
  tokens = [t for t in doc if not t.is_punct and not t.is_space]
  total_tokens = len(tokens)
  pos_counts = Counter([t.pos_ for t in tokens])

  resultadosQuest.append({
      "total_palabras": total_tokens,
      "sustantivos (NOUN)": pos_counts.get("NOUN", 0),
      "adjetivos (ADJ)": pos_counts.get("ADJ", 0),
      "verbos (VERB)": pos_counts.get("VERB", 0),
      "adverbios (ADV)": pos_counts.get("ADV", 0)
  })

# NOTE(review): `data` is loaded from dev.json in this notebook, yet the file
# name below says "training". The path is left unchanged because other code may
# read it — confirm which split this file is meant to hold.
with open("tokens_question_training.json", "w", encoding="utf-8") as f:
    json.dump(resultadosQuest, f, indent=4, ensure_ascii=False)

# One row per question, in the same order as data["data"].
df = pd.DataFrame(resultadosQuest)

# Column means.
media_palabras = df["total_palabras"].mean()
media_sustantivos = df["sustantivos (NOUN)"].mean()
media_adjetivos = df["adjetivos (ADJ)"].mean()
media_verbos = df["verbos (VERB)"].mean()
media_adverbios = df["adverbios (ADV)"].mean()


print("\n--- Medias ---")
print(f"Media de total de palabras: {media_palabras}")
print(f"Media de sustantivos (NOUN): {media_sustantivos:.2f}")
print(f"Media de adjetivos (ADJ): {media_adjetivos:.2f}")
print(f"Media de verbos (VERB): {media_verbos:.2f}")
print(f"Media de adverbios (ADV): {media_adverbios:.2f}")

# Row labels of the longest and shortest questions. `df` has the default
# RangeIndex, so the labels also index data["data"] directly.
max_idx = df["total_palabras"].idxmax()
min_idx = df["total_palabras"].idxmin()

# Fix: the indices are computed over question lengths, so report the matching
# question text (the previous version labelled and printed the context).
print("\n--- Pregunta con más palabras ---")
print(f"Índice: {max_idx}, Total de palabras: {df.loc[max_idx, 'total_palabras']}")
print("Pregunta:", data["data"][max_idx]["question"])

print("\n--- Pregunta con menos palabras ---")
print(f"Índice: {min_idx}, Total de palabras: {df.loc[min_idx, 'total_palabras']}")
print("Pregunta:", data["data"][min_idx]["question"])