# Trabajo de Inteligencia artificial
 ## Análisis de noticias

 Realizado por:
 - Marta Aguilar Morcillo
 - Candela Jazmín Gutiérrez González

Fecha: 30/05/2025

Convocatoria de junio.

 ## 1. Lectura de datos

 Se comenzará con la lectura del corpus. Para ello, será necesaria la importación de las siguientes librerías:
 - **nltk:** 
 - **punkt_tab:** para la tokenización de las palabras de los documentos.
 - **contractions:**
 - **sklearn:**

In [104]:
!pip install nltk
import nltk

from nltk import download

download('punkt_tab')                           # Tokenización
nltk.download('averaged_perceptron_tagger')     # POS tagging
nltk.download('averaged_perceptron_tagger_eng') # POS tagging
nltk.download('wordnet')                        # WordNet lemmatizer
nltk.download('omw-1.4')                        # WordNet multilingüe



[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [105]:
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from nltk.data import path
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
import numpy as np

path.append(".")

In [106]:
!pip install contractions
import contractions



In [107]:
import csv
import pandas as pd
from bs4 import BeautifulSoup
from pprint import pprint
import re
from bs4 import MarkupResemblesLocatorWarning
import warnings

In [108]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
import spacy

In [109]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
!pip install whoosh
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh.query import Term, Or
import os
import shutil



In [110]:
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [111]:
palabras_vacias_ingles = stopwords.words('english')

In [112]:
nlp = spacy.load("en_core_web_sm")

In [113]:
def elimina_html(contenido):
    return BeautifulSoup(contenido).get_text()

def elimina_no_alfanumerico(contenido):
    return [re.sub(r'[^\w]', '', palabra)
            for palabra in contenido
            if re.search(r'\w', palabra)]

def expandir_constracciones(contenido):
    return contractions.fix(contenido)

def pasar_a_minuscula(contenido):
    return contenido.lower()

def limpiar_texto(texto):
    texto = re.sub(r'[^a-zA-Z\s]', ' ', texto)  # Reemplaza todo lo que no es letra o espacio con espacio
    texto = re.sub(r'\s+', ' ', texto).strip()
    return texto

def elimina_palabras_vacias(contenido):
    return [palabra for palabra in contenido if palabra not in palabras_vacias_ingles]

def lematizador(contenido):
    lemmatizer = WordNetLemmatizer()
    pos_tags = pos_tag(contenido)

    resultado = []
    for palabra, tag in pos_tags:
        if tag.startswith('VB'):  # Verbos
            resultado.append(lemmatizer.lemmatize(palabra, pos='v'))  # infinitivo
        else:  # Sustantivos y el resto tal como están
            resultado.append(palabra)

    return resultado

def extraer_noun_chunks(tokens):
    resultados = []
    doc = nlp(" ".join(tokens))
    
    noun_chunks = [chunk.text.lower().strip() for chunk in doc.noun_chunks if len(chunk.text.split()) <= 3]
    noun_chunks_set = set(noun_chunks)

    i = 0
    while i < len(tokens):
        composed2 = " ".join(tokens[i:i+2]).lower()
        composed3 = " ".join(tokens[i:i+3]).lower()

        if composed3 in noun_chunks_set:
            i += 3  
        elif composed2 in noun_chunks_set:
            i += 2 
        else:
            resultados.append(tokens[i].lower())  
            i += 1

    return resultados + noun_chunks


In [114]:
def proceso_contenido(texto):
    texto = elimina_html(texto)
    texto = expandir_constracciones(texto)
    texto = pasar_a_minuscula(texto)
    texto = limpiar_texto(texto)                # Limpiar antes de tokenizar
    tokens = word_tokenize(texto)               
    tokens = elimina_no_alfanumerico(tokens)    # Limpiar tokens individuales
    tokens = elimina_palabras_vacias(tokens)
    tokens = lematizador(tokens)
    return tokens

In [115]:
def lectura_normalizada_corpus():
    df = pd.read_csv("news_corpus.csv", encoding="latin-1", sep=";", quotechar='"')
    resultados = []

    for index, fila in df.iterrows():
        autor = [fila.iloc[0]]
        titulo = fila.iloc[1]
        cuerpo = fila.iloc[2]   

        titulo_proc = proceso_contenido(titulo)
        cuerpo_proc = proceso_contenido(cuerpo)

        # Unir las tres listas en una sola lista combinada
        fila_combinada =  autor + titulo_proc + cuerpo_proc

        contenido_final = extraer_noun_chunks(fila_combinada)
        resultados.append(contenido_final)
    return resultados

def expand_term(term):
    related = set()
    for syn in wn.synsets(term):
        for lemma in syn.lemmas():
            word = lemma.name().replace('_', ' ').lower()
            if word != term:
                related.add(word)
    return related

def expand_corpus_with_synonyms(documento):
    doc_counter = Counter(documento)
    expanded_doc = []
    for word, count in doc_counter.items():
        # Añadimos la palabra original tantas veces como aparece
        expanded_doc.extend([word] * count)
        # Obtenemos sinónimos y también los añadimos con la misma frecuencia
        synonyms = expand_term(word)
        for syn in synonyms:
            expanded_doc.extend([syn] * count)
    return expanded_doc

def lectura_normalizada_corpus_sinonimos():
    df = pd.read_csv("news_corpus.csv", encoding="latin-1", sep=";", quotechar='"')
    resultados = []

    for index, fila in df.iterrows():
        autor = [fila.iloc[0]]
        titulo = fila.iloc[1]
        cuerpo = fila.iloc[2]   

        titulo_proc = proceso_contenido(titulo)
        cuerpo_proc = proceso_contenido(cuerpo)

        # Unir las tres listas en una sola lista combinada
        fila_combinada =  autor + titulo_proc + cuerpo_proc

        contenido_con_palabras_compuestas = extraer_noun_chunks(fila_combinada)
        contenido_final = expand_corpus_with_synonyms(contenido_con_palabras_compuestas)

        resultados.append(contenido_final)
    return resultados

def crear_indice_whoosh(corpus_normalizado):
    if os.path.exists("indice_whoosh"):
        shutil.rmtree("indice_whoosh")  # Elimina índice anterior si existe
    os.mkdir("indice_whoosh")

    schema = Schema(id=ID(stored=True, unique=True), contenido=TEXT(stored=True))
    ix = create_in("indice_whoosh", schema)
    writer = ix.writer()

    for i, doc in enumerate(corpus_normalizado):
        contenido = " ".join(doc)
        writer.add_document(id=str(i), contenido=contenido)
    
    writer.commit()
    return ix


def buscar_con_whoosh(tokens_query, ix, corpus_normalizado):
    with ix.searcher() as searcher:
        # Crear lista de términos para hacer OR
        terms = [Term("contenido", token) for token in tokens_query]
        query = Or(terms)
        
        resultados = searcher.search(query, limit=None)
        
        docs_encontrados = []
        for hit in resultados:
            doc_id = int(hit['id'])
            docs_encontrados.append(corpus_normalizado[doc_id])
        
        return docs_encontrados



In [128]:
def lectura_documento(documento):
    documento_procesado = proceso_contenido(documento) 
    contenido_final = extraer_noun_chunks(documento_procesado)
    return contenido_final

def lectura_documento_sinonimos(documento):
    documento_procesado = proceso_contenido(documento) 
    contenido_final = extraer_noun_chunks(documento_procesado)
    resultado = expand_corpus_with_synonyms(contenido_final)
    return resultado

In [117]:
# Mostrar los primeros 3 documentos procesados
def prueba_primeros_3_documentos_procesados(corpus):
    for i, documento in enumerate(corpus[:3]):
        print(f"Documento {i+1}:")
        print(" - Palabras:", documento)
        print()

In [118]:
def tfidf_del_documento(documento_normalizado, vectorizer):
    # Convertir el documento (lista de tokens) a string
    texto = " ".join(documento_normalizado)
    # Transformar usando el vectorizador ya entrenado
    X_doc = vectorizer.transform([texto])  # devuelve matriz sparse 1xN
    return X_doc

In [119]:
def prueba_tfdifs_primeros_3_documentos(lista_diccionarios_tfidfs):
    for idx, d in enumerate(lista_diccionarios_tfidfs[:3]):
        top_terms = sorted(d.items(), key=lambda x: x[1], reverse=True)
        print(f"\n Documento {idx+1}:")
        print("      Palabras añadidas por TF-IDF:")
        for term, score in top_terms:
            print(f"      - {term}: {score:.4f}")

In [120]:
def tfidf_por_corpus(corpus_normalizado):
    # Convertimos el corpus a lista de strings
    texts = [" ".join(doc) for doc in corpus_normalizado]

    # Vectorizador TF-IDF (1 y 2-gramas)
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X = vectorizer.fit_transform(texts)  # Matriz TF-IDF sparse
    terms = vectorizer.get_feature_names_out()

    # Retornamos la matriz y vocabulario (términos)
    return X, terms, vectorizer

In [131]:
def similitud_coseno(tfidf_corpus, tfidf_doc, umbral=0.0):
    # Calcular similitud coseno entre documento y corpus
    similitudes = cosine_similarity(tfidf_doc, tfidf_corpus).flatten()

    # Filtrar documentos que superan el umbral
    indices_filtrados = [i for i, sim in enumerate(similitudes) if sim > umbral]

    # Ordenar índices por similitud descendente
    indices_ordenados = sorted(indices_filtrados, key=lambda i: similitudes[i], reverse=True)

    # Devolver lista de (indice, similitud)
    return [(i, similitudes[i]) for i in indices_ordenados]

def documentos_similares(lista_similitudes):
    lista_similitudes[0]
    for idx, score in lista_similitudes:
        print(f"Documento {idx} tiene similitud: {score:.4f}")
    

In [122]:
def average_precision(resultados_ordenados, relevantes):
    hits = 0
    sum_precisions = 0
    for i, (idx, _) in enumerate(resultados_ordenados):
        if idx in relevantes:
            hits += 1
            sum_precisions += hits / (i + 1)
    return sum_precisions / len(relevantes) if relevantes else 0

def mean_average_precision(consultas, relevantes_por_consulta, vectorizer, tfidf_corpus):
    all_ap = []
    for i, consulta in enumerate(consultas):
        tfidf_query = tfidf_del_documento(consulta, vectorizer)
        resultados = similitud_coseno(tfidf_corpus, tfidf_query)
        ap = average_precision(resultados, relevantes_por_consulta[i])
        all_ap.append(ap)
        print(f"Consulta {i+1} - AP: {ap:.4f}")
    return sum(all_ap) / len(all_ap)


In [123]:
# Prueba de lectura 
resultados = lectura_normalizada_corpus()
prueba_primeros_3_documentos_procesados(resultados)

Documento 1:
 - Palabras: ['diu', 'revoke', 'mandatory', 'rakshabandhan', 'offices', 'order', 'daman', 'wednesday', 'withdraw', 'circular', 'ask', 'tie', 'rakhis', 'male', 'colleagues', 'order', 'trigger', 'backlash', 'employees', 'rip', 'apart', 'social', 'media', 'union', 'territory', 'administration', 'force', 'retreat', 'within', 'circular', 'make', 'celebrate', 'decide', 'celebrate', 'festival', 'rakshabandhan', 'august', 'connection', 'offices', 'departments', 'shall', 'remain', 'collectively', 'suitable', 'time', 'wherein', 'shall', 'tie', 'rakhis', 'colleagues', 'order', 'issue', 'august', 'gurpreet', 'singh', 'deputy', 'secretary', 'personnel', 'say', 'ensure', 'one', 'skipped', 'office', 'attendance', 'report', 'send', 'government', 'next', 'one', 'mandate', 'celebration', 'rakshabandhan', 'leave', 'withdraw', 'mandate', 'daman', 'day', 'apart', 'circular', 'withdrawn', 'one', 'line', 'order', 'issue', 'late', 'evening', 'ut', 'department', 'personnel', 'administrative', 'ref

In [124]:
documento = "Union minister for transport and shipping Nitin Gadkari has demanded that the Maharashtra government provide special facilities to those detained during the Emergency under the Maintenance of Internal Security Act (MISA). The MISA detainees on?Sunday held a convention under the banner of Satyagrahi Sangh in Nagpur after meeting Gadkari at his Mahal residence. The group, led by the Sangh?s vice-president Sacchidanand Upasane, national secretary Komal Chheda and Maharashtra unit chief Jayprakash Pande insisted that the detainees be treated as freedom fighters and be given all facilities available to a freedom fighter in the country. Incidentally, most of the MISA detainees are RSS activists or its supporters.The group submitted a memorandum to Gadkari, briefing him about the facilities and recognition extended to detainees in states like UP, MP, Bihar and Chhattisgarh. They claimed that in these states, the detainees are considered at par with the freedom fighters and given pensions and other facilities. The government in Rajasthan recently formed a committee to study facilities available to MISA and Defence of India Rules 1971 detainees in other states. The committee will its report to the state soon, they claimed. The group also pointed out that in Madhya Pradesh, MISA detainees, dubbed as ?Democracy Warrior? draw a monthly honorarium of Rs 25,000. Gadkari told the group he had already spoken to chief minister Devendra Fadnavis and the state finance minister, Sudhir Munganttiwar, in this regard. ?Both were positive on the issue.? The Congress, meanwhile, has accused the BJP-led government of trying to give such facilities to its Sangh Parivar members. ?The move is totally a political one,? says former Union minister Vilas Muttemwar. Muttemwar warned his party would launch a statewide agitation if government accepted the group?s demand.?How can these Sangh Parivar members be compared with freedom fighters??"
lectura = lectura_documento(documento)
print(" - Palabras:", lectura)

 - Palabras: ['union', 'minister', 'transport', 'ship', 'nitin', 'gadkari', 'demand', 'maharashtra', 'government', 'provide', 'special', 'facilities', 'detain', 'emergency', 'maintenance', 'internal', 'security', 'act', 'misa', 'misa', 'detainees', 'sunday', 'hold', 'sangh', 'nagpur', 'meeting', 'gadkari', 'mahal', 'residence', 'group', 'lead', 'sangh', 'sacchidanand', 'upasane', 'national', 'secretary', 'komal', 'chief', 'jayprakash', 'pande', 'insist', 'detainees', 'treat', 'give', 'facilities', 'available', 'freedom', 'fighter', 'country', 'incidentally', 'misa', 'detainees', 'rss', 'activists', 'supporters', 'group', 'submit', 'memorandum', 'gadkari', 'brief', 'facilities', 'recognition', 'extend', 'like', 'mp', 'states', 'detainees', 'consider', 'par', 'give', 'pensions', 'facilities', 'government', 'rajasthan', 'recently', 'form', 'available', 'misa', 'defence', 'india', 'rules', 'committee', 'report', 'state', 'soon', 'claim', 'group', 'also', 'point', 'madhya', 'pradesh', 'misa

In [125]:
# Prueba de similitud sin sinonimos
ix = crear_indice_whoosh(corpus_normalizado)
documento = "Union minister for transport and shipping Nitin Gadkari has demanded that the Maharashtra government provide special facilities to those detained during the Emergency under the Maintenance of Internal Security Act (MISA). The MISA detainees on?Sunday held a convention under the banner of Satyagrahi Sangh in Nagpur after meeting Gadkari at his Mahal residence. The group, led by the Sangh?s vice-president Sacchidanand Upasane, national secretary Komal Chheda and Maharashtra unit chief Jayprakash Pande insisted that the detainees be treated as freedom fighters and be given all facilities available to a freedom fighter in the country. Incidentally, most of the MISA detainees are RSS activists or its supporters.The group submitted a memorandum to Gadkari, briefing him about the facilities and recognition extended to detainees in states like UP, MP, Bihar and Chhattisgarh. They claimed that in these states, the detainees are considered at par with the freedom fighters and given pensions and other facilities. The government in Rajasthan recently formed a committee to study facilities available to MISA and Defence of India Rules 1971 detainees in other states. The committee will its report to the state soon, they claimed. The group also pointed out that in Madhya Pradesh, MISA detainees, dubbed as ?Democracy Warrior? draw a monthly honorarium of Rs 25,000. Gadkari told the group he had already spoken to chief minister Devendra Fadnavis and the state finance minister, Sudhir Munganttiwar, in this regard. ?Both were positive on the issue.? The Congress, meanwhile, has accused the BJP-led government of trying to give such facilities to its Sangh Parivar members. ?The move is totally a political one,? says former Union minister Vilas Muttemwar. Muttemwar warned his party would launch a statewide agitation if government accepted the group?s demand.?How can these Sangh Parivar members be compared with freedom fighters??"
corpus_normalizado = lectura_normalizada_corpus()
lectura = lectura_documento(documento)
tfidf_corpus, terms, vectorizer = tfidf_por_corpus(corpus_normalizado)
tfidf_documento = tfidf_del_documento(lectura, vectorizer)
resultados_similitud = similitud_coseno(tfidf_corpus, tfidf_documento)
documentos_similares(resultados_similitud)

Documento 147 tiene similitud: 0.9871
Documento 112 tiene similitud: 0.0555
Documento 19 tiene similitud: 0.0550
Documento 109 tiene similitud: 0.0546
Documento 123 tiene similitud: 0.0494
Documento 119 tiene similitud: 0.0489
Documento 71 tiene similitud: 0.0416
Documento 4 tiene similitud: 0.0391
Documento 136 tiene similitud: 0.0387
Documento 137 tiene similitud: 0.0382
Documento 82 tiene similitud: 0.0376
Documento 75 tiene similitud: 0.0372
Documento 142 tiene similitud: 0.0370
Documento 125 tiene similitud: 0.0368
Documento 113 tiene similitud: 0.0362
Documento 105 tiene similitud: 0.0340
Documento 103 tiene similitud: 0.0338
Documento 104 tiene similitud: 0.0319
Documento 0 tiene similitud: 0.0299
Documento 106 tiene similitud: 0.0298
Documento 42 tiene similitud: 0.0296
Documento 94 tiene similitud: 0.0291
Documento 44 tiene similitud: 0.0278
Documento 128 tiene similitud: 0.0278
Documento 39 tiene similitud: 0.0275
Documento 146 tiene similitud: 0.0273
Documento 120 tiene simi

In [138]:
corpus_normalizado = lectura_normalizada_corpus()
ix = crear_indice_whoosh(corpus_normalizado)
documento = "Days after Maharashtra Women and Child Development Minister Pankaja Munde told the state Assembly that 42 children went missin in the last three years from a remand home in Mumbai, the Bombay High Court today summoned department's secretary to appear in person.The high court today came down heavily on the Maharashtra government after an affidavit was filed by Women and Child Development Secretary Vineeta Singhal in a case of corruption in the Dongri Children's Home of Mumbai. The affidavit does not mention anything about the action that the government has taken against accused persons in the Dongri Children's Home case. The high court has directed Vineeta Singhal to appear in person next week to explain government's stand in the case.A sting operation had shown corruption in the Mumbai Children's Home. The division bench of justice R M Savant and Justice Sadhna Jadhav noted that from reading the affidavit, we gather that it has only been filed to comply with our orders. We expected some pro-active action from state government.PIL AND THE CASE A letter had been sent by a former chief justice of Kolkata High Court who had spoken about the disturbing sting after which it was converted into a petition by the Bombay High Court. The bench had sent a notice to the government, which filed its response today.As many as 42 juveniles had fled the Dongri remand home located in the city in the last three years. The matter was raised in the Maharashtra Assembly. There were also reports that juveniles were being kept in the children's home in violation of norms.In a written response to a question by Pravin Darekar (BJP), Pankaja Munde said that from 2014-15 to 2016-17, 42 children had fled the Dongri remand home. Out of those, 16 were traced and brought back and one security official was suspended.GOVERNMENT'S REPLY IN COURTThe Bombay High Court sought to know if any action had been taken by the state government against those who were shown indulging in malpractice. The affidavit only mentioned that a show cause notice was sent to one person while the guard of the children's home was suspended. When public prosecutor Arajakta Shinde told the court that an inquiry was going on, Justice Savant asked, What kind of inquiry happens we all know. Assurance is not going to help.Justice Jadhav said, We are not satisfied with the manner in which this affidavit has been filed. We expect some seriousness. Therefore, we directed the secretary to file the affidavit. The bench was furious with the government's response. The court said that it was not a matter of only suspicion as a CD of the corruption and other materials had already come out in public domain. A process of cleansing should have started by now, the bench said.The Bombay High Court directed the Women and Child Development Secretary to be appear in person before the court in the next hearing next week.ALSO READ | Mumbai teen kills self, could be first Indian case of Blue Whale suicide challenge"
lectura = lectura_documento(documento)
docs_relevantes = buscar_con_whoosh(lectura, ix, corpus_normalizado)
prueba_primeros_3_documentos_procesados(docs_relevantes)
# tfidf_corpus, terms, vectorizer = tfidf_por_corpus(docs_relevantes)
# tfidf_documento = tfidf_del_documento(lectura, vectorizer)
# resultados_similitud = similitud_coseno(tfidf_corpus, tfidf_documento)
# documentos_similares(resultados_similitud)


Documento 1:
 - Palabras: ['parmeet kaur', 'hc', 'summons', 'wcd', 'flee', 'remand', 'home', 'days', 'maharashtra', 'women', 'child', 'development', 'minister', 'pankaja', 'munde', 'tell', 'go', 'missin', 'last', 'three', 'years', 'remand', 'home', 'mumbai', 'bombay', 'high', 'court', 'today', 'appear', 'today', 'come', 'heavily', 'maharashtra', 'government', 'affidavit', 'file', 'women', 'child', 'development', 'secretary', 'vineeta', 'singhal', 'case', 'corruption', 'dongri', 'children', 'home', 'mumbai', 'affidavit', 'mention', 'take', 'dongri', 'children', 'home', 'case', 'high', 'court', 'direct', 'vineeta', 'singhal', 'appear', 'person', 'next', 'week', 'explain', 'government', 'stand', 'case', 'sting', 'operation', 'show', 'corruption', 'mumbai', 'children', 'home', 'division', 'bench', 'justice', 'r', 'savant', 'justice', 'sadhna', 'jadhav', 'note', 'read', 'affidavit', 'gather', 'file', 'comply', 'orders', 'expect', 'pro', 'active', 'action', 'state', 'government', 'pil', 'cas

In [129]:
# Prueba de tfidfs con sinonimos
ix = crear_indice_whoosh(corpus_normalizado)
documento = "Union minister for transport and shipping Nitin Gadkari has demanded that the Maharashtra government provide special facilities to those detained during the Emergency under the Maintenance of Internal Security Act (MISA). The MISA detainees on?Sunday held a convention under the banner of Satyagrahi Sangh in Nagpur after meeting Gadkari at his Mahal residence. The group, led by the Sangh?s vice-president Sacchidanand Upasane, national secretary Komal Chheda and Maharashtra unit chief Jayprakash Pande insisted that the detainees be treated as freedom fighters and be given all facilities available to a freedom fighter in the country. Incidentally, most of the MISA detainees are RSS activists or its supporters.The group submitted a memorandum to Gadkari, briefing him about the facilities and recognition extended to detainees in states like UP, MP, Bihar and Chhattisgarh. They claimed that in these states, the detainees are considered at par with the freedom fighters and given pensions and other facilities. The government in Rajasthan recently formed a committee to study facilities available to MISA and Defence of India Rules 1971 detainees in other states. The committee will its report to the state soon, they claimed. The group also pointed out that in Madhya Pradesh, MISA detainees, dubbed as ?Democracy Warrior? draw a monthly honorarium of Rs 25,000. Gadkari told the group he had already spoken to chief minister Devendra Fadnavis and the state finance minister, Sudhir Munganttiwar, in this regard. ?Both were positive on the issue.? The Congress, meanwhile, has accused the BJP-led government of trying to give such facilities to its Sangh Parivar members. ?The move is totally a political one,? says former Union minister Vilas Muttemwar. Muttemwar warned his party would launch a statewide agitation if government accepted the group?s demand.?How can these Sangh Parivar members be compared with freedom fighters??"
corpus_normalizado = lectura_normalizada_corpus()
lectura = lectura_documento_sinonimos(documento)
docs_relevantes = buscar_con_whoosh(lectura, ix, corpus_normalizado)
tfidf_corpus, terms, vectorizer = tfidf_por_corpus(docs_relevantes)
tfidf_documento = tfidf_del_documento(lectura, vectorizer)
resultados_similitud = similitud_coseno(tfidf_corpus, tfidf_documento)
documentos_similares(resultados_similitud)

Documento 0 tiene similitud: 0.4006
Documento 38 tiene similitud: 0.1051
Documento 5 tiene similitud: 0.0878
Documento 1 tiene similitud: 0.0853
Documento 3 tiene similitud: 0.0851
Documento 8 tiene similitud: 0.0794
Documento 2 tiene similitud: 0.0787
Documento 9 tiene similitud: 0.0775
Documento 45 tiene similitud: 0.0774
Documento 10 tiene similitud: 0.0773
Documento 49 tiene similitud: 0.0749
Documento 6 tiene similitud: 0.0722
Documento 43 tiene similitud: 0.0707
Documento 13 tiene similitud: 0.0693
Documento 26 tiene similitud: 0.0691
Documento 7 tiene similitud: 0.0687
Documento 24 tiene similitud: 0.0687
Documento 18 tiene similitud: 0.0684
Documento 44 tiene similitud: 0.0619
Documento 89 tiene similitud: 0.0611
Documento 19 tiene similitud: 0.0579
Documento 36 tiene similitud: 0.0579
Documento 37 tiene similitud: 0.0576
Documento 48 tiene similitud: 0.0550
Documento 59 tiene similitud: 0.0534
Documento 4 tiene similitud: 0.0531
Documento 60 tiene similitud: 0.0528
Documento 1

In [127]:
# Prueba de tfidfs con sinonimos
ix = crear_indice_whoosh(corpus_normalizado)
documento = "Union minister for transport and shipping Nitin Gadkari has demanded that the Maharashtra government provide special facilities to those detained during the Emergency under the Maintenance of Internal Security Act (MISA). The MISA detainees on?Sunday held a convention under the banner of Satyagrahi Sangh in Nagpur after meeting Gadkari at his Mahal residence. The group, led by the Sangh?s vice-president Sacchidanand Upasane, national secretary Komal Chheda and Maharashtra unit chief Jayprakash Pande insisted that the detainees be treated as freedom fighters and be given all facilities available to a freedom fighter in the country. Incidentally, most of the MISA detainees are RSS activists or its supporters.The group submitted a memorandum to Gadkari, briefing him about the facilities and recognition extended to detainees in states like UP, MP, Bihar and Chhattisgarh. They claimed that in these states, the detainees are considered at par with the freedom fighters and given pensions and other facilities. The government in Rajasthan recently formed a committee to study facilities available to MISA and Defence of India Rules 1971 detainees in other states. The committee will its report to the state soon, they claimed. The group also pointed out that in Madhya Pradesh, MISA detainees, dubbed as ?Democracy Warrior? draw a monthly honorarium of Rs 25,000. Gadkari told the group he had already spoken to chief minister Devendra Fadnavis and the state finance minister, Sudhir Munganttiwar, in this regard. ?Both were positive on the issue.? The Congress, meanwhile, has accused the BJP-led government of trying to give such facilities to its Sangh Parivar members. ?The move is totally a political one,? says former Union minister Vilas Muttemwar. Muttemwar warned his party would launch a statewide agitation if government accepted the group?s demand.?How can these Sangh Parivar members be compared with freedom fighters??"
corpus_normalizado = lectura_normalizada_corpus_sinonimos()
lectura = lectura_documento(documento)
docs_relevantes = buscar_con_whoosh(lectura, ix, corpus_normalizado)
tfidf_corpus, terms, vectorizer = tfidf_por_corpus(docs_relevantes)
tfidf_documento = tfidf_del_documento(lectura, vectorizer)
resultados_similitud = similitud_coseno(tfidf_corpus, tfidf_documento)
documentos_similares(resultados_similitud)

Documento 0 tiene similitud: 0.3828
Documento 13 tiene similitud: 0.0569
Documento 9 tiene similitud: 0.0525
Documento 10 tiene similitud: 0.0452
Documento 6 tiene similitud: 0.0447
Documento 8 tiene similitud: 0.0400
Documento 1 tiene similitud: 0.0372
Documento 12 tiene similitud: 0.0356
Documento 2 tiene similitud: 0.0344
Documento 22 tiene similitud: 0.0335
Documento 17 tiene similitud: 0.0327
Documento 15 tiene similitud: 0.0326
Documento 72 tiene similitud: 0.0315
Documento 19 tiene similitud: 0.0306
Documento 4 tiene similitud: 0.0291
Documento 18 tiene similitud: 0.0291
Documento 7 tiene similitud: 0.0286
Documento 16 tiene similitud: 0.0286
Documento 24 tiene similitud: 0.0286
Documento 35 tiene similitud: 0.0282
Documento 14 tiene similitud: 0.0275
Documento 3 tiene similitud: 0.0273
Documento 5 tiene similitud: 0.0266
Documento 26 tiene similitud: 0.0258
Documento 11 tiene similitud: 0.0247
Documento 21 tiene similitud: 0.0246
Documento 48 tiene similitud: 0.0239
Documento 6