In [1]:
import json
import os
import pickle
import random

import matplotlib.pyplot as plt
import nltk
import nltk.data
import numpy as np
import tensorflow as tf
from matplotlib.colors import ListedColormap
from nltk.corpus import stopwords
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam,SGD
from textblob import TextBlob
from tqdm import tqdm

# Import every library used in this notebook



ModuleNotFoundError: No module named 'textblob'

In [None]:
# Locations of the Cranfield documents and queries JSON files.
# Despite the `_url` suffix these are filesystem paths (they are passed to `open`).
cranfield_docs_url = "/kaggle/input/cransfield/cran_docs.json"
cranfield_queries_url = "/kaggle/input/cransfield/cran_queries.json"

In [None]:
def url_to_corpus(url,mode):
    """
    Load a JSON file holding a list of dictionaries and tokenize the text stored
    under the key `mode` in each of them.

    Each text is stripped, split into sentences with the Punkt sentence tokenizer,
    split into words with TextBlob, and filtered against NLTK's English stop-word
    list. The words of all sentences of one record are concatenated into a single
    flat token list.

    Parameters:
    url (str): path of the JSON file (opened with `open`)
    mode (str): key of the dictionary in the JSON file to preprocess and tokenize

    Returns:
    docs (list): one list of tokens per record, in file order
    types (set): set of unique tokens found in the documents
    """

    nltk.download('stopwords')
    nltk.download('punkt')
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # A set gives constant-time membership tests; the stop-word check below runs
    # once per token of the corpus.
    stop_words = set(stopwords.words('english'))

    # Explicit encoding so the result does not depend on the platform default.
    with open(url, 'r', encoding='utf-8') as f:
        data = json.load(f)

    docs = []
    types = set()

    for record in data:
        # Flatten all sentences of this record into one token list.
        tokens = []
        for sentence in sent_detector.tokenize(record[mode].strip()):
            tokens.extend(
                word for word in TextBlob(sentence).words if word not in stop_words
            )
        types.update(tokens)
        docs.append(tokens)

    return docs,types


In [None]:
docs,types_docs = url_to_corpus(cranfield_docs_url,'body')
queries,types_queries = url_to_corpus(cranfield_queries_url,'query')

# Vocabulary: every distinct token that occurs in the documents or in the queries.
# The union of the two sets removes duplicates. The result is sorted rather than
# passed to list(), because set iteration order for strings can differ between
# interpreter sessions (hash randomisation), and the token ids derived from this
# list must stay the same across runs for saved sequences to remain valid.
types = sorted(types_docs.union(types_queries))

In [None]:
# Length of the longest token sequence across documents and queries (0 if both
# are empty). Shorter sequences are padded up to this length.
# The extra 2 leaves room for the start and end markers added in the next step.
max_seq_length = 2 + max(
    (len(tokens) for corpus in (docs, queries) for tokens in corpus),
    default=0,
)
print(max_seq_length)

def types_to_idx(types):
    """
    Assign a consecutive integer id to every distinct token.

    Parameters:
    types (iterable of str): tokens to index; ids follow iteration order

    Returns:
    seq_idx (dict): token -> id mapping, ids running from 0 to (number of
                    distinct tokens - 1)
    """
    seq_idx = {}

    for t in types:
        # setdefault keeps the id given at a token's first occurrence. Assigning
        # len(seq_idx) again on a repeat would hand the token an id that the next
        # new token also receives.
        seq_idx.setdefault(t, len(seq_idx))

    return seq_idx

# Token -> integer id lookup for the whole vocabulary.
seq_idx = types_to_idx(types)

# Register the four special markers; each takes the next free id, in this order.
# (The original run reported them as 9235, 9236, 9237 and 9238.)
for special_token in ('/start', '/end', '/unknown', '/pad'):
    seq_idx[special_token] = len(seq_idx)

print(seq_idx)

def doc_to_seq(docs, seq_idx, max_seq_length, mode):
    """
    Convert tokenized documents into lists of integer token ids.

    In 'pad' mode every document is framed and padded as
        ['/start', token_1, ..., token_n, '/pad', ..., '/pad', '/end']
    so that its length reaches `max_seq_length`; the padding sits between the
    last token and the end marker. Documents that are already longer than
    `max_seq_length - 2` tokens are framed but neither padded nor truncated.
    Any other `mode` maps the tokens as they are.

    Parameters:
    docs (iterable of list of str): tokenized documents; the input is not modified
    seq_idx (dict): token -> id mapping; 'pad' mode looks up '/start', '/end'
                    and '/pad' in it
    max_seq_length (int): target length in 'pad' mode, markers included
    mode (str): 'pad' to frame and pad, anything else to map tokens only

    Returns:
    seqs (list): one list of integer ids per document

    Raises:
    KeyError: if a token is missing from `seq_idx` and `seq_idx` has no
              '/unknown' entry to fall back on
    """
    has_unknown = '/unknown' in seq_idx

    def to_id(word):
        # Out-of-vocabulary tokens map to the '/unknown' id when one is defined;
        # otherwise the plain lookup raises KeyError.
        if word in seq_idx or not has_unknown:
            return seq_idx[word]
        return seq_idx['/unknown']

    seqs = []

    for doc in docs:
        # Work on a copy so the caller's token list stays untouched.
        seq = list(doc)

        if mode == 'pad':
            # Build the padding in one step instead of inserting one marker at a time.
            n_pad = max(0, max_seq_length - len(seq) - 2)
            seq = ['/start'] + seq + ['/pad'] * n_pad + ['/end']

        seqs.append([to_id(word) for word in seq])

    return seqs

# Frame and pad every document/query, then stack the id lists into numpy arrays.
doc_seq = np.array(doc_to_seq(docs, seq_idx, max_seq_length, 'pad'))
query_seq = np.array(doc_to_seq(queries, seq_idx, max_seq_length, 'pad'))

print(doc_seq.shape)
print(query_seq.shape)

# np.savetxt does not create missing directories, so make sure the output
# folder exists before writing (otherwise this fails on a fresh kernel).
os.makedirs('cranfield_sequences', exist_ok=True)
np.savetxt('cranfield_sequences/doc_seq.csv', doc_seq, fmt='%s')
np.savetxt('cranfield_sequences/q_seq.csv', query_seq, fmt='%s')

In [None]:
import tensorflow as tf
import numpy as np
import json
from tensorflow.keras.optimizers import Adam,SGD

# Relevance judgements; each record is read below for its 'position', 'id'
# and 'query_num' fields.
with open("/kaggle/input/cransfield/cran_qrels.json", "r") as f:
    data = json.load(f)

# Integer token-id sequences (these are not embeddings); rows are looked up
# below by document id / query number.
doc_seq = np.genfromtxt('/kaggle/input/cransfield/doc_seq.csv', dtype=int)
query_seq = np.genfromtxt('/kaggle/input/cransfield/q_seq.csv', dtype=int)

# One-hot targets with 4 classes, built from each record's 'position'.
# NOTE(review): tf.one_hot encodes indices outside [0, 4) as all-zero rows. If
# 'position' is 1-based (1..4), label 4 becomes a zero vector and class 0 is
# never used; confirm against the qrels file.
y = tf.one_hot(np.array([rel['position'] for rel in data]),depth=4)

# 'id' and 'query_num' are converted to int and shifted by one to get
# zero-based row indices (assumes both are 1-based; TODO confirm).
x_doc_seq = np.array([doc_seq[int(rel['id'])-1] for rel in data])
x_query_seq = np.array([query_seq[int(rel['query_num'])-1] for rel in data])
