In [13]:
import re
import pandas as pd
import ir_datasets
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import string
import pickle
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
dataset1 = ir_datasets.load('antique/train')
dataset2 = ir_datasets.load('lotte/lifestyle/dev/search')

In [15]:
df1 = pd.DataFrame(dataset1.docs_iter(), columns=['id', 'doc'])
df1.to_csv('antique.csv')

In [16]:
df2 = pd.DataFrame(dataset2.docs_iter(), columns=['id', 'doc'])
df2.to_csv('lotte.csv')

In [17]:
df1 = pd.read_csv('antique.csv')
df2 = pd.read_csv('lotte.csv')

In [18]:
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')

In [19]:
shortcut = {
    'p.p.s':'post postscript',
    'u.s.a': 'united states of america',
    'a.k.a': 'also known as',
    'm.a.d': 'Mutually Assured Destruction',
    'a.b.b': 'Asea Brown Boveri',
    's.c.o': 'Santa Cruz Operation',
    'e.t.c': 'etcetera',
    'm.i.t': 'Massachusetts Institute of Technology',
    'v.i.p': 'very important person',
    'us':'united states of america',
    'u.s.':'united states of america',
    'usa':'united states of america',
    'cobol':'common business oriented language',
    'rpm':'red hat package manager',
    'ap':'associated press',
    'gpa':'grade point average',
    'npr':'national public radio',
    'fema':'federal emergency',
    'crt':'cathode ray tube',
    'gm':'grandmaster',
    'fps':'frames per second',
    'pc':'personal computer',
    'pms':'premenstrual syndrome',
    'cia':'central intelligence agency',
    'aids':'acquired immune deficiency syndrome',
    'it\'s':'it is',
    'you\'ve':'you have',
    'what\'s':'what is',
    'that\'s':'that is',
    'who\'s':'who is',
    'don\'t':'do not',
    'haven\'t':'have not',
    'there\'s':'there is',
    'i\'d':'i would',
    'it\'ll':'it will',
    'i\'m':'i am',
    'here\'s':'here is',
    'you\'ll':'you will',
    'cant\'t':'can not',
    'didn\'t':'did not',
    'hadn\'t':'had not',
    'kv':'kilovolt',
    'cc':'cubic centimeter',
    'aoa':'american osteopathic association',
    'rbi':'reserve bank',
    'pls':'please',
    'dvd':'digital versatile disc',
    'bdu':'boise state university',
    'dvd':'digital versatile disc',
    'mac':'macintosh',
    'tv':'television',
    'cs':'computer science',
    'cse':'computer science engineering',
    'iit':'indian institutes of technology',
    'uk':'united kingdom',
    'eee':'electrical and electronics engineering',
    'ca':'california',
    'etc':'etcetera',
    'ip':'internet protocol',
    'bjp':'bharatiya janata party',
    'gdp':' gross domestic product',
    'un':'unitednations',
    'ctc':'cost to company',
    'atm':'automated teller machine',
    'pvt':'private',
    'iim':'indian institutes of management'
    
    }

In [20]:
def expand_contractions(text, shortcut):
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in shortcut.keys())), flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = shortcut.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text

In [21]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [22]:
def preprocess(text):

  text = expand_contractions(text, shortcut)

  filtered_tokens = []
  for token in word_tokenize(text):
    token = re.sub(r'\b[0-9]+\b', '', token)
    token = token.translate(str.maketrans('', '', string.punctuation))
    token = token.lower()
    if len(token) > 0 and token not in stopwords:
      filtered_tokens.append(token)

  # lemmatization
  tagged_tokens = pos_tag(filtered_tokens)

  # Lemmatize based on POS tags
  lemmatized_words = [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged_tokens]
  processed_text = ' '.join(lemmatized_words)
  
  return processed_text


In [None]:
df1['doc'] = df1['doc'].apply(preprocess)

for col in df1.columns:
    if df1[col].dtype == 'float':
        df1[col] = df1[col].astype(str)

df1['doc'] = df1['doc'].fillna('')

df1['doc'] = df1['doc'].astype(str)

df1.to_csv('proccess_text.txt', index=False)

In [24]:
def save_object(obj, name):
  with open(f'{name}.pkl', 'wb') as file:
    pickle.dump(obj, file)

def load_object(name):
  with open(f'{name}.pkl', 'rb') as file:
    obj = pickle.load(file)
  return obj

In [25]:
def create_tfidf_index(df):
  vectorizer = TfidfVectorizer(preprocessor=preprocess)
  df = df.dropna(subset=['doc'])
  documents =  df['doc']
  tfidf_matrix = vectorizer.fit_transform(documents)
  return tfidf_matrix, vectorizer

In [None]:
tfidf_matrix_1, vectorizer_1 = create_tfidf_index(df1)
save_object(tfidf_matrix_1, 'tfidf_matrix_1')
save_object(vectorizer_1, 'vectorizer_1')

In [None]:
tfidf_matrix_2, vectorizer_2 = create_tfidf_index(df2)
save_object(tfidf_matrix_2, 'tfidf_matrix_2')
save_object(vectorizer_2, 'vectorizer_2')

In [28]:
tfidf_matrix_1 = load_object('tfidf_matrix_1')
vectorizer_1 = load_object('vectorizer_1')
tfidf_matrix_2 = load_object('tfidf_matrix_2')
vectorizer_2 = load_object('vectorizer_2')

In [29]:
query = 'I think Yuval is pretty spot on'

In [30]:
def search(query, dataset, tfidf_matrix, vectorizer, top_n=10):
  normalized_query = preprocess(query)
  query_vec = vectorizer.transform([normalized_query])
  cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
  most_similar_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]

  results = [0] * top_n
  docs_list = list(most_similar_docs_indices)
  for i, doc in enumerate(dataset.docs_iter()):
    if i in docs_list:
      results[docs_list.index(i)] = doc.doc_id

  return results

In [31]:
search(query, dataset1, tfidf_matrix_1, vectorizer_1)

['2020338_3',
 '4220683_6',
 '3500408_6',
 '3908585_1',
 '1471424_3',
 '3821699_3',
 '623875_1',
 '273713_7',
 '2075216_1',
 '3211948_1']

# Evaluation

In [32]:
def precission_at_10(relevant, retrieved):
    num_relevant_retrieved = len(set(relevant).intersection(retrieved))
    precision_at_10 = num_relevant_retrieved / 10
    return precision_at_10

In [33]:
def recall_values(relevant, retrieved):
    num_relevant_retrieved = len(relevant.intersection(retrieved))
    num_relevant_total = len(relevant)
    recall = num_relevant_retrieved / num_relevant_total
    return recall

In [34]:
def mean_avg_precision(relevant, retrieved):
     precision_sum = 0.0
     num_relevant = len(relevant)
     num_correct = 0
     for i, doc in enumerate(retrieved):
         if doc in relevant:
             num_correct += 1
             precision = num_correct / (i + 1)
             precision_sum += precision

     map  = precision_sum / num_relevant
     return map

In [36]:
def mean_reciprocal_rank(relevant, retrieved):
     rr = 0
     for i, doc in enumerate(retrieved):
         if doc in relevant:
             rr = 1/(i+1)
             break
     return rr

In [6]:
def getRelevance1(query_id, qrels_new):
    relevance1 = set()
    for doc in qrels_new.get(query_id):
        if(doc['relevance']==1):
            relevance1.add(doc['doc_id'])
    return relevance1

In [7]:
def getRetrievedDocs(retrieved):
    retrievedDocs = set()
    for doc in retrieved:
        retrievedDocs.add(doc['index'])
    return retrievedDocs

In [8]:
def calc_evaluation(qrels_new):
    AP = []
    MRR = []
    
    for query in __doc__:
        
        relevance1 = getRelevance1(query.query_id, qrels_new)
        retrieved = getRetrievedDocs(retrieved[query.query_id])
        #recall
        r = recall_values(relevance1, retrieved)
        #precission @ 10
        p = precission_at_10(relevance1, retrieved)
        with open('evaluation.txt', 'a') as f:
            f.write(f"{query.query_id}: precision@k:{p:.3f} recall:{r:.3f}\n")
        
        map = mean_avg_precision(relevance1, retrieved)
        AP.append(map)
        
        mrr = mean_reciprocal_rank(relevance1, retrieved)
        MRR.append(mrr)
    #MRR
    mean_MRR = sum(MRR) / len(MRR)
    #MAP
    MAP = sum(AP) / len(AP)
    with open('evaluation.txt', 'a') as f:
        f.write(f"{query.query_id}: MRR:{mean_MRR:.3f} MAP:{MAP:.3f}\n")