# importing libaries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from pathlib import Path
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import math

In [2]:
#mounting google drive in order to import data
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **loading dataset** 

In [3]:
# param limiter => used to control number of training docs being loaded because of memory overload
limiter = 1000
def load_dataset():
  results = defaultdict(list)
  path = "/content/gdrive/MyDrive/data/HW04/documents/"
  i = 0
  for file in Path(path).iterdir():
    if i < limiter:       
      with open(file, "r") as file_open:
        results["file_name"].append(file.name)
        results["text"].append(file_open.read())
      i = i+1
    
  return pd.DataFrame(results)


In [4]:

data = load_dataset()

In [5]:
#removing .txt extension
data['file_name'] = data['file_name'].apply(lambda name: name.split(".")[0])

In [6]:
data.shape

(1000, 2)

In [7]:

data.head()

Unnamed: 0,file_name,text
0,file_10215,noise measure of lossy tunnel diode amplifier ...
1,file_10861,the use of probing electrodes in the study of ...
2,file_10982,travelling pressure waves associated with geom...
3,file_10952,the australian national radio astronomy observ...
4,file_10269,cosmic ray phenomena during the november solar...


In [8]:

path1 = "/content/gdrive/MyDrive/data/HW04/file_label.txt"
labels = pd.read_csv(path1, names=["file_name", "label"], sep=",", header=None)
labels.head()

Unnamed: 0,file_name,label
0,file_0,0
1,file_1,0
2,file_2,0
3,file_3,0
4,file_4,0


In [9]:
#removing nulls
labels["label"].fillna(0, inplace=True)
labels["label"] = labels["label"].apply(lambda label: label if label != None else 0)
#concat labels to the documents dataframe
labels = labels.reindex(index=labels.index[::-1])

In [10]:

labels.head()

Unnamed: 0,file_name,label
11428,file_11428,0
11427,file_11427,0
11426,file_11426,0
11425,file_11425,0
11424,file_11424,0


In [11]:
# labels["label"][:limiter]
# data["labels"] = labels["label"][:limiter]
data["labels"] = ""
i = 1
for file_name in data["file_name"]:
  index = data[data["file_name"] == file_name].index.values[0]
  label = labels[labels["file_name"] == file_name]["label"]
  data["labels"][index] = label.values[0] if len(label) > 0 else 0
  # print(data["labels"][data.iloc[data["file_name"] == file_name].index])
# data["labels"] = labels["label"][:limiter]
data.head()

Unnamed: 0,file_name,text,labels
0,file_10215,noise measure of lossy tunnel diode amplifier ...,0
1,file_10861,the use of probing electrodes in the study of ...,0
2,file_10982,travelling pressure waves associated with geom...,0
3,file_10952,the australian national radio astronomy observ...,0
4,file_10269,cosmic ray phenomena during the november solar...,0


configs

*   only necessary in google colab



In [12]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

tokenization

In [13]:
def tokenize(word_list):
  tokenized_text = []
  for text in word_list:
    tokenized_text.append(word_tokenize(text))
  return tokenized_text
data["text"] = list(tokenize(data["text"]))
  

In [14]:
data.head()

Unnamed: 0,file_name,text,labels
0,file_10215,"[noise, measure, of, lossy, tunnel, diode, amp...",0
1,file_10861,"[the, use, of, probing, electrodes, in, the, s...",0
2,file_10982,"[travelling, pressure, waves, associated, with...",0
3,file_10952,"[the, australian, national, radio, astronomy, ...",0
4,file_10269,"[cosmic, ray, phenomena, during, the, november...",0


change case to lower case

In [15]:

data['text'] = data['text'].apply(lambda list: [txt.lower() for txt in list])
data.head()

Unnamed: 0,file_name,text,labels
0,file_10215,"[noise, measure, of, lossy, tunnel, diode, amp...",0
1,file_10861,"[the, use, of, probing, electrodes, in, the, s...",0
2,file_10982,"[travelling, pressure, waves, associated, with...",0
3,file_10952,"[the, australian, national, radio, astronomy, ...",0
4,file_10269,"[cosmic, ray, phenomena, during, the, november...",0


stop word removal

In [16]:

stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda list: [txt for txt in list if not txt in stop_words])
data.head()


Unnamed: 0,file_name,text,labels
0,file_10215,"[noise, measure, lossy, tunnel, diode, amplifi...",0
1,file_10861,"[use, probing, electrodes, study, ionosphere, ...",0
2,file_10982,"[travelling, pressure, waves, associated, geom...",0
3,file_10952,"[australian, national, radio, astronomy, obser...",0
4,file_10269,"[cosmic, ray, phenomena, november, solar, dist...",0


stemming

In [17]:
porter_stemmer = PorterStemmer()
data['text'] = data['text'].apply(lambda list: [porter_stemmer.stem(txt) for txt in list])
data.head()

Unnamed: 0,file_name,text,labels
0,file_10215,"[nois, measur, lossi, tunnel, diod, amplifi, s...",0
1,file_10861,"[use, probe, electrod, studi, ionospher, measu...",0
2,file_10982,"[travel, pressur, wave, associ, geomagnet, act...",0
3,file_10952,"[australian, nation, radio, astronomi, observa...",0
4,file_10269,"[cosmic, ray, phenomena, novemb, solar, distur...",0


### **generating an inverted index of terms**
here we are going to create a dictionary with all words as keys and the values are the doc file names containing the word
*   *NOTE: The get_inverted_index function is memory and cpu intensive so it may take some time*



In [18]:
#first we get all words
wordlist = [txt for list in data['text'] for txt in list]
#get only unique values using set
wordlist = set(wordlist)
wordlist = list(wordlist)
wordlist

['transient',
 'elast',
 'mz',
 'planar',
 'uncoupl',
 'thermospher',
 'al',
 'calcul',
 'tie',
 'suffici',
 'procedur',
 'archipelago',
 'dsb',
 'immedi',
 'unfocus',
 'concentr',
 'maxwel',
 'beta',
 'subtract',
 'labori',
 'power',
 'integ',
 'juli',
 'prebunch',
 'address',
 'zone',
 'discret',
 'upon',
 'tl',
 'circumst',
 'lf',
 'piec',
 'unidirect',
 'portion',
 'mention',
 'group',
 'multipli',
 'review',
 'express',
 'tempor',
 'magnon',
 'around',
 'withou',
 'form',
 'design',
 'receiv',
 'driven',
 'conjunct',
 'batteri',
 'striplin',
 'deriv',
 'commut',
 'inter',
 'adjoin',
 'take',
 'infin',
 'cathod',
 'binari',
 'whisper',
 'ultrason',
 'abl',
 'english',
 'durat',
 'magnetostrict',
 'similar',
 'possibl',
 'text',
 'follow',
 'demonstr',
 'amplifi',
 'seriou',
 'discover',
 'spheric',
 'potenti',
 'sponsor',
 'withstand',
 'lesser',
 'cluster',
 'invari',
 'thermodynam',
 'manufactur',
 'reflect',
 'incid',
 'maximum',
 'alkalin',
 'lead',
 'magneto',
 'hobart',
 'see

In [19]:
def get_inverted_index():
  word_dict = defaultdict(set)  
  for i, doc in enumerate(data['text']):
    for word in wordlist:
      if word in doc:
        word_dict[word].add(data.iloc[i, 0])
  return word_dict

word_dict = get_inverted_index()
# word_dict = pd.DataFrame(get_inverted_index())
# word_dict.head()
word_dict

defaultdict(set,
            {'calcul': {'file_10009',
              'file_10032',
              'file_10053',
              'file_10072',
              'file_10077',
              'file_10087',
              'file_10107',
              'file_10117',
              'file_10124',
              'file_10125',
              'file_10135',
              'file_10146',
              'file_10215',
              'file_10221',
              'file_10228',
              'file_10235',
              'file_10240',
              'file_10242',
              'file_10244',
              'file_10250',
              'file_10268',
              'file_10286',
              'file_10290',
              'file_10388',
              'file_10431',
              'file_10444',
              'file_10484',
              'file_10485',
              'file_10528',
              'file_10579',
              'file_10588',
              'file_10681',
              'file_10713',
              'file_10770',
              'file_1

# **modelling**

get inverse document frequency
*   idf = log(total docs/total docs containing term)



In [20]:
def get_idf(word):
  return math.log(data.shape[0] / (len(word_dict[word])))


get rsv ranking weights


1.   generate rsv of each word based on the whole training documents
2.   generate rsv of doc based on specific doc provided



In [21]:
def get_relevant_docs(docs):
  
  files = []
  for file_name in list(docs):
    if int(data[data['file_name'] == file_name]["labels"]) == 1:
      files.append( file_name )
  return files


def generate_rsv_weights():
  weights = {}
  for word in wordlist:
    terms_in_relevant_docs = len(get_relevant_docs(word_dict[word]))
    prob_of_word = terms_in_relevant_docs/(data.shape[0]+0.5)
    _log = math.log(prob_of_word/(1-prob_of_word)) if prob_of_word > 0 else 0
    weights[word] = get_idf(word) + _log
  return weights

weights = generate_rsv_weights()

In [22]:
weights

{'transient': 4.8283137373023015,
 'elast': 6.907755278982137,
 'mz': 6.907755278982137,
 'planar': 6.214608098422191,
 'uncoupl': 6.907755278982137,
 'thermospher': 6.907755278982137,
 'al': 5.809142990314028,
 'calcul': -4.093844437180419,
 'tie': 6.907755278982137,
 'suffici': 4.605170185988092,
 'procedur': 4.268697949366879,
 'archipelago': 6.907755278982137,
 'dsb': 6.907755278982137,
 'immedi': 6.907755278982137,
 'unfocus': 6.907755278982137,
 'concentr': 4.710530701645918,
 'maxwel': 6.907755278982137,
 'beta': 6.907755278982137,
 'subtract': 6.214608098422191,
 'labori': 6.907755278982137,
 'power': 3.101092789211817,
 'integ': 6.907755278982137,
 'juli': 5.298317366548036,
 'prebunch': 6.907755278982137,
 'address': 5.809142990314028,
 'zone': 4.710530701645918,
 'discret': 6.214608098422191,
 'upon': 6.907755278982137,
 'tl': 6.907755278982137,
 'circumst': 6.907755278982137,
 'lf': 6.907755278982137,
 'piec': 6.907755278982137,
 'unidirect': 6.907755278982137,
 'portion': 

getting total doc rsv score

In [23]:
#calculates the rsv of a document based on the query provided
def generate_doc_rsv(file_name, query):
  result = 0
  doc_list = data[data["file_name"] == file_name]["text"]
  for doc in doc_list:
    for word in doc:
      if word in query:
        result += weights[word]
  return result


rank docs by query provided

In [24]:

#retreive all docs containing our word
#get the rsv score of the retrieved docs
def rank_docs(query):
  results = {
      'file_name': [],
      'score': []
  }
  docs = list()
  for word in query:
    docs = [*docs, *list(word_dict[word])]
  for file_name in docs:
    results['file_name'].append(file_name)
    results['score'].append(generate_doc_rsv(file_name, query))
  results = pd.DataFrame(results).sort_values('score', ascending=False)
  return results


# search results/ prediction

In [25]:
#preprocess the query first then rank
def search_query(search_query, retrieved_docs = 10):
  #tokenize query
  query = list(tokenize([search_query]))[0]
  #lower case
  query = [txt.lower() for txt in query]
  #stop words
  query = [txt for txt in query if not txt in stop_words]
  #stemming
  query = [porter_stemmer.stem(txt) for txt in query]
  return rank_docs(query).head(retrieved_docs)

get the query

In [26]:

query_path = "/content/gdrive/MyDrive/data/HW04/query.txt"
query = pd.read_csv(query_path, names=["query"], sep=",", header=None)
query.head()

Unnamed: 0,query
0,secondary emission of electrons by positive io...


get first top 10 results

In [27]:
result = search_query(query["query"][0], 10)
result


Unnamed: 0,file_name,score
212,file_10965,7.459403
197,file_11118,7.459403
191,file_10261,7.459403
190,file_10328,7.459403
245,file_10869,6.214608
205,file_10048,3.729701
210,file_10786,3.729701
209,file_10012,3.729701
207,file_11236,3.729701
206,file_10353,3.729701
