# Cranfield dataset retrieval

## Install dependencies and import libs

In [1]:
!pip install ir_datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.0/318.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.8/111.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m379.9/379.9 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for warc3-wet-clueweb09 (setup.py) ... [?25l[?25hdone
  Building wheel for zlib-state (setup.py) ... [?25l[?25hdone
  Building wheel for cbor (setup.py) ... [?25l[?25hdone


In [2]:
!pip install nltk --quiet

In [3]:
import ir_datasets

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

## Prepare preprocessor

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import math
from tqdm.notebook import tqdm

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(doc):
    tokens = word_tokenize(doc.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return filtered_tokens

## Define model


In [7]:
import time
def measure_execution_time(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds")
        return result
    return wrapper

### VSM - tfidf

In [8]:
class VSM_tfidf():

  def __init__(self):
    self.name = "Vector space model using TF-IDF"

  @measure_execution_time
  def fit(self, docs_p):
    df = {}           # doc frequency
    posting_list = {} # posting list (term : list of documents contain that term)
    tf = []           # term frequency in each doc

    docs = [d[1] for d in docs_p]
    self.doc_ids = [d[0] for d in docs_p]

    # init
    for doc in tqdm(docs):
      for term in doc:
        df[term] = 0
        posting_list[term] = []

    for i, doc in tqdm(enumerate(docs)):
      freq = {}
      for term in doc:
        if not term in freq:
          df[term] += 1
          posting_list[term].append(i)
        if term in freq:
          freq[term] += 1
        else:
          freq[term] = 1

      for term in freq:
        freq[term] /= len(doc)
        # normalize short / long doc by this

      tf.append(freq)

    idf = {}
    for term in posting_list:
      idf[term] = math.log2(len(tf) / df[term])

    self.tf = tf
    self.idf = idf
    return self

  def search(self, query):
    sims = [(self.doc_ids[index], self._sim(query, index)) for index in range(len(self.tf))]
    return sims

  def _sim(self, query, index):
    sim = 0.0

    # init query tf
    qtf = {}
    for term in query:
      qtf[term] = 0
    for term in query:
      qtf[term] += 1

    frequencies = self.tf[index]

    for term in query:
        if term not in frequencies:
            continue
        freq = frequencies[term]

        w_doc = freq  * self.idf[term]
        w_query = qtf[term] * self.idf[term]

        sim += w_doc * w_query
    return sim

### VSM - word2vec


In [9]:
class VSM_word2vec():

  def __init__(self, num = 100):
    self.name = "Vector space model using word2vec"
    self.w2v_model = None
    self.num = num

  def get_embedding_w2v(self, doc_tokens):
    embeddings = []
    if len(doc_tokens)<1:
      return np.zeros(self.num)
    else:
      for tok in doc_tokens:
        if tok in self.w2v_model.wv.key_to_index:
          embeddings.append(self.w2v_model.wv[tok])
        else:
          embeddings.append(np.random.rand(self.num))
      # mean the vectors of individual words to get the vector of the document
    return np.mean(embeddings, axis=0)

  @measure_execution_time
  def fit(self, docs_p):
    docs = [doc[1] for doc in docs_p]
    self.w2v_model = gensim.models.Word2Vec(docs, vector_size = self.num)
    self.vectors = [self.get_embedding_w2v(doc) for doc in tqdm(docs)]
    self.docs = [' '.join(doc) for doc in docs]
    self.doc_ids = [doc[0] for doc in docs_p]
    return self

  def search(self, query):
    sims = [(self.doc_ids[index], self._sim(query, index)[0][0]) for index in range(len(self.docs))]
    return sims

  def _sim(self, query, index):
    return cosine_similarity(self.get_embedding_w2v(query).reshape(1,-1), self.vectors[index].reshape(1,-1))

  def most_similar(self, query):
    sims = [(self.doc_ids[index], self._sim(query, index)[0][0], self.docs[index]) for index in range(len(self.docs))]
    return sorted(sims, key=lambda x : -x[1])

### VSM - fasttext

In [10]:
class VSM_FastText():

    def __init__(self, num=100):
        self.name = "Vector space model using FastText"
        self.ft_model = None
        self.num = num

    def get_embedding_ft(self, doc_tokens):
        embeddings = []
        if len(doc_tokens) < 1:
            return np.zeros(self.num)
        else:
            for tok in doc_tokens:
                embeddings.append(self.ft_model.wv[tok])
            # Mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0)

    @measure_execution_time
    def fit(self, docs_p):
        docs = [doc[1] for doc in docs_p]
        # Train the FastText model
        self.ft_model = gensim.models.FastText(docs, vector_size=self.num, window=5, min_count=1, sg=1)
        self.vectors = [self.get_embedding_ft(doc) for doc in tqdm(docs)]
        self.docs = [' '.join(doc) for doc in docs]
        self.doc_ids = [doc[0] for doc in docs_p]
        return self

    def search(self, query):
        sims = [(self.doc_ids[index], self._sim(query, index)[0][0]) for index in range(len(self.docs))]
        return sims

    def _sim(self, query, index):
        return cosine_similarity(self.get_embedding_ft(query).reshape(1, -1), self.vectors[index].reshape(1, -1))

    def most_similar(self, query):
        sims = [(self.doc_ids[index], self._sim(query, index)[0][0], self.docs[index]) for index in range(len(self.docs))]
        return sorted(sims, key=lambda x: -x[1])


## Prepare data

In [11]:
# dataset2 = ir_datasets.load("wikir/en1k/training")
dataset = ir_datasets.load("cranfield")
# dataset = ir_datasets.load("cord19/trec-covid/round1")

In [12]:
corpus = [(doc.doc_id, preprocess(doc.text)) for doc in tqdm(dataset.docs_iter())]

0it [00:00, ?it/s]

[INFO] [starting] http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz

http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz: 0.0%| 0.00/507k [00:00<?, ?B/s][A
http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz: 66.3%| 336k/507k [00:00<00:00, 3.28MB/s][A
[A[INFO] [finished] http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz: [00:00] [507kB] [4.66MB/s]

http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz: [00:00] [507kB] [4.39MB/s][A
                                                                                               [A

In [13]:
queries = [preprocess(query.text) for query in dataset.queries_iter()]
queries = [(str(i+1), x) for i, x in enumerate(queries)]

In [14]:
rels = {}
for qrel in dataset.qrels_iter():
  rels[qrel.query_id] = []
for qrel in dataset.qrels_iter():
  rels[qrel.query_id].append(qrel.doc_id)

## Train model

In [15]:
vsm_tfidf = VSM_tfidf()
vsm_tfidf.fit(corpus)


  0%|          | 0/1400 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Execution time: 0.3970065116882324 seconds


<__main__.VSM_tfidf at 0x797d58ce56f0>

In [16]:
vsm_w2v = VSM_word2vec(num=200)
vsm_w2v.fit(corpus)

  0%|          | 0/1400 [00:00<?, ?it/s]

Execution time: 1.8797054290771484 seconds


<__main__.VSM_word2vec at 0x797d58cbee30>

In [17]:
vsm_ft = VSM_FastText(num=200)
vsm_ft.fit(corpus)

  0%|          | 0/1400 [00:00<?, ?it/s]

Execution time: 31.882296800613403 seconds


<__main__.VSM_FastText at 0x797d58ce6770>

In [18]:
queries[0][1]

['similarity',
 'law',
 'must',
 'obeyed',
 'constructing',
 'aeroelastic',
 'model',
 'heated',
 'high',
 'speed',
 'aircraft',
 '.']

In [19]:
rels['1'][:10]

['184', '29', '31', '12', '51', '102', '13', '14', '15', '57']

## Query

In [20]:
def infer(model):
  res = {}
  search_time = 0
  for id, query in tqdm(queries):
    begin_time = time.time()
    myDocScore = model.search(query)
    search_time += time.time() - begin_time
    sorted_score = sorted(myDocScore, key=lambda item: -item[1])
    res[id] = sorted_score

  print('')
  print(search_time)
  return res

In [21]:
res_vsm_tfidf = infer(vsm_tfidf)

  0%|          | 0/225 [00:00<?, ?it/s]


5.077363967895508


In [22]:
res_vsm_w2v = infer(vsm_w2v)

  0%|          | 0/225 [00:00<?, ?it/s]


118.98234415054321


In [23]:
res_vsm_ft = infer(vsm_ft)

  0%|          | 0/225 [00:00<?, ?it/s]


129.76340508460999


## Evaluate

In [24]:
def calculate_interpolated_map(queries, relevant_docs):
    sum=0
    lens=len(queries.keys())
    for query in queries.keys():
        ranked_docs = queries[query]
        precision = []
        recall = []
        relevant = set(relevant_docs[query])
        retrieved = set()
        for i, doc in enumerate(ranked_docs):
            if doc[0] in relevant:
                retrieved.add(doc[0])
            precision.append(len(retrieved) / (i + 1))
            recall.append(len(retrieved) / len(relevant))
        precision_sum=0
        for i in range(11):
            recall_level=i/10
            precision_list=[precision[i] for i in range(len(precision)) if recall[i] >= recall_level]
            if(len(precision_list)==0):
                precision_list=0
            else:
              precision_list=max(precision_list)
            precision_sum+=(precision_list/11)
        sum+=(precision_sum/lens)

    return sum

In [25]:
calculate_interpolated_map(res_vsm_tfidf, rels)

0.3408518868633598

In [26]:
calculate_interpolated_map(res_vsm_w2v, rels)

0.045335861045249067

In [27]:
calculate_interpolated_map(res_vsm_ft, rels)

0.14759197885877143

# Text classification - Spam or Ham

## Init dataset

In [28]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.3-py3-none-a

In [29]:
from datasets import load_dataset

dataset = load_dataset("sms_spam")

Downloading builder script:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import gensim.downloader as api

In [31]:
df = pd.DataFrame(dataset['train'])

In [32]:
df.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [33]:
df.describe()

Unnamed: 0,label
count,5574.0
mean,0.134015
std,0.340699
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [34]:
# Split the data into training and testing sets
X = df['sms']  # SMS text
y = df['label']  # Spam or ham labels

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tokenized = [preprocess(text) for text in X_train]
X_test_tokenized = [preprocess(text) for text in X_test]

## TF-IDF Vectorizer


In [35]:
@measure_execution_time
def tfidf_classify():
  # TF-IDF vectorization
  tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
  X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
  X_test_tfidf = tfidf_vectorizer.transform(X_test)

  # Train a logistic regression classifier
  logistic_classifier = LogisticRegression()
  logistic_classifier.fit(X_train_tfidf, y_train)

  # Predict on the test set
  y_pred = logistic_classifier.predict(X_test_tfidf)

  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  confusion = confusion_matrix(y_test, y_pred)
  classification_rep = classification_report(y_test, y_pred)

  # Print the evaluation metrics
  print("Accuracy:", accuracy)
  print("\nConfusion Matrix:\n", confusion)
  print("\nClassification Report:\n", classification_rep)

In [36]:
tfidf_classify()

Accuracy: 0.9721973094170404

Confusion Matrix:
 [[953   1]
 [ 30 131]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       954
           1       0.99      0.81      0.89       161

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Execution time: 0.19713139533996582 seconds


## Word2vec

In [37]:
@measure_execution_time
def w2v_classify():
  num_features = 300  # Size of Word2Vec word vectors (adjust as needed)
  window_size = 5  # Context window size (adjust as needed)
  min_word_count = 1  # Minimum word count (adjust as needed)
  sg = 1  # Skip-gram model (you can use CBOW by setting sg=0)

  word2vec_model = gensim.models.Word2Vec(
      X_train_tokenized,
      vector_size=num_features,
      window=window_size,
      min_count=min_word_count,
      sg=sg
  )

  # Convert text data to Word2Vec embeddings
  def average_word_vectors(tokenized_text, model, num_features):
      feature_vector = np.zeros((num_features,), dtype="float32")
      n_words = 0
      for word in tokenized_text:
          if word in model.wv.key_to_index:
              feature_vector = np.add(feature_vector, model.wv[word])
              n_words += 1
      if n_words > 0:
          feature_vector = np.divide(feature_vector, n_words)
      return feature_vector

  X_train_w2v = [average_word_vectors(text, word2vec_model, num_features) for text in X_train_tokenized]
  X_test_w2v = [average_word_vectors(text, word2vec_model, num_features) for text in X_test_tokenized]

  # Train a logistic regression classifier
  logistic_classifier_w2v = LogisticRegression()
  logistic_classifier_w2v.fit(X_train_w2v, y_train)

  # Predict on the test set
  y_pred_w2v = logistic_classifier_w2v.predict(X_test_w2v)

  # Evaluate the Word2Vec model
  accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
  confusion_w2v = confusion_matrix(y_test, y_pred_w2v)
  classification_rep_w2v = classification_report(y_test, y_pred_w2v)

  # Print the evaluation metrics for the Word2Vec model
  print("Word2Vec Model:")
  print("Accuracy:", accuracy_w2v)
  print("\nConfusion Matrix:\n", confusion_w2v)
  print("\nClassification Report:\n", classification_rep_w2v)

In [38]:
w2v_classify()

Word2Vec Model:
Accuracy: 0.9434977578475336

Confusion Matrix:
 [[941  13]
 [ 50 111]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       954
           1       0.90      0.69      0.78       161

    accuracy                           0.94      1115
   macro avg       0.92      0.84      0.87      1115
weighted avg       0.94      0.94      0.94      1115

Execution time: 2.5660758018493652 seconds


## Fasttext

In [39]:
@measure_execution_time
def ft_classify():
  num_features = 300  # Size of FastText word vectors (adjust as needed)
  window_size = 5  # Context window size (adjust as needed)
  min_word_count = 1  # Minimum word count (adjust as needed)
  sg = 1  # Skip-gram model (you can use CBOW by setting sg=0)

  fasttext_model = gensim.models.FastText(
      sentences=X_train_tokenized,
      vector_size=num_features,
      window=window_size,
      min_count=min_word_count,
      sg=sg
  )
  # Convert text data to FastText embeddings
  def average_word_vectors(tokenized_text, model, num_features):
      feature_vector = np.zeros((num_features,), dtype="float32")
      n_words = 0
      for word in tokenized_text:
          if word in model.wv.key_to_index:
              feature_vector = np.add(feature_vector, model.wv[word])
              n_words += 1
      if n_words > 0:
          feature_vector = np.divide(feature_vector, n_words)
      return feature_vector

  X_train_ft = [average_word_vectors(text, fasttext_model, num_features) for text in X_train_tokenized]
  X_test_ft = [average_word_vectors(text, fasttext_model, num_features) for text in X_test_tokenized]

  # Train a logistic regression classifier
  logistic_classifier_ft = LogisticRegression()
  logistic_classifier_ft.fit(X_train_ft, y_train)

  # Predict on the test set
  y_pred_ft = logistic_classifier_ft.predict(X_test_ft)

  # Evaluate the FastText model
  accuracy_ft = accuracy_score(y_test, y_pred_ft)
  confusion_ft = confusion_matrix(y_test, y_pred_ft)
  classification_rep_ft = classification_report(y_test, y_pred_ft)

  # Print the evaluation metrics for the FastText model
  print("FastText Model:")
  print("Accuracy:", accuracy_ft)
  print("\nConfusion Matrix:\n", confusion_ft)
  print("\nClassification Report:\n", classification_rep_ft)

In [40]:
ft_classify()

FastText Model:
Accuracy: 0.9524663677130045

Confusion Matrix:
 [[937  17]
 [ 36 125]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       954
           1       0.88      0.78      0.83       161

    accuracy                           0.95      1115
   macro avg       0.92      0.88      0.90      1115
weighted avg       0.95      0.95      0.95      1115

Execution time: 20.35015368461609 seconds
