In [None]:
%%capture
!pip install pyserini
!pip install dice-ml
!pip install faiss-cpu --no-cache
!pip install lightgbm
!pip install psutil
!python -m spacy download en_core_web_sm

In [None]:
import numpy as np
import pandas as pd
import torch
# Sklearn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

#Pyserini imports
from pyserini.search import LuceneSearcher, get_topics
from pyserini.search.lucene import LuceneImpactSearcher
# DiCE imports
import dice_ml
from dice_ml import Dice
from dice_ml.utils import helpers  # helper functions



In [None]:
# Searcher over the prebuilt 'msmarco-passage' Lucene index
# (the index archive is downloaded on first use).
Lsearcher = LuceneSearcher.from_prebuilt_index(
    'msmarco-passage'
)

# Impact searcher over the prebuilt 'msmarco-v1-passage-unicoil' index; the second
# argument names the 'castorini/unicoil-msmarco-passage' checkpoint that is fetched
# alongside the index.
LIsearcher = LuceneImpactSearcher.from_prebuilt_index(
    'msmarco-v1-passage-unicoil',
    'castorini/unicoil-msmarco-passage'
)

Downloading index at https://rgw.cs.uwaterloo.ca/pyserini/indexes/index-msmarco-passage-20201117-f87c94.tar.gz...


index-msmarco-passage-20201117-f87c94.tar.gz: 2.07GB [00:51, 43.1MB/s]                            


Attempting to initialize pre-built index msmarco-v1-passage-unicoil.
Downloading index at https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-unicoil.20221005.252b5e.tar.gz...


lucene-index.msmarco-v1-passage-unicoil.20221005.252b5e.tar.gz: 1.08GB [00:33, 34.9MB/s]                            


Extracting /root/.cache/pyserini/indexes/lucene-index.msmarco-v1-passage-unicoil.20221005.252b5e.tar.gz into /root/.cache/pyserini/indexes/lucene-index.msmarco-v1-passage-unicoil.20221005.252b5e.29521fa94165e87caaaddcb5b0d37b13...
Initializing msmarco-v1-passage-unicoil...


Downloading (…)lve/main/config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

In [None]:
def _select_tfidf_vocabulary(tfidf_scores, feature_names, words_per_doc, max_words, exclude=()):
    """Return up to ``max_words`` distinct words ranked by their best TF-IDF weight.

    For every document (row of ``tfidf_scores``) the ``words_per_doc`` heaviest
    terms are taken as candidates; candidates are then ranked by the highest
    weight they reach in any document. Terms listed in ``exclude`` are skipped.
    """
    best_weight = {}
    for row in tfidf_scores:
        # Stable sort keeps the ranking deterministic when weights tie.
        for col in np.argsort(-row, kind="stable")[:words_per_doc]:
            col = int(col)
            weight = float(row[col])
            # A zero weight means the term does not occur in this document,
            # so it can never enter the candidate set through this row.
            if weight > best_weight.get(col, 0.0):
                best_weight[col] = weight

    ranked_columns = sorted(best_weight, key=best_weight.get, reverse=True)
    ranked_words = [str(feature_names[col]) for col in ranked_columns]
    return [word for word in ranked_words if word not in exclude][:max_words]


def _term_count_matrix(texts, vocabulary, analyzer):
    """Count how often each vocabulary word occurs in each text (rows: texts, columns: words)."""
    column_of = {word: j for j, word in enumerate(vocabulary)}
    counts = np.zeros((len(texts), len(vocabulary)))
    for i, text in enumerate(texts):
        for token in analyzer(text):
            j = column_of.get(token)
            if j is not None:
                counts[i, j] += 1
    return counts


def cfeir(searcher, searcher2, query, k, words_per_doc=20, max_words=30, random_state=0):
    """Build the DiCE data/model pair that explains a ranking for ``query``.

    The top ``k`` documents ranked by ``searcher2`` are turned into term-count
    vectors over a small TF-IDF-selected vocabulary, labelled 1 when their
    retrieval score is above the mean score of the retrieved set (0 otherwise),
    and used to train a random-forest classifier wrapped for DiCE.

    Parameters
    ----------
    searcher : object
        Used only to fetch document texts via ``searcher.doc(docid).raw()``.
    searcher2 : object
        Produces the ranking via ``searcher2.search(query, k=k)``; each hit must
        expose ``docid`` and ``score``.
    query : str
        Query text.
    k : int
        Number of ranked documents to explain.
    words_per_doc : int, default 20
        Number of highest-weighted TF-IDF terms considered per document.
    max_words : int, default 30
        Maximum size of the final vocabulary (feature columns).
    random_state : int, default 0
        Seed for the train/test split and the random forest.

    Returns
    -------
    tuple
        ``(d, m, datasetX)``: the ``dice_ml.Data`` object, the ``dice_ml.Model``
        wrapping the fitted pipeline, and the feature DataFrame (one row per
        retrieved document, in rank order, without the ``class`` column).

    Raises
    ------
    ValueError
        If ``searcher2`` returns no documents for ``query``.
    """
    # Ask the ranking searcher for exactly k hits so features, scores and labels
    # all describe the same documents.
    top_hits = list(searcher2.search(query, k=k))[:k]
    if not top_hits:
        raise ValueError(f"No documents retrieved for query: {query!r}")

    # NOTE(review): for some prebuilt indexes raw() may return the stored record
    # (e.g. JSON with field names) rather than bare passage text - confirm.
    document_texts = [searcher.doc(hit.docid).raw() for hit in top_hits]

    # TF-IDF over the retrieved documents, purely alphabetic tokens.
    vectorizer = TfidfVectorizer(max_features=10000, token_pattern=r'\b[A-Za-z]+\b')
    tfidf_scores = vectorizer.fit_transform(document_texts).toarray()
    feature_names = vectorizer.get_feature_names_out()

    # Rank words by TF-IDF weight (not by their column index). "class" is
    # reserved for the outcome column added below, so it cannot be a feature.
    vocabulary = _select_tfidf_vocabulary(
        tfidf_scores, feature_names, words_per_doc, max_words, exclude={"class"}
    )

    # Count terms with the vectorizer's own analyzer so that counting uses the
    # same lowercasing/tokenisation as the vocabulary selection.
    document_vectors = _term_count_matrix(
        document_texts, vocabulary, vectorizer.build_analyzer()
    )

    # Label each document: 1 if its score is above the mean score, else 0.
    scores = [hit.score for hit in top_hits]
    score_threshold = np.mean(scores)
    labels = [1 if score > score_threshold else 0 for score in scores]
    for hit, label in zip(top_hits, labels):
        print(f"\nDocument ID: {hit.docid} Document score: {hit.score} Classification: {label}")

    document_df = pd.DataFrame(document_vectors, columns=vocabulary)
    document_df["class"] = labels

    target = document_df["class"]
    datasetX = document_df.drop('class', axis=1)

    # Only the training part is used below; the split is kept so the classifier
    # is fitted on a stratified 80% subset as before.
    x_train, _, y_train, _ = train_test_split(datasetX,
                                              target,
                                              test_size=0.2,
                                              random_state=random_state,
                                              stratify=target)

    # Every feature is a numeric term count, so only a numeric branch is needed.
    continuous_features_ = datasetX.columns.tolist()
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])
    transformations = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, continuous_features_)])

    # Full prediction pipeline: preprocessing followed by a seeded classifier.
    clf = Pipeline(steps=[('preprocessor', transformations),
                          ('classifier', RandomForestClassifier(random_state=random_state))])
    model = clf.fit(x_train, y_train)

    d = dice_ml.Data(dataframe=document_df,
                     continuous_features=continuous_features_,
                     outcome_name="class")

    # The model type is passed explicitly to DiCE.
    m = dice_ml.Model(model=model, backend="sklearn", model_type='classifier')

    return d, m, datasetX

In [None]:
query = "feeding rice cereal how many times per day"

# Baseline run: the same Lucene searcher ranks the documents and supplies their text.
# (The previous `searcher` name was never defined in this notebook and raised
# NameError on a fresh kernel.)
d, m, datasetX = cfeir(searcher=Lsearcher, searcher2=Lsearcher, query=query, k=10)

# Genetic counterfactual search for the document at rank position 4 (0-based),
# asking for 2 counterfactuals that flip it to class 1.
exp_genetic = Dice(d, m, method="genetic")
TL_Model = exp_genetic.generate_counterfactuals(datasetX[4:5], total_CFs=2, desired_class=1)

TL_Model.visualize_as_dataframe()


Document ID: 1159 Document score: 16.748355865478516 Classification: 1

Document ID: 8307809 Document score: 15.947338104248047 Classification: 1

Document ID: 1158 Document score: 14.88834285736084 Classification: 1

Document ID: 6999562 Document score: 14.44862174987793 Classification: 0

Document ID: 4500095 Document score: 14.282904624938965 Classification: 0

Document ID: 7777888 Document score: 13.819817543029785 Classification: 0

Document ID: 7257982 Document score: 13.796950340270996 Classification: 0

Document ID: 5858596 Document score: 13.746607780456543 Classification: 0

Document ID: 6835062 Document score: 13.687074661254883 Classification: 0

Document ID: 7772792 Document score: 13.667708396911621 Classification: 0


100%|██████████| 1/1 [00:06<00:00,  6.40s/it]

Query instance (original outcome : 0)





Unnamed: 0,watch,way,use,tsp,wks,try,the,typically,that,variety,...,this,to,vegetable,your,weevils,well,times,you,was,class
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,watch,way,use,tsp,wks,try,the,typically,that,variety,...,this,to,vegetable,your,weevils,well,times,you,was,class
0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,2.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [None]:
# Same query as above, but the ranking comes from the impact searcher (LIsearcher)
# while document texts are still read through Lsearcher.
d, m, datasetX = cfeir(
    searcher=Lsearcher,
    searcher2=LIsearcher,
    query=query,
    k=10,
)

# Genetic counterfactual search for the document at rank position 7 (0-based),
# asking for 2 counterfactuals with class 1 as the desired outcome.
exp_genetic = Dice(d, m, method="genetic")
LSR_Model = exp_genetic.generate_counterfactuals(
    datasetX[7:8],
    total_CFs=2,
    desired_class=1,
)
LSR_Model.visualize_as_dataframe()


Document ID: 1160 Document score: 1882.0914306640625 Classification: 1

Document ID: 1159 Document score: 1711.84521484375 Classification: 1

Document ID: 3055042 Document score: 1563.91357421875 Classification: 1

Document ID: 8307809 Document score: 1561.8851318359375 Classification: 0

Document ID: 2176410 Document score: 1547.8985595703125 Classification: 0

Document ID: 1161 Document score: 1513.944091796875 Classification: 0

Document ID: 4500093 Document score: 1505.284912109375 Classification: 0

Document ID: 4810216 Document score: 1461.0523681640625 Classification: 0

Document ID: 7772792 Document score: 1460.411865234375 Classification: 0

Document ID: 6034490 Document score: 1427.7357177734375 Classification: 0


100%|██████████| 1/1 [00:00<00:00,  1.03it/s]

Query instance (original outcome : 0)





Unnamed: 0,when,watch,way,wants,year,water,tsp,try,want,worse,...,work,your,we,times,well,too,totals,usually,veg,class
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,when,watch,way,wants,year,water,tsp,try,want,worse,...,work,your,we,times,well,too,totals,usually,veg,class
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
