# Information Retrieval Final Project
- TP1 : Descriptor and Inverse creation
- TP2 : Adding weights
- TP4 : Vector Space Models implementation
- TP5 : Probabilistic Model (BM25) implementation
- TP6 : Query validity
- TP7 : Boolean Model implementation
- TP8 : Metrics and Precision-Recall curve implementation


In [22]:
import tkinter as tk
from tkinter import ttk
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt
import nltk 
from Data import loadCollection, loadQueries, loadJudgements
from InverseDescriptor import Tokenization, Normalization, indexing, Descriptor, Inverse, descriptorSearch, inverseSearch, weightsCompute, update
from Vector import VSM
from Probabilistic import BM25
from Boolean import isValidQuery, Boolean
from Evaluation import evaluation


# Top-level Tk window hosting every widget of the search application
root = tk.Tk()
root.wm_title("Search Application")

# Initial window size, expressed as "<width>x<height>" in pixels
root.wm_geometry('1200x800')

# Function to perform the search
def perform_search():
    """Run the query from the search box with the selected options and show the results.

    Reads the query text and the tokenization / stemming / matching choices
    from the module-level widgets, rebuilds the weighted descriptor and
    inverse structures for the "LISA" collection, runs the selected matching
    model and writes its result into ``result_display``.  When the model
    returns a non-empty ``relevant`` value, the evaluation metrics are shown
    through ``result_text2`` and the precision-recall curve is redrawn on
    ``canvas``.

    Matching options (value of ``search_option0``):
        1 -- vector space model, similarity measure taken from ``combo``
        2 -- BM25, K and B taken from ``entry1`` / ``entry2``
        4 -- Boolean model
        any other value -- plain index lookup: ``descriptorSearch`` when the
        query consists of digits only, ``inverseSearch`` otherwise.

    Tokenization uses the regular expression when ``search_option`` is 1 and
    ``str.split`` otherwise; stemming uses Porter when ``search_option2`` is 1
    and Lancaster otherwise.

    Invalid user input (empty query, unknown similarity measure, missing or
    non-numeric BM25 parameters, non-numeric query number) is reported in the
    GUI instead of raising an exception inside the Tk callback.

    Returns:
        None.
    """
    search_term = entry.get()
    if search_term == '':
        _show_message("Error: enter a query please.")
        return None

    # Selected options from the radio buttons
    matching_option = search_option0.get()
    stemming_option = search_option2.get()
    use_regex = search_option.get() == 1
    use_porter = stemming_option == 1

    # Validate the model-specific inputs first, so that a bad value is
    # reported before the collection is loaded and indexed.
    vsmMeasure_option = None
    K = B = None
    if matching_option == 1:
        measure = combo.get()
        if measure not in choices:
            result_text2.set("")
            _show_message("Error: select a valid similarity measure.")
            return None
        vsmMeasure_option = choices.index(measure)
    elif matching_option == 2:
        if entry1.get() == '' or entry2.get() == '':
            result_text2.set("")
            _show_message("Error: Enter BM25 parameters.")
            return None
        try:
            K = float(entry1.get())
            B = float(entry2.get())
        except ValueError:
            result_text2.set("")
            _show_message("Error: BM25 parameters must be numbers.")
            return None

    # Load documents
    docs = loadCollection("LISA")

    # Query treatment: lowercase, tokenize, then stem
    lowered_term = search_term.lower()
    if use_regex:
        # Raw string: the pattern contains regex escapes (\., \d, \w, ...)
        # that are not valid Python string escapes.
        tokenizer = nltk.RegexpTokenizer(
            r'(?:[A-Za-z]\.)+|[A-Za-z]+[\-@]\d+(?:\.\d+)?|\d+[A-Za-z]+|\d+(?:[\.\,]\d+)?%?|\w+(?:[\-/]\w+)*'
        )
        query = tokenizer.tokenize(lowered_term)
    else:
        query = lowered_term.split()

    # One stemmer instance is enough for the whole query
    stemmer = nltk.PorterStemmer() if use_porter else nltk.LancasterStemmer()
    query = [stemmer.stem(term) for term in query]

    # Docs treatment: the same tokenization / stemming choice is passed to the
    # index builders as 1/0 flags (1 = RegEx / Porter, 0 = Split / Lancaster).
    tokenization_flag = 1 if use_regex else 0
    stemming_flag = 1 if use_porter else 0
    descV0 = Descriptor(docs, tokenization_flag, stemming_flag)
    invV0 = Inverse(docs, tokenization_flag, stemming_flag)
    weights = weightsCompute(descV0, invV0)
    descriptor, inverse = update(descV0, invV0, weights)

    result_text2.set("")  # Clear previous metrics

    # Stop words are removed from the query for the ranking models only
    if matching_option in (1, 2):
        stop_words = set(nltk.corpus.stopwords.words('english'))
        query = [term for term in query if term not in stop_words]

    if matching_option == 1:
        # Vector space model
        result, relevant = VSM(descriptor, inverse, weights, vsmMeasure_option, query)
    elif matching_option == 2:
        # Probabilistic model (BM25)
        result, relevant = BM25(descriptor, inverse, K, B, query)
    elif matching_option == 4:
        # Boolean model
        result, relevant = Boolean(descriptor, query)
    else:
        # No ranking model: look the raw search term up in the index files
        relevant = None
        if search_term.isdigit():
            # Digits only: treated as a document lookup
            result = descriptorSearch(search_term, descriptor, inverse)
        else:
            # Anything else: treated as a term lookup
            result = inverseSearch(search_term, stemming_option, inverse)

    # Display results in the Text widget
    _show_message(result)

    if relevant:
        try:
            query_number = int(spinbox.get())
        except ValueError:
            # Empty or non-numeric Spinbox content: metrics cannot be computed
            result_text2.set("Error: select a valid query number to compute the metrics.")
            return None

        metrics, curve = evaluation(relevant, query_number)

        # Displaying metrics
        metric_display = "".join(f"{key}: {value}\t\t" for key, value in metrics.items())
        result_text2.set(metric_display)

        # Displaying the PR curve
        ax.clear()
        ax.plot(curve["recall"], curve["precision"], color='red')
        ax.set_title('Precision-Recall Curve')
        ax.set_ylabel('Precision')
        ax.set_xlabel('Recall')
        # Draw the plot on the Tkinter canvas
        canvas.draw()

    return None


def _show_message(content):
    """Replace the content of the ``result_display`` Text widget with ``content``."""
    result_display.delete(1.0, tk.END)  # Clear previous results
    result_display.insert(tk.END, content)


def set_entry_text(*args):
    """Copy the dataset query selected in the Spinbox into the query Entry.

    The Spinbox value is read as a 1-based position in the list returned by
    ``loadQueries("LISA.QUERIES")``; element ``[1]`` of the selected item is
    written into ``entry``.

    Nothing happens when the Spinbox is empty, holds a non-numeric value, or
    points outside the loaded queries.  Without the range check, 0 or a
    negative number would silently wrap around to the end of the list and a
    number past the last query would raise ``IndexError``.

    Args:
        *args: Ignored; accepted so the function can be used directly as a
            Tk callback whatever arguments the caller passes.

    Returns:
        None.
    """
    spinbox_value_str = spinbox.get()

    # Nothing to do while the Spinbox is empty
    if not spinbox_value_str:
        return

    try:
        index = int(spinbox_value_str)
    except ValueError:
        # Non-numeric text typed into the Spinbox
        return

    queries = loadQueries("LISA.QUERIES")
    if not 1 <= index <= len(queries):
        return

    # Load the sentence corresponding to the (1-based) index
    sentence = queries[index - 1][1]

    # Set the Entry text
    entry.delete(0, tk.END)
    entry.insert(0, sentence)

# Create a label next to the entry
label = ttk.Label(root, text="Query:")
label.pack()

# Create an Entry widget for user input; perform_search reads the query from it.
# No ``text`` option is passed: ttk.Entry has no such option, and Tk resolves it
# as an abbreviation of ``textvariable``, which would tie the widget to a global
# Tcl variable named "Query" instead of labelling it.
entry = ttk.Entry(root, width=75)
entry.pack(pady=5)

# Create a "Search" Button that runs perform_search when clicked
search_button = ttk.Button(root, text="Search", command=perform_search, style='TButton')
search_button.pack()

# Create a Spinbox for selecting the line number
# (the number is used by set_entry_text as a 1-based position in the queries
# dataset, and by perform_search as the query number passed to evaluation)
spinbox_label = ttk.Label(root, text="Queries Dataset:")
spinbox_label.pack(pady=10)

# Use the StringVar to track the Spinbox value
spinbox_value = tk.StringVar()
# ``command`` runs set_entry_text when the Spinbox arrows are used.
# NOTE(review): the upper bound 100 is hard-coded; set_entry_text indexes the
# list returned by loadQueries with this value -- confirm the dataset holds
# that many queries.
spinbox = ttk.Spinbox(root, from_=1, to=100, width=5, textvariable=spinbox_value, command=set_entry_text)
spinbox.pack(pady=5)

# Set an initial value and call set_entry_text to update the Entry widget
# (the variable is set programmatically, so the callback is invoked by hand)
spinbox_value.set("1")
set_entry_text()

# Create a label to display search results
# (perform_search writes the evaluation metrics, or an empty string, into
# result_text2; the label is docked at the bottom of the window)
result_text2 = tk.StringVar()
result_label2 = tk.Label(root, textvariable=result_text2, justify="left", anchor="w", font=("TkDefaultFont", 10, "bold"), wraplength=800)
result_label2.pack(side=tk.BOTTOM, padx=5)

# Create a PanedWindow to arrange the panes
paned_window = ttk.Panedwindow(root, orient="vertical")
paned_window.pack(expand=True, fill="both")

# Create a Frame for the radio buttons
# (parent of the Matching, Tokenization and Stemming label frames)
radio_frame = ttk.Frame(paned_window)
paned_window.add(radio_frame)

# Create a LabelFrame for Matching models
matching_frame = ttk.LabelFrame(radio_frame, text="Matching")
matching_frame.pack(side="right", padx=10, pady=10)

# Create a set of radio buttons within the Matching frame.
# perform_search reads the selected value from search_option0
# (1 = vector space model, 2 = BM25, 4 = Boolean model).
# NOTE(review): value 3 ("Data Mining Model") has no dedicated branch in
# perform_search and ends up in its plain index-lookup branch.
search_option0 = tk.IntVar()
radio = ttk.Radiobutton(matching_frame, text="Vector space model", variable=search_option0, value=1)
radio01 = ttk.Radiobutton(matching_frame, text="Probabilistic model (BM25)", variable=search_option0, value=2)
radio10 = ttk.Radiobutton(matching_frame, text="Data Mining Model", variable=search_option0, value=3)
radio11 = ttk.Radiobutton(matching_frame, text="Boolean Model", variable=search_option0, value=4)

radio.grid(row=0, column=0, padx=10, pady=5, sticky="w")
radio01.grid(row=0, column=1, padx=10, pady=5, sticky="w")
radio10.grid(row=3, column=1, padx=10, pady=5, sticky="w")
radio11.grid(row=3, column=0, padx=10, pady=5, sticky="w")


# Create and place the first Entry widget (BM25 parameter K).
# The label and its entry share one grid cell: the label is pushed to the
# east side with a larger padx while the entry stays centered.
label1 = tk.Label(matching_frame, text="K")
label1.grid(row=1, column=1, padx=50, pady=10, sticky="E")
entry1 = tk.Entry(matching_frame, width=10)
entry1.grid(row=1, column=1, padx=10, pady=10)

# Create and place the second Entry widget (BM25 parameter B), same layout
label2 = tk.Label(matching_frame, text="B")
label2.grid(row=2, column=1, padx=50, pady=10, sticky="E")
entry2 = tk.Entry(matching_frame, width=10)
entry2.grid(row=2, column=1, padx=10, pady=10)

# Create a Combobox for the list of similarity measures used by the vector
# space model.  It is read-only so that only an item of ``choices`` can be
# selected: perform_search looks the selected text up in ``choices`` and a
# free-typed value would not be found there.
choices = ['Scalar product', 'Cosine', 'Jaccard']  # Add your choices here
combo = ttk.Combobox(matching_frame, values=choices, state="readonly")
combo.set(choices[0])  # Set the default choice
combo.grid(row=1, column=0, padx=10, pady=5, sticky="w")

# Create a label to display the search result
# NOTE(review): this label is created with empty text and is not updated by
# any code visible in this cell -- confirm whether it is still needed.
result_label = ttk.Label(root, text="", style='TLabel')
result_label.pack()

# Create a LabelFrame for Tokenization
tokenization_frame = ttk.LabelFrame(radio_frame, text="Tokenization")
tokenization_frame.pack(side="left", padx=10, pady=10)

# Create a set of radio buttons within the Tokenization frame
# (perform_search uses the regular-expression tokenizer when search_option is
# 1 and str.split for any other value, including the state where no button
# has been picked yet)
search_option = tk.IntVar()
radio1 = ttk.Radiobutton(tokenization_frame, text="RegEx", variable=search_option, value=1)
radio2 = ttk.Radiobutton(tokenization_frame, text="Split", variable=search_option, value=2)
radio1.grid(row=0, column=0, padx=10, pady=5, sticky="w")
radio2.grid(row=1, column=0, padx=10, pady=5, sticky="w")

# Create a LabelFrame for Stemming
stemming_frame = ttk.LabelFrame(radio_frame, text="Stemming")
stemming_frame.pack(side="left", padx=10, pady=10)

# Create a set of radio buttons
# (perform_search uses the Porter stemmer when search_option2 is 1 and the
# Lancaster stemmer for any other value, including the state where no button
# has been picked yet)
search_option2 = tk.IntVar()
radio3 = ttk.Radiobutton(stemming_frame, text="Porter", variable=search_option2, value=1)
radio4 = ttk.Radiobutton(stemming_frame, text="Lancaster", variable=search_option2, value=2)
radio3.grid(row=0, column=0, padx=10, pady=5, sticky="w")
radio4.grid(row=1, column=0, padx=10, pady=5, sticky="w")

# Create a Text widget for displaying results
# (perform_search clears it and writes the search result or an error message)
result_display = tk.Text(root, height=30, width=70)
result_display.pack(side=tk.LEFT)

# Create a Matplotlib figure and axis
# (perform_search clears ``ax`` and plots the precision-recall curve on it)
fig = Figure(figsize=(5, 4), dpi=100)
ax = fig.add_subplot(111)

# Create a Tkinter Canvas widget that can be embedded in the main window
# (perform_search calls canvas.draw() after updating the plot)
canvas = FigureCanvasTkAgg(fig, master=root)
canvas_widget = canvas.get_tk_widget()
canvas_widget.pack(side=tk.RIGHT, fill=tk.BOTH, expand=1)

# Start the main event loop
# (must stay the last statement: every widget above has to exist before it runs)
root.mainloop()

15846
('seu', {'6002': {'1': 1.8893}})
avdl 52.978847435043306
1 66
2 28
2 24
2 31
2 58
3 42
4 40
4 50
4 18
5 59
6 17
6 22
7 109
8 91
9 68
10 36
11 37
12 29
13 37
13 39
13 48
14 87
15 52
16 61
17 58
17 55
18 30
18 41
19 25
19 31
19 32
19 67
20 71
21 85
22 38
22 56
23 75
23 46
23 53
24 58
24 58
25 70
25 61
26 94
26 43
27 38
28 61
29 37
30 44
31 52
32 71
33 36
34 53
34 35
35 48
35 91
36 53
37 39
37 62
38 38
39 58
39 70
40 49
40 59
41 58
42 57
43 50
44 73
44 61
44 28
44 33
44 75
44 68
45 100
46 48
46 79
46 42
47 78
48 71
48 76
48 29
48 71
49 83
49 114
49 77
50 36
50 40
50 64
51 63
51 52
51 65
52 17
52 25
52 35
52 84
52 34
52 66
53 50
54 54
54 47
54 93
54 74
54 105
55 73
55 26
55 25
55 90
56 69
57 36
57 68
58 29
58 61
58 56
58 48
59 37
60 44
60 71
60 62
61 49
62 104
63 32
64 68
65 59
66 67
66 42
66 23
66 30
66 37
66 21
66 27
66 31
66 30
67 70
67 37
68 34
69 65
69 49
70 60
71 28
71 64
72 57
73 70
74 35
74 50
74 74
74 67
75 51
75 49
75 66
76 44
76 89
77 54
77 38
78 64
78 47
79 38
80 39
80 39