<a href="https://colab.research.google.com/github/arkistar4uu/Python_Projects/blob/main/T_D_MATRIX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from collections import defaultdict
from math import log

def my_TD_matrix(d1, d2, d3):
    # Combine all the documents into a single list
    documents = [d1, d2, d3]

    # Create a dictionary to store the document frequency of each term
    df = defaultdict(int)

    # Iterate over all the documents and count the document frequency of each term
    for doc in documents:
        for term in set(doc):
            df[term] += 1

    # Create a list of all the unique terms in the document collection
    terms = sorted(list(df.keys()))

    # Create a dictionary to store the T-D matrix
    td_matrix = defaultdict(dict)

    # Iterate over all the documents and terms and compute the T-D matrix
    for i, doc in enumerate(documents):
        for term in set(doc):
            # Compute the term frequency of the term in the document
            tf = doc.count(term)

            # Compute the inverse document frequency of the term
            idf = 1 + log(len(documents) / df[term])

            # Compute the weighted term frequency of the term in the document
            wtf = (1 + log(tf)) * idf

            # Add the weighted term frequency of the term in the document to the T-D matrix
            td_matrix[term][f'd{i+1}'] = wtf

    # Convert the dictionary to a matrix and return it
    return [[td_matrix[term][f'd{i+1}'] if f'd{i+1}' in td_matrix[term] else 0 for i in range(len(documents))] for term in terms]

# Example usage
d1 = ["python", "anaconda", "jupyter", "programming", "learn", "computer"]
d2 = ["game", "are", "world", "programming", "learn", "learn", "are"]
d3 = ["learn", "are", "world", "programming"]

td_matrix = my_TD_matrix(d1, d2, d3)
print(td_matrix)


[[2.09861228866811, 0, 0], [0, 2.3796592851687173, 1.4054651081081644], [2.09861228866811, 0, 0], [0, 2.09861228866811, 0], [2.09861228866811, 0, 0], [1.0, 1.6931471805599454, 1.0], [1.0, 1.0, 1.0], [2.09861228866811, 0, 0], [0, 1.4054651081081644, 1.4054651081081644]]


In [3]:
from math import log

def my_TD_matrix(d1, d2, d3):
    documents = [d1, d2, d3]
    terms = set(d1 + d2 + d3)
    td_matrix = {}
    
    for term in terms:
        td_matrix[term] = [0] * 3
        for i in range(3):
            doc = documents[i]
            # Count the number of times the term appears in the document
            tf = doc.count(term)
            td_matrix[term][i] = tf
            if tf > 0:
                # Compute the inverse document frequency of the term
                idf = 1 + log(len(documents) / sum(1 for doc in documents if term in doc))
                # Compute the weighted term frequency of the term in the document
                td_matrix[term][i] *= idf
    
    return td_matrix

d1 = ["python", "anaconda", "jupyter", "programming", "learn", "computer"]
d2 = ["game", "are", "world", "programming", "learn", "learn", "are"]
d3 = ["learn", "are", "world", "programming"]

td_matrix = my_TD_matrix(d1, d2, d3)
print(td_matrix)


{'programming': [1.0, 1.0, 1.0], 'computer': [2.09861228866811, 0, 0], 'game': [0, 2.09861228866811, 0], 'python': [2.09861228866811, 0, 0], 'world': [0, 1.4054651081081644, 1.4054651081081644], 'learn': [1.0, 2.0, 1.0], 'jupyter': [2.09861228866811, 0, 0], 'are': [0, 2.8109302162163288, 1.4054651081081644], 'anaconda': [2.09861228866811, 0, 0]}


In [4]:
from math import log

def my_TD_matrix(d1, d2, d3):
    documents = [d1, d2, d3]
    vocabulary = list(set(d1 + d2 + d3))
    td_matrix = []
    
    for term in vocabulary:
        tf_vector = []
        for document in documents:
            # Compute the term frequency of the term in the document
            tf = document.count(term)
            
            # Append the term frequency to the term frequency vector
            tf_vector.append(tf)
            
        # Compute the document frequency of the term
        df = sum([1 for document in documents if term in document])
        
        # Compute the inverse document frequency of the term
        idf = 1 + log(len(documents) / df)
        
        # Compute the weighted term frequency vector of the term
        wtf_vector = [tf * idf for tf in tf_vector]
        
        # Append the weighted term frequency vector to the T-D matrix
        td_matrix.append(wtf_vector)
        
    return td_matrix

def query_documents(documents, td_matrix, query):
    query_terms = query.split()
    relevant_documents = []
    
    for i in range(len(documents)):
        document = documents[i]
        relevant = True
        
        for term in query_terms:
            if term not in document:
                relevant = False
                break
                
        if relevant:
            relevant_documents.append(i+1)
            
    return relevant_documents

d1 = ["python", "anaconda", "jupyter", "programming", "learn", "computer"]
d2 = ["game", "are", "world", "programming", "learn", "learn", "are"]
d3 = ["learn", "are", "world", "programming"]

td_matrix = my_TD_matrix(d1, d2, d3)
print(td_matrix)

query = "python learn"
relevant_documents = query_documents([d1, d2, d3], td_matrix, query)
print(f"The documents containing the terms '{query}' are: {relevant_documents}")


[[1.0, 1.0, 1.0], [2.09861228866811, 0.0, 0.0], [0.0, 2.09861228866811, 0.0], [2.09861228866811, 0.0, 0.0], [0.0, 1.4054651081081644, 1.4054651081081644], [1.0, 2.0, 1.0], [2.09861228866811, 0.0, 0.0], [0.0, 2.8109302162163288, 1.4054651081081644], [2.09861228866811, 0.0, 0.0]]
The documents containing the terms 'python learn' are: [1]
