In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
pd.set_option("display.max_colwidth", None)

# Model 1: looking for similar texts

In [4]:
df = pd.read_csv("data.csv")
df = df[["code", "description/en", "meta/lastUpdated"]].rename(columns={"code": "title", "description/en": "description", "meta/lastUpdated": "date"})
df["combined_text"] = df.title + " " + df.description.fillna("")
df.dropna(inplace=True)
df.head(5)

Unnamed: 0,title,description,date,combined_text
0,AD-COA,"A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about",2022-03-23,"AD-COA A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about"
1,AE-ACCI,There are 2 types of search available: Commercial search and Industrial search. You can also use an online enquiry form to find the required information. The contact details of Ajman Chamber and the heads of its sectors are also available.,2017-11-16,AE-ACCI There are 2 types of search available: Commercial search and Industrial search. You can also use an online enquiry form to find the required information. The contact details of Ajman Chamber and the heads of its sectors are also available.
2,AE-ADCD,"This register includes information on companies' Unified no., Membership no., name, address, phone number, email, activity etc.",2017-11-16,"AE-ADCD This register includes information on companies' Unified no., Membership no., name, address, phone number, email, activity etc."
3,AE-AFZ,"The authority's website can be used to find its address, email and call center contacts. No clear search functionality directly on the website.",2017-11-16,"AE-AFZ The authority's website can be used to find its address, email and call center contacts. No clear search functionality directly on the website."
4,AE-COA,"A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about",2022-03-23,"AE-COA A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about"


In [5]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df.combined_text)

pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()).head(5)

Unnamed: 0,000,0001,00074,001,01,02,03,05,0614,092eab90,...,корхоналар,матични,ташкилотларнинг,ягона,الرقم,الضرائب,بيانات,تسجيل,دافعي,กระทรวงมหาดไทย
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
input_text = ""
input_vector = vectorizer.transform([input_text])

if input_text == "":
    similarity_matrix = cosine_similarity(X=tfidf_matrix)
else:
    similarity_matrix = cosine_similarity(X=input_vector, Y=tfidf_matrix)

best_match_index = np.argmax(similarity_matrix)
top_indices = np.argsort(similarity_matrix[0])[::-1][:5]
df.iloc[top_indices][["title", "description", "date"]]

Unnamed: 0,title,description,date
0,AD-COA,"A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about",2022-03-23
136,DO-COA,"A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about",2022-03-23
252,IS-COA,"A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about",2022-03-23
245,IN-COA,"A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about",2022-03-23
237,IE-COA,"A government's budget (or 'Chart of Accounts') often refers to government agencies, departments and ministries with stable codes. These can be reliably used in open data publications as identifiers.\n\nThis org-id.guide entry is generated and maintained by Gov Org ID Finder [1] from Development Initiatives. Where available, Gov Org ID Finder extracts and makes Chart of Accounts codes available for the convenience of users. The authoritative source remains the government's budget or Chart of Accounts.\n\n[1] https://gov-id-finder.codeforiati.org/about",2022-03-23


# Model 2: searching for sequences - Markov chain

In [7]:
data = {
    'title': ['Event 1', 'Event 2', "test", 'Event 3', 'Event 4', "test", 'Event 1', 'Event 2', "test", 'Event 3', 'Event 4'],
    'description': ['Description of event 1', 'Description of event 2', "test", 'Description of event 3', 'Description of event 4', "test", 'Description of event 1', 'Description of event 2', "test", 'Description of event 3', 'Description of event 4'],
    'date': ['2025-02-01', '2025-02-02', "test", '2025-02-03', '2025-02-04', "test", '2025-02-01', '2025-02-02', "2021-01-01", '2025-02-03', '2025-02-04']
}

# data = {
#     'title': ['Event 1', 'Event 2', 'Event 3', 'Event 4', 'Event 1', 'Event 2', 'Event 3'],
#     'description': ['Description of event 1', 'Description of event 2', 'Description of event 3', 'Description of event 4', 'Description of event 1', 'Description of event 2','Description of event 3'],
#     'date': ['2025-02-01', '2025-02-02', '2025-02-04', '2025-02-01', '2025-02-02', "2021-01-01", '2025-02-03']
# }

df_2 = pd.DataFrame(data)
df_2["combined_text"] = df_2.title + " " + df_2.description

df_2['next_combined_text'] = df_2['combined_text'].shift(-1)

transitions = pd.crosstab(df_2['combined_text'], df_2['next_combined_text'], dropna=False)
transition_probabilities = transitions.div(transitions.sum(axis=1), axis=0).fillna(0)

last_event = df_2['combined_text'].iloc[-1]
probabilities = transition_probabilities.loc[last_event]


predicted_combined_text = probabilities.idxmax()
predicted_probability = probabilities.max()


print(predicted_probability, predicted_combined_text, probabilities, end="\n\n")

if not pd.isna(predicted_combined_text):
    predicted_row = df_2[df_2['combined_text'] == predicted_combined_text].iloc[0]
    print(predicted_row)


0.5 test test next_combined_text
Event 1 Description of event 1    0.0
Event 2 Description of event 2    0.0
Event 3 Description of event 3    0.0
Event 4 Description of event 4    0.0
test test                         0.5
NaN                               0.5
Name: Event 4 Description of event 4, dtype: float64

title                                           test
description                                     test
date                                            test
combined_text                              test test
next_combined_text    Event 3 Description of event 3
Name: 2, dtype: object
