In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [2]:
df_temp_train = pd.read_json("data/train.json")
dict = df_temp_train.to_dict()

df_test = pd.read_csv("data/test_shuffle.txt", sep='\t',names=['sentences'])

In [4]:
liste = []
liste_label = []
for elem in dict.keys():
    for i in dict[elem].keys():
        liste.append(dict[elem][i])
        liste_label.append(elem)
        
df_train = pd.DataFrame({"sentences":liste,"labels":liste_label})
df_train.head()

Unnamed: 0,sentences,labels
0,The mayor announced a new initiative to improv...,Politics
1,The senator is facing criticism for her stance...,Politics
2,The upcoming election has sparked intense deba...,Politics
3,Regular exercise and a balanced diet are key t...,Health
4,The World Health Organization has issued new g...,Health


### tf-idf knn

In [15]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_train['sentences'])

In [16]:
query_vec = vectorizer.transform(df_test.loc[:,"sentences"])
results = cosine_similarity(X,query_vec).transpose()

In [20]:
highest_indices = np.argsort(results)[:,-1:]

In [75]:
from collections import Counter

pred = []
pred_top_n = []
for indexes in highest_indices:
    data = list(df_train.iloc[indexes,1])
    pred_top_n.append(data)
    counts = Counter(data) # TODO in case there is an equality
    most_common = max(counts, key=counts.get)
    pred.append(most_common)
    

In [76]:
index = 1
print(pred[index])
print(df_test.loc[index].values)
print(pred_top_n[index])

Entertainment
['The impact of overpopulation on the environment is a topic of ongoing research.']
['Entertainment']


### tf-idf improved

In [17]:
df_train_plus = pd.read_csv("data/backtrans_eda_train_set.txt",names=['labels','sentences'],sep='\t')
print(df_train_plus.shape)
df_train_plus.head()

(31212, 2)


Unnamed: 0,labels,sentences
0,0,the mayor announced deoxyadenosine monophospha...
1,0,the mayor announced a new to improve public de...
2,0,the mayor new a announced to improve public tr...
3,0,improve mayor announced a new to the public tr...
4,0,the mayor announced a new to improve public co...


In [18]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_train_plus['sentences'])

In [19]:
query_vec = vectorizer.transform(df_test.loc[:,"sentences"])
results = cosine_similarity(X,query_vec).transpose()

In [20]:
highest_indices = np.argsort(results)[:,-1:]

In [34]:
df_test.loc[:,"sentences"]

0       The role of credit scores in lending decisions...
1       The impact of overpopulation on the environmen...
2       The importance of the scientific method in con...
3       The startup accelerator provides funding and m...
4       The benefits of biomimicry are many, including...
                              ...                        
1135    The rock band's farewell tour marks the end of...
1136    The impact of the gig economy on travel experi...
1137    The importance of food safety cannot be overst...
1138    The American Cancer Society recommends avoidin...
1139    The quantum computer processes complex calcula...
Name: sentences, Length: 1140, dtype: object

In [33]:
df_train_plus.loc[highest_indices.flatten(),'labels']

19607    7
9427     3
21728    8
24621    9
16558    6
        ..
1511     0
8560     3
5399     2
7097     2
13657    5
Name: labels, Length: 1140, dtype: int64

In [28]:
index = 1
print(pred[index])
print(df_test.loc[index].values)
print(pred_top_n[index])

the has been severely pandemic impact
['The impact of overpopulation on the environment is a topic of ongoing research.']
['the has been severely pandemic impact']


### Embedding

In [23]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [28]:
embeddings_test = model.encode(list(df_test["sentences"]))
embedding_train = model.encode(list(df_train['sentences']))

results = cosine_similarity(embedding_train,embeddings_test).transpose()
print(df_train.loc[np.argmax(results,axis=1),"labels"])

17      Education
18    Environment
26        Science
8         Finance
26        Science
         ...     
23        Fashion
10         Travel
10         Travel
4          Health
25        Science
Name: labels, Length: 1140, dtype: object


In [31]:
list(df_test.sentences)

['The role of credit scores in lending decisions is significant.',
 'The impact of overpopulation on the environment is a topic of ongoing research.',
 'The importance of the scientific method in conducting research cannot be overemphasized.',
 'The startup accelerator provides funding and mentorship to help early-stage companies grow.',
 'The benefits of biomimicry are many, including potential for developing sustainable technologies and improving efficiency.',
 'The nanotechnology research has potential applications in electronics and materials science.',
 'The impact of tax reform on the economy is a topic of ongoing debate.',
 'The impact of demographic changes on the economy is a topic of concern.',
 'The benefits of using digital fashion in fashion are many, including reduced waste and improved creativity.',
 'The theater company collaborates with local schools to bring the arts to underserved communities.',
 'The importance of portion control in maintaining a healthy weight cann