In [5]:
import torch
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:
#printing data from json file



# Specify the path to the JSON file
file_path = "data/train.json"

# Open the JSON file
with open(file_path) as file:
    # Load the JSON data
    data = json.load(file)

for element in data:
    print(element)


Politics
Health
Finance
Travel
Food
Education
Environment
Fashion
Science
Sports
Technology
Entertainment


We have three examples per category which is not a lot. The examples are short sentences which makes it almost impossible for td-idf and word2vec to work properly. Training seems difficult so we could try few shot learning for example, we could use a model as encoder and then usethe right distance to compare to the few examples we have. 

In [19]:
#Premiers tests avec TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a list of sentences from json file 

sentences = []

for element in data:
    for i in data[element]:
        sentences.append(i)

#print(sentences)

In [41]:

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences)

In [42]:

query = "The role of credit scores in lending decisions is significant."
query_vec = vectorizer.transform([query])
results = cosine_similarity(X,query_vec)



highest_indices = np.argsort(results.flatten())[-5:]
highest_values = results.flatten()[highest_indices]

sorted_indices = np.argsort(results.flatten())
sorted_results = results.flatten()[sorted_indices]

print(sorted_results)
print(highest_values)


for i in highest_indices:
    print(sentences[i])





[0.         0.         0.         0.02763934 0.03092786 0.03299303
 0.03318711 0.04813076 0.06174145 0.06223583 0.06223583 0.07598174
 0.08656039 0.09263897 0.09690515 0.09890436 0.1102567  0.11777686
 0.12890202 0.13902519 0.14353748 0.14738137 0.15277842 0.15362058
 0.15486086 0.17471614 0.18756061 0.20357324 0.20409224 0.21248346
 0.21257964 0.21529027 0.25866692 0.28205742 0.28772251 0.34645899]
[0.21529027 0.25866692 0.28205742 0.28772251 0.34645899]
The annual Met Gala is a major event in the fashion world.
The stock market saw a significant drop following the announcement.
The Grammy Awards are a celebration of the best music of the year.
The latest season of Game of Thrones had fans on the edge of their seats.
Climate change is causing a significant rise in sea levels.


Classification doesn't seem to work properly which seems logical because there are no common words between query and the sentences on which the model has been trained. The "documents" are too short and doesn't contain enough elements to make it work.  Let's try wordTovec on our training set. 

In [None]:
from gensim.models import Word2Vec
from multiprocessing import cpu_count

cpu = cpu_count()
print('The virtual instance has {} cpus, that will be used to train the word2vec model'.format(cpu))

# We will just get the "WordVectors" parameter from the trained Word2Vec model.
# Otherwise, we could continue training with some more exemples that could be
# fed on the fly to the model.
print("Training the W2V ...")
pol = Word2Vec(, vector_size=100, window=5, min_count=3, workers=cpu)
pol.train(cleaned_pol, total_examples=len(cleaned_pol), epochs=10)
pol_wv = pol.wv


Trying to use a pre-trained model from hugging face as a few shot classifier. 

In [29]:
#Let's make a first try with distilbert 

from transformers import DistilBertForMaskedLM, DistilBertConfig, DistilBertTokenizer, DistilBertModel

config = DistilBertConfig(vocab_size_or_config_json_file = "config.json")

model =  DistilBertModel(config)

dictio = torch.load("pytorch_model.bin", map_location=torch.device('cpu'))

#We remove "distilbert" form the fields names because it is not in the model state_file 
dictio = {k.replace("distilbert.",""):v for k,v in dictio.items()}

#We remove the pre-classifier and classifier weights
dictio = {k:v for k,v in dictio.items() if "pre_classifier" not in k and "classifier" not in k} 



model.load_state_dict(dictio)
model.eval()

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")



In [30]:
#We now test the model on a sentence 

sentence = "The role of credit scores in lending decisions is significant."

inputs = tokenizer(sentence, return_tensors="pt")

outputs = model(**inputs)

print(outputs.last_hidden_state.shape)

#We can now use the last hidden state to compute the cosine similarity between the sentence and the sentences in the json file

#We will use the last hidden state of the [CLS] token to represent the sentence

sentence_embedding = outputs.last_hidden_state[0][0]

print(sentence_embedding.shape)

#We will now compute the cosine similarity between the sentence and the sentences in the json file

cos = torch.nn.CosineSimilarity(dim=0)

cosine_similarities = []

for i in range(len(sentences)):

    inputs = tokenizer(sentences[i], return_tensors="pt")

    outputs = model(**inputs)

    ground_truth_sentence_embedding = outputs.last_hidden_state[0][0]

    cosine_similarities.append(cos(sentence_embedding, ground_truth_sentence_embedding))

cosine_similarities = torch.tensor(cosine_similarities)

highest_indices = torch.argsort(cosine_similarities, descending=True)[:1]

print(highest_indices)

for i in highest_indices:
    print(sentences[i])

    

torch.Size([1, 13, 768])
torch.Size([768])
tensor([7])
Investing in real estate can be a profitable venture if done correctly.


In [11]:
#Let's create a class for the whole few-shot classifier 

from transformers import DistilBertForMaskedLM, DistilBertConfig, DistilBertTokenizer, DistilBertModel

class FewShotClassifier:

    def __init__(self):
        self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.model = self.model_loader()
        self.ground_truth, self.sentences = self.load_train_set() #the dictionnary (labels and sentences) and the list of sentences 

    def model_loader(self):

        """This function loads the model from the files and returns it."""

        config = DistilBertConfig(vocab_size_or_config_json_file = "config.json")

        model =  DistilBertModel(config)

        dictio = torch.load("pytorch_model.bin", map_location=torch.device('cpu'))

        #We remove "distilbert" form the fields names because it is not in the model state_file 
        dictio = {k.replace("distilbert.",""):v for k,v in dictio.items()}

        #We remove the pre-classifier and classifier weights
        dictio = {k:v for k,v in dictio.items() if "pre_classifier" not in k and "classifier" not in k} 

        model.load_state_dict(dictio)
        
        return model.eval()


    def load_train_set(self) :

        """This function loads the training set from the json file and returns it."""

        file_path = "data/train.json"

        with open(file_path) as file:
            data = json.load(file)

        sentences = []

        for element in data:
            for i in data[element]:
                sentences.append(i)

        return data, sentences


    def predicition(self, sentence) -> str:

        """This function takes a sentence as input and returns the label of the closest ground_truth sentence in the latent space"""

        inputs = self.tokenizer(sentence, return_tensors="pt")

        outputs = self.model(**inputs)

        #We can now use the last hidden state to compute the cosine similarity between the sentence and the sentences in the json file

        #We will use the last hidden state of the [CLS] token to represent the sentence

        sentence_embedding = outputs.last_hidden_state[0][0]

        #We will now compute the cosine similarity between the sentence and the sentences in the json file

        cos = torch.nn.CosineSimilarity(dim=0)

        cosine_similarities = []

        for i in range(len(self.sentences)):

            inputs = self.tokenizer(self.sentences[i], return_tensors="pt")

            outputs = self.model(**inputs)

            ground_truth_sentence_embedding = outputs.last_hidden_state[0][0]

            cosine_similarities.append(cos(sentence_embedding, ground_truth_sentence_embedding))

        cosine_similarities = torch.tensor(cosine_similarities)

        highest_indices = torch.argsort(cosine_similarities, descending=True)[:10]

        results = {
        "Politics": 0,
        "Health": 0,
        "Finance": 0,
        "Travel": 0,
        "Food": 0,
        "Education": 0,
        "Environment": 0,
        "Fashion": 0,
        "Science": 0,
        "Sports": 0,
        "Technology": 0,
        "Entertainment": 0
        }

        #The chosen label will be the one which first has 2 of its sentences in the top closest sentences

        max_value = 0
        i = 0

        while max_value < 2:
            for key, value in self.ground_truth.items():
                if self.sentences[highest_indices[i]] in value:
                    results[key] += 1
                    i += 1
                    break
            max_value = max(results.values())

        max_key = max(results, key=lambda k: results[k])

        return max_key


In [2]:
#Open test_shuffle.txt file and add each row to a list 

file_path = "data/test_shuffle.txt"

with open(file_path) as file:
    test = file.readlines()

print(test)

#Create the test set class 

class TestSet:

    def __init__(self, test):
        self.test = test

    def __len__(self):
        return len(self.test)

    def __getitem__(self, idx):
        return self.test[idx]


['The role of credit scores in lending decisions is significant.\n', 'The impact of overpopulation on the environment is a topic of ongoing research.\n', 'The importance of the scientific method in conducting research cannot be overemphasized.\n', 'The startup accelerator provides funding and mentorship to help early-stage companies grow.\n', 'The benefits of biomimicry are many, including potential for developing sustainable technologies and improving efficiency.\n', 'The nanotechnology research has potential applications in electronics and materials science.\n', 'The impact of tax reform on the economy is a topic of ongoing debate.\n', 'The impact of demographic changes on the economy is a topic of concern.\n', 'The benefits of using digital fashion in fashion are many, including reduced waste and improved creativity.\n', 'The theater company collaborates with local schools to bring the arts to underserved communities.\n', 'The importance of portion control in maintaining a healthy w

In [3]:
# Create DataLoader
from torch.utils.data import DataLoader
args = {'bsize': 64}
data_loader = DataLoader(TestSet(test), batch_size=args['bsize'], num_workers=0, shuffle=False)

In [12]:
Classifier = FewShotClassifier()

In [13]:
#Test de la classe 

print(Classifier.predicition("The role of credit scores in lending decisions is significant."))

Finance


In [14]:
#Premier test visuel

for data in data_loader :
    for i in data:
        print("{} / {}".format(i, Classifier.predicition(i)))
    break



The role of credit scores in lending decisions is significant.
 / Finance
The impact of overpopulation on the environment is a topic of ongoing research.
 / Environment
The importance of the scientific method in conducting research cannot be overemphasized.
 / Science
The startup accelerator provides funding and mentorship to help early-stage companies grow.
 / Finance
The benefits of biomimicry are many, including potential for developing sustainable technologies and improving efficiency.
 / Environment
The nanotechnology research has potential applications in electronics and materials science.
 / Science
The impact of tax reform on the economy is a topic of ongoing debate.
 / Finance
The impact of demographic changes on the economy is a topic of concern.
 / Finance
The benefits of using digital fashion in fashion are many, including reduced waste and improved creativity.
 / Technology
The theater company collaborates with local schools to bring the arts to underserved communities.
 /