In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import torch

In [2]:
from fewshot.data.loaders import (
    load_or_cache_data,
    _load_agnews_dataset,
    _create_dataset_from_df,
)

from fewshot.data.utils import select_subsample, expand_labels

from fewshot.eval import predict_and_score

from fewshot.utils import fewshot_filename, torch_load, torch_save, pickle_load

from fewshot.models.few_shot import (
    FewShotLinearRegression,
    BayesianMSELoss,
    prepare_dataloader,
    train,
)

In [3]:
filename = "../data/bulletin-dataset-tm-fsl.csv"
test_filename = "../data/test-bulletin-dataset-tm-fsl.csv"

In [4]:
df = pd.read_csv(filename)
df.head()

Unnamed: 0,label,text,category
0,0,The Associate offices ordinarily do not issue ...,legislation
1,0,The taxpayer is strongly encouraged to inform ...,legislation
2,0,"When filing the request, the taxpayer must ide...",legislation
3,0,The taxpayer also must notify the Associate of...,legislation
4,0,The question must be on the interpretation and...,legislation


In [5]:
df_test = pd.read_csv(test_filename)
df_test.head()

Unnamed: 0,label,text,category
0,0,Are copies of the enacting legislation providi...,legislation
1,0,The question must be on the interpretation and...,legislation
2,2,Whether § 302(b) applies to a redemption of st...,corporation
3,2,The tax effects of a transaction in which ther...,corporation
4,1,The Office of Associate Chief Counsel (Tax Exe...,compensation


In [6]:
DATASET_NAME = "bulletin-dataset-tm-fsl"
DATADIR = f"../data/{DATASET_NAME}"

# convert that DataFrame to a Dataset
ds_filename = fewshot_filename(f"{DATADIR}/{DATASET_NAME}_train_dataset.pkl")

if os.path.exists(ds_filename):
    df_train_subset = pickle_load(ds_filename)
else:
    df_train_subset = _create_dataset_from_df(
        df, text_column="text", filename=ds_filename
    )

In [7]:
ds_filename = fewshot_filename(f"{DATADIR}/{DATASET_NAME}_test_dataset.pkl")

df_test_subset = _create_dataset_from_df(
    df_test, text_column="text", filename=ds_filename
)
if os.path.exists(ds_filename):
    test_dataset = pickle_load(ds_filename)
else:
    test_dataset = _create_dataset_from_df(
        df, text_column="text", filename=ds_filename
    )

Computing sentence representations: 100%|████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.79s/it]

C:\Users\UIF13879\Documents\Anuj Shah\School\ISR\Project\Final Project\Few-Shot-Learning-with-Topic-Modeling\few-shot-learning-II\few-shot-text-classification\..\data\bulletin-dataset-tm-fsl\bulletin-dataset-tm-fsl_test_dataset.pkl already exists! Please use overwrite flag.





In [8]:
# this is required due the particular implementation details of our Dataset class
df_train_subset = expand_labels(df_train_subset)

In [9]:
Zmap = torch.load(fewshot_filename("data/maps/Zmap_20000_words.pt"))

In [10]:
data_loader = prepare_dataloader(df_train_subset, Zmap)

In [11]:
# instantiate the model
device = "cpu"
learning_rate = 0.1
lambda_regularization = 1500
num_epochs = 2000

In [12]:
fewshot_model = FewShotLinearRegression(
    Zmap.size()[1],
    Zmap.size()[1],
    loss_fcn=BayesianMSELoss(device=device),
    lr=learning_rate,
    device=device,
)

In [13]:
loss_history = train(
    fewshot_model, data_loader, num_epochs=num_epochs, lam=lambda_regularization
)

In [14]:
Wmap = fewshot_model.linear.weight.detach().cpu()

In [15]:
score = predict_and_score(
    test_dataset, linear_maps=[Zmap, Wmap], return_predictions=False
)
print(score)

50.0
