In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "./my_model_prompt2"  # or wherever you saved it
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
raw = pd.read_parquet("data/501c3_charity_geocoded_missions_clean.parquet")
data = raw.dropna(subset=['CANONICAL_MISSION'])
print("Number of samples:", len(data))

Number of samples: 567587


In [5]:
def predict_batch(texts, batch_size=16):
    preds = []
    probs = []
    device = "cuda" if torch.cuda.is_available() else "cpu"

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            batch_probs = torch.softmax(logits, dim=-1)
            batch_preds = torch.argmax(batch_probs, dim=-1)

        preds.extend(batch_preds.cpu().numpy())
        probs.extend(batch_probs.cpu().numpy())

    return preds, probs


In [6]:
texts = data["CANONICAL_MISSION"].dropna().tolist()
preds, probs = predict_batch(texts)


  5%|▍         | 1619/35475 [08:34<2:59:24,  3.15it/s]


KeyboardInterrupt: 

In [None]:
data["RELIGIOUS"] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["RELIGIOUS"] = preds


In [None]:
data

Unnamed: 0,EIN2,CANONICAL_MISSION,RELIGIOUS
1,EIN-01-0015091,provide soccer instruction to hanover township...,0
2,EIN-01-0017496,the organization's primary exempt purpose is t...,0
3,EIN-01-0024645,the bangor symphony orchestra provides powerfu...,0
4,EIN-01-0078060,henrietta d goodall hospital inc owns and oper...,0
5,EIN-01-0085716,this corporation hancock county agricultural s...,0
...,...,...,...
693041,EIN-99-6064620,to provide perpetual support for designated be...,0
693042,EIN-99-6074970,provide scholarships and grants to children an...,0
693043,EIN-99-6078202,philanthropic and educational projects,0
693044,EIN-99-6078252,"""""the trustee shall distribute all of the net ...",0


In [None]:
data.RELIGIOUS.value_counts()

RELIGIOUS
0    471429
1     96158
Name: count, dtype: int64

In [None]:
data.to_parquet('data/501c3_charity_geocoded_missions_clean_classified_prompt2.parquet', index=False)

In [None]:
# RELIGIOUS
# first model
# 0    471429
# 1     96158

# second model
# 0    488977
# 1     78610