In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import pandas as pd
import evaluate
import numpy as np
from datasets import Dataset

In [1]:
from sklearn.metrics import f1_score, confusion_matrix

In [4]:
def clean_dataset(filename, small_segment=True):
    dataset = pd.read_csv(filename)
    dataset = dataset[["segment_25","Class"]]
    dataset = dataset.rename(columns = {"segment_25":"text","Class":"label"})
    return dataset

dataset = clean_dataset('training_data_segmented.csv')
dataset.head()

Unnamed: 0,text,label
0,"['; Gus Smith, 21, shot by police In the attem...",1
1,"['and of gunmen, wbleh the police said numbere...",0
2,"['LPHIA - (AP) Hundreds of police, aided by st...",0
3,"[""dent of Michigan's black police association,...",1
4,"['n, maintenance, military police and medical ...",0


In [5]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased2')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased2')

In [6]:
def tokenize(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

In [7]:
dataset = Dataset.from_pandas(dataset)
dataset = dataset.map(tokenize, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
ds = dataset.train_test_split(test_size = .2, shuffle=True, seed=42)

In [9]:
metric = evaluate.load('accuracy.py')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average="weighted")
    cm = confusion_matrix(labels, predictions)
    accuracy = metric.compute(predictions=predictions, references=labels)
    return {"f1": f1, "confusion_matrix": cm, "accuracy": accuracy}

In [10]:
from transformers import TrainingArguments, Trainer

train_args = TrainingArguments(
    output_dir = "police_classifier",
    eval_strategy = "epoch",
    push_to_hub = False)


trainer = Trainer(
    model=model,
    args = train_args,
    train_dataset = ds['train'],
    eval_dataset = ds['test'],
    compute_metrics = compute_metrics)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,F1,Confusion Matrix,Accuracy
1,No log,0.585561,0.681445,[[ 4 8]  [ 0 17]],{'accuracy': 0.7241379310344828}
2,No log,0.538919,0.726127,[[ 9 3]  [ 5 12]],{'accuracy': 0.7241379310344828}
3,No log,0.488841,0.783303,[[ 7 5]  [ 1 16]],{'accuracy': 0.7931034482758621}


TypeError: Object of type ndarray is not JSON serializable

In [11]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.4888414442539215, 'eval_f1': 0.7833030852994555, 'eval_confusion_matrix': array([[ 7,  5],
       [ 1, 16]]), 'eval_accuracy': {'accuracy': 0.7931034482758621}}


In [10]:
sample = pd.read_csv('validation_data.csv')
sample = sample[["segment_25"]]
sample = sample.rename(columns = {"segment_25":"text"})

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased2')
model = AutoModelForSequenceClassification.from_pretrained('bert25_classification')

sample = Dataset.from_pandas(sample)
sample = sample.map(tokenize, batched=True)


  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
clf = TextClassificationPipeline(model=model, tokenizer=tokenizer)

results = clf(sample['text'])

for text, result in zip(sample['text'], results):
    print(f"Text: {text}, Prediction: {result}")

Device set to use cpu


Text: ['7:30 p.m. to 9 p.m. TITLEOFFICER OFFICERA Title Insurance Company'], Prediction: {'label': 'LABEL_0', 'score': 0.7746497988700867}
Text: ['ork April 15 to elect theofficerofficer. The 1936 grots operatin'], Prediction: {'label': 'LABEL_0', 'score': 0.8294180631637573}
Text: ['MEANWHILE, in Amritsar, apolice policeofficial and a Sikh docto', 'id the doctor. The deputypolice policesuperintendent, who helpe', 're giving instructions topolicepolice, judiciary and other off'], Prediction: {'label': 'LABEL_0', 'score': 0.7229017615318298}
Text: ['Sam" Bernstein, both withpolice policerecords, are back in the '], Prediction: {'label': 'LABEL_0', 'score': 0.7942268252372742}
Text: [' Ohio 45891-0151 SECURITYOFFICEROFFICERS $300 Employment Bonus N'], Prediction: {'label': 'LABEL_0', 'score': 0.8049235939979553}
Text: ['uals may face approachingpolicepolice. Duggan said the efforts', ' go beyond policepolice, that outreach outreach ', 'h the outstanding job thepolice policedepartment is 

In [None]:
print(results)

In [12]:
df = pd.DataFrame.from_dict(results)
df.head()

Unnamed: 0,label,score
0,LABEL_0,0.77465
1,LABEL_0,0.829418
2,LABEL_0,0.722902
3,LABEL_0,0.794227
4,LABEL_0,0.804924


In [13]:
df2 = pd.read_csv('validation_data.csv')
evals = pd.concat([df2,df], axis=1)
evals.head()


Unnamed: 0.1,Unnamed: 0,GOID,Title,Text,Date,segment_25,segment_50,label,score
0,990,1818420866,"May 30, 1961 (Page 15 of 20)","Death Notice NIAKHAKOS POTA, 43, of J9M Lakawo...",1961-05-30,['7:30 p.m. to 9 p.m. TITLEOFFICER OFFICERA Ti...,['U 1-5640. Call Wednesday 7:30 p.m. to 9 p.m....,LABEL_0,0.77465
1,12645,1816373616,"April 11, 1937 (Page 64 of 128)",14 THE DETROIT FREE PRESS SUNDAY. APRIL II. HJ...,1937-04-11,['ork April 15 to elect theofficerofficer. The...,['latter will meet in New York April 15 to ele...,LABEL_0,0.829418
2,20558,1822566298,"June 14, 1984 (Page 12 of 246)","12A DETROIT FREE PRESSTHURSDAY, JUNE 14, 1984 ...",1984-06-14,"['MEANWHILE, in Amritsar, apolice policeoffici...","['the holiest Sikh shrine. MEANWHILE, in Amrit...",LABEL_0,0.722902
3,21480,1816748409,"August 12, 1945 (Page 3 of 88)",Ace Red Chiefs WTio Beat Nazis Direct War on J...,1945-08-12,"['Sam"" Bernstein, both withpolice policerecord...","['es that Louis and ""Black Sam"" Bernstein, bot...",LABEL_0,0.794227
4,23052,1822832574,"November 16, 1987 (Page 46 of 92)","6D DETROIT FREE PRESSMONDAY, NOVEMBER new busi...",1987-11-16,[' Ohio 45891-0151 SECURITYOFFICEROFFICERS $30...,"['NY P.O. Box 151 Van Wert, Ohio 45891-0151 SE...",LABEL_0,0.804924


In [14]:
evals.to_csv("bert25_evals.txt", index=False)

In [15]:
data_to_export = "bert25_evals.txt"

upload: ./bert25_evals.txt to s3://pq-tdm-studio-results/tdm-ale-data/a2535/results/bert25_evals.txt
