In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import evaluate
import numpy as np


In [2]:
def clean_dataset(filename):
    dataset = pd.read_csv(filename)
    dataset = dataset[["corrected_text","Class"]]
    dataset = dataset.rename(columns = {"corrected_text":"text","Class":"label"})
    return dataset

dataset = clean_dataset('post_correction.csv')
dataset.head()

Unnamed: 0,text,label
0,"1. Gus Smith, 21, shot by police in the attemp...",1
1,"So, the police said, the gunmen numbered betwe...",0
2,"3 'LPHIA - (AP) Hundreds of police, aided by s...",0
3,by members of the police department's controve...,1
4,"['n, maintenance, military police and medical ...",0


In [3]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased2')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased2')

In [4]:
from datasets import Dataset

dataset = Dataset.from_pandas(dataset)


def tokenize(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

dataset = dataset.map(tokenize, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
mds = dataset.train_test_split(test_size = .2, shuffle=True, seed=42)

In [6]:
metric = evaluate.load('accuracy.py')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [7]:
from transformers import TrainingArguments, Trainer

train_args = TrainingArguments(
    output_dir = "police_classifier",
    eval_strategy = "epoch",
    push_to_hub = False)


trainer = Trainer(
    model=model,
    args = train_args,
    train_dataset = mds['train'],
    eval_dataset = mds['test'],
    compute_metrics = compute_metrics)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.661461,0.62069
2,No log,0.729905,0.586207
3,No log,0.748531,0.62069


TrainOutput(global_step=42, training_loss=0.5537548065185547, metrics={'train_runtime': 375.0937, 'train_samples_per_second': 0.896, 'train_steps_per_second': 0.112, 'total_flos': 88405314600960.0, 'train_loss': 0.5537548065185547, 'epoch': 3.0})

## Test Data

In [None]:
results = trainer.evaluate()
print(results)

In [None]:
sample = pd.read_csv('validation_data.csv')
sample = sample[["segment_25"]]
sample = sample.rename(columns = {"segment_25":"text"})

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased2')
model = AutoModelForSequenceClassification.from_pretrained('bert25_classification')

sample = Dataset.from_pandas(sample)
sample = sample.map(tokenize, batched=True)