In [76]:
from transformers import AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

In [77]:
##Setting display option
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [78]:
df=pd.read_parquet("https://storage.googleapis.com/safechat-gcs/safechat_data.parquet")
df.drop('created_at',axis=1,inplace=True)
df.head()

Unnamed: 0,text,removed
0,I've always been taught it emerged from the earth after an impace. That is why it has similar elemental distribution to earth,0
1,"As an ECE, my first feeling as ""HEY THAT'S NOT-"" and then I thought about all the times my co-workers couldn't even write a simple message in our communication book without making mistakes. \r\n\r\nI'm getting out of the profession.",1
2,Monday: Drug companies stock dives on good news for patients.,1
3,i learned that all hybrids are unfertile i wont read clickbaits https://biology.stackexchange.com/questions/16922/why-are-hybrids-infertile,0
4,Well i was wanting to get wasted tonight. Not so much after reading this article...,0


In [79]:
## Split the DataFrame into training and test sets
train_df = df.sample(frac=0.8, random_state=42)  ## 80% for training
train_df=train_df[['text','removed']].reset_index()
test_df = df.drop(train_df.index)  ## Remaining 20% for testing
train_df=train_df[['text','removed']].reset_index()

In [81]:
train_df.head()

Unnamed: 0,index,text,removed
0,0,This is why 1st person shooters are inaccurate,0
1,1,"The supposition here is absurd: That ""traditional male qualities"" don't place value on altruism and self care.\r\n\r\nThis ""study"" is absolute trash.",0
2,2,explain the 52% divorce rate then.,0
3,3,"Dr. Ruff, thank you for taking the time to do this AMA.\r\n\r\nI am a student about to finish his M.S. in Biology. \r\n\r\nFor someone, such as myself, wanting to teach anatomy at the high school or college level, what type of doctoral degrees do you recommend? I am having a hard time finding PhD programs in my area that are in Anatomy/Physiology. Also, do you find yourself having to periodically review any anatomy that you do teach (mechanisms of immunology, muscles in specific areas, etc.)?",0
4,4,Not the least bit surprising. I didn't read this one either.,1


In [72]:
##Converting into dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


In [73]:
test_dataset[10]

{'text': "Some more studies for those interested: http://openheart.bmj.com/content/2/1/e000196.full and https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4163969/\r\n\r\nFrom the studies and articles, evidence does not support the introduction of the guidelines imposed (30% of daily caloric intake from fats). There are studies that show reduction or modification of saturated fat in the diet have a net reduction in cardiovascular events by ~14% though. \r\n\r\nAlso, some videos from Healthcare Triage:\r\n\r\n[The Evidence for Low-Fat Diets Isn't Really There](https://www.youtube.com/watch?v=oIPl5xPYJJU)\r\n\r\n[Cholesterol Isn't Quite as Bad as You've Been Told](https://www.youtube.com/watch?v=qtqHFLcCVSs)\r\n\r\n[Trans Fats, Sugary Soda, and Effective Regulation](https://www.youtube.com/watch?v=6XDV3IwnF8Q)",
 'removed': 0}

In [36]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [37]:
##Tokenizing the words
test_dataset = test_dataset.map(preprocess_function, batched=True)
train_dataset = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/223372 [00:00<?, ? examples/s]

In [46]:
##dynamically pad the sentences to the longest length in a batch during collation
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [47]:
accuracy = evaluate.load("accuracy")

In [49]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [50]:
id2label = {0: "Not Removed", 1: "Removed"}
label2id = {"Not Removed": 0, "Removed": 1}

In [52]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.we

In [None]:
training_args = TrainingArguments(
    output_dir="model_weights",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [53]:
# ##Splitting data into train and test
# x = df[['text']]
# y = df["removed"].values

# X_train, X_test, y_train, y_test = train_test_split(
#     x,y,
#     stratify=y,
#     random_state=69,
#     test_size=0.1, shuffle=True)
# print("Shape of X_train :",X_train.shape)
# print("Shape of X_test :",X_test.shape)
# print("Shape of y_train :",y_train.shape)
# print("Shape of y_test :",y_test.shape)