# Loading the data

In [2]:
import pandas as pd
import numpy as np
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv("Datasets/news_processed_spacy.csv")
data.head()

Unnamed: 0,text,polarity,n_tokens,total_n_ents,n_org_ents,n_per_ents,n_gpe_ents,n_norp_ents,n_date_ents,entities,org_ents,per_ents,gpe_ents,norp_ents,date_ents
0,Kerry to go to Paris in gesture of sympathyU.S...,True,495,58,2,17,23,6,10,"['Kerry', 'Paris', 'State', 'John F. Kerry', '...","['State', 'the American Embassy']","['Kerry', 'John F. Kerry', 'Kerry', 'Laurent F...","['Paris', 'Paris', 'Paris', 'France', 'Sofia',...","['American', 'French', 'Israeli', 'European', ...","['Monday', 'later this week', 'Sunday', 'Thurs..."
1,The Battle of New York: Why This Primary Matte...,True,405,42,12,14,9,4,3,"['New York', 'Hillary Clinton', 'Donald Trump'...","['Trump', 'the White House', 'Trump', 'Sanders...","['Hillary Clinton', 'Donald Trump', 'Ted Cruz'...","['New York', 'Ohio', 'New York', 'New York', '...","['Republican', 'Republican', 'Democratic', 'In...","['year', 'this weekend', 'November']"
2,‘Britain’s Schindler’ Dies at 106A Czech stock...,True,148,24,4,4,8,4,4,"['Britain', 'Schindler’ Dies', 'Czech', 'Jewis...","['Schindler’ Dies', 'Winton', 'Winton', 'Winton']","['Dubbed “Britain’s Schindler', 'Nicholas Wint...","['Britain', 'Nazi Germany', 'Prague', 'Germany...","['Czech', 'Jewish', 'Jewish', 'German']","['the age of 106', 'March 1939', '2003', 'near..."
3,Fact check: Trump and Clinton at the 'commande...,True,2861,298,103,87,60,12,36,"['Trump', 'Clinton', 'Clinton', 'Donald Trump'...","['Trump', 'NBC', 'Trump', 'Obama', 'Trump', 't...","['Clinton', 'Clinton', 'Donald Trump', '• Clin...","['Iraq', 'Iraq', 'Obama', 'China', 'Saudi Arab...","['Republicans', 'Democratic', 'Republicans', '...","['Sept. 7', 'Today', 'Sept. 11, 2002', 'about ..."
4,Iran reportedly makes new push for uranium con...,True,813,77,12,7,39,4,15,"['Iran', 'U.S.', 'Iran', 'the final days', 'Th...","['The New York Times', 'Times', 'The Associate...","['Olli Heinonen', 'John Kerry', 'Edward Kenned...","['Iran', 'U.S.', 'Iran', 'Tehran', 'Russia', '...","['Western', 'French', 'German', 'Iranian']","['the final days', 'late Sunday', 'just two da..."


In [5]:
data_clean = data [["text", "polarity"]]
data_clean.head()

Unnamed: 0,text,polarity
0,Kerry to go to Paris in gesture of sympathyU.S...,True
1,The Battle of New York: Why This Primary Matte...,True
2,‘Britain’s Schindler’ Dies at 106A Czech stock...,True
3,Fact check: Trump and Clinton at the 'commande...,True
4,Iran reportedly makes new push for uranium con...,True


In [8]:
data_clean.replace({"True": 1, "Fake":0}, inplace=True)

  data_clean.replace({"True": 1, "Fake":0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.replace({"True": 1, "Fake":0}, inplace=True)


In [9]:
data_clean["polarity"].value_counts()

polarity
1    3171
0    3164
Name: count, dtype: int64

In [10]:
data_clean.shape

(6335, 2)

## Splitting the Data

In [14]:
import random

def random_indexes(number, percentage, df, used_indexes=False, used_indexes_2=False):
    
    """Manually sampling the indexes of the dataframe taking into account the previously sampled indexes 
    
    """
    possible_indexes = list(df.index)
    
    if used_indexes:
        possible_indexes = [index for index in possible_indexes if index not in used_indexes]
    if used_indexes_2:
        possible_indexes = [index for index in possible_indexes if index not in used_indexes_2]
    
    df_sampleable = df[df.index.isin(possible_indexes)]
    
    n_samples = int(percentage*number)
    df_sample = df_sampleable.sample(n_samples)
    
    return df_sample

training_portion = data_clean.copy()
testing_portion = data_clean.copy()
validation_portion = data_clean.copy()

def is_double(list_1, list_2, list_3):
    same_index = list(set(list_1).intersection(set(list_2), set(list_3)))
    if len(same_index) != 0:
        return True
    else:
        return False

while is_double(list(training_portion.index), list(testing_portion.index), list(validation_portion.index)) == True:
    training_portion = random_indexes(6335, 0.7, training_portion)
    testing_portion = random_indexes(6335, 0.15, testing_portion, used_indexes = list(training_portion.index))
    validation_portion = random_indexes(6335, 0.15, validation_portion, used_indexes = list(training_portion.index), used_indexes_2 = list(testing_portion.index))
    

In [15]:
same_index = list(set(training_portion.index).intersection(set(testing_portion.index), set(validation_portion.index)))
same_index

[]

## Transforming the DataFrames into DataLoaders

In [20]:
id2label = {"True": 1, "Fake":0}
label2id = {value: key for key, value in id2label.items()}

print(id2label)
print(label2id)

{'True': 1, 'Fake': 0}
{1: 'True', 0: 'Fake'}


In [17]:
from datasets import Dataset

train_dataloader = Dataset.from_pandas(training_portion)
validation_dataloader = Dataset.from_pandas(validation_portion)
test_dataloader = Dataset.from_pandas(testing_portion)


In [21]:
train_dataloader

Dataset({
    features: ['text', 'polarity', '__index_level_0__'],
    num_rows: 4434
})

In [22]:
train_dataloader[0]

{'text': 'IF HILLARY CLINTON IS CHARGED WITH OBSTRUCTION OF JUSTICE SHE COULD GO TO PRISON FOR 20 YEARSHome › POLITICS › IF HILLARY CLINTON IS CHARGED WITH OBSTRUCTION OF JUSTICE SHE COULD GO TO PRISON FOR 20 YEARS IF HILLARY CLINTON IS CHARGED WITH OBSTRUCTION OF JUSTICE SHE COULD GO TO PRISON FOR 20 YEARS 0 SHARES \n[10/31/16] MICHAEL SNYDER -In the world of politics, the cover-up is often worse than the original crime. It was his role in the Watergate cover-up that took down Richard Nixon, and now Hillary Clinton’s cover-up of her email scandal could send her to prison for a very, very long time. When news broke that the FBI has renewed its investigation into Hillary Clinton’s emails, it sent shockwaves throughout the political world . But this time around, we aren’t just talking about an investigation into the mishandling of classified documents. I haven’t heard anyone talking about this, but if the FBI discovers that Hillary Clinton altered, destroyed or concealed any emails that 

# Transformer Model

In [32]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline

model = "google-bert/bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
    )

batch_size = 32

tokenized_train_dataloader = train_dataloader.map(preprocess_function, batched=True, batch_size=batch_size)
tokenized_validation_dataloader = validation_dataloader.map(preprocess_function, batched=True, batch_size=batch_size)
tokenized_test_dataloader = test_dataloader.map(preprocess_function, batched=True, batch_size=batch_size)

Map: 100%|██████████| 4434/4434 [00:03<00:00, 1350.47 examples/s]
Map: 100%|██████████| 950/950 [00:00<00:00, 1527.95 examples/s]
Map: 100%|██████████| 950/950 [00:00<00:00, 1506.54 examples/s]


In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [33]:
import evaluate

f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def evaluate(prediction_array):
    logits, labels = prediction_array
    predictions = np.argmax(logits, axis=1)
    
    f1_score = f1.compute(predictions=predictions, references=labels)
    precision_score = precision.compute(predictions=predictions, references=labels)
    recall_score = recall.compute(predictions=predictions, references=labels)

    return f1_score, precision_score, recall_score

Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<?, ?B/s]
Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 7.55MB/s]
Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<?, ?B/s]


In [34]:
num_epochs = 30
learning_rate = 0.00005

training_args = TrainingArguments(
    output_dir="Transformer Model",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataloader,
    eval_dataset=tokenized_validation_dataloader,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=evaluate
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
[codecarbon INFO @ 01:18:02] [setup] RAM Tracking...
[codecarbon INFO @ 01:18:02] [setup] GPU Tracking...
[codecarbon INFO @ 01:18:02] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 01:18:02] [setup] CPU Tracking...
[codecarbon INFO @ 01:18:03] CPU Model on constant consumption mode: AMD Ryzen 7 5800HS with Radeon Graphics
[codecarbon INFO @ 01:18:03] >>> Tracker's metadata:
[codecarbon INFO @ 01:18:03]   Platform system: Windows-11-10.0.22631-SP0
[codecarbon INFO @ 01:18:03]   Python version: 3.12.0
[codecarbon INFO @ 01:18:03]   CodeCarbon version: 2.3.3
[codecarbon INFO @ 01:18:03]   Available RAM : 15.406 GB
[codecarbon INFO @ 01:18:03]   CPU count: 16
[codecarbon INFO @ 01:18:03]   CPU model: AMD Ryzen 7 5800HS with Radeon Graphics
[codecarbon INFO @ 01:18:03]   GPU count: 1
[codecarbon INFO @ 01:18:03]   GPU model: 1 x NVIDIA GeForce RTX 3050 Laptop GPU


In [35]:
# training_results = trainer.train()

  0%|          | 0/4170 [00:00<?, ?it/s]

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\alber\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\interactiveshell.py", line 2105, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\alber\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\ultratb.py", line 1396, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\alber\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\ultratb.py", line 1287, in structured_traceback
    return VerboseTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\alber\AppData\Local\Programs\Python\Python312\Lib\site-packages\IPython\core\ultratb.py", line 1140, in structured_traceback
    formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,
          

[codecarbon INFO @ 01:19:17] Energy consumed for RAM : 0.000024 kWh. RAM Power : 5.777113437652588 W
[codecarbon INFO @ 01:19:17] Energy consumed for all GPUs : 0.000038 kWh. Total GPU Power : 9.200102397021807 W
[codecarbon INFO @ 01:19:17] Energy consumed for all CPUs : 0.000073 kWh. Total CPU Power : 17.5 W
[codecarbon INFO @ 01:19:17] 0.000135 kWh of electricity used since the beginning.


In [None]:
evaluation = trainer.evaluate(test_dataloader)