In [1]:
import pandas as pd

In [31]:
df = pd.read_csv("/kaggle/input/news-dataset/augumentated_df.csv")

In [32]:
df = df[['news','political']]

In [33]:
df

Unnamed: 0,news,political
0,Individual investors are piling into Vietnam s...,Center
1,A recent Madras High Court ruling has come as ...,Center
2,Follow us on Google News2019: A year that chan...,Center
3,Nedbank Group Ltd. has committed 50 billion ra...,Center
4,West Bengal Chief Minister Mamata Banerjee on ...,Center
...,...,...
5825,A love jihad has come from in named Firoz lure...,Right
5826,Udhayanidhi Stalin: The new competitor to Rahu...,Right
5827,Udhayanidhi Stalin: The new competitor to Rahu...,Right
5828,Udhayanidhi Stalin: The new competitor to Rahu...,Right


In [34]:
import re

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation (optional; BERT can handle this)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [None]:
df['news'] = df['news'].apply(clean_text)

In [35]:
df

Unnamed: 0,news,political
0,individual investors are piling into vietnam s...,Center
1,a recent madras high court ruling has come as ...,Center
2,follow us on google news2019 a year that chang...,Center
3,nedbank group ltd has committed 50 billion ran...,Center
4,west bengal chief minister mamata banerjee on ...,Center
...,...,...
5825,a love jihad has come from in named firoz lure...,Right
5826,udhayanidhi stalin the new competitor to rahul...,Right
5827,udhayanidhi stalin the new competitor to rahul...,Right
5828,udhayanidhi stalin the new competitor to rahul...,Right


In [36]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=1024)

# Apply tokenization
encoded_texts = tokenizer(df['news'].tolist(), padding=True, truncation=True, return_tensors='pt')


In [37]:
# Extract input_ids and attention_mask
input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']

In [38]:
input_ids

tensor([[  101,  3265,  9387,  ...,  8571,  7570,   102],
        [  101,  1037,  3522,  ...,  2008,  1996,   102],
        [  101,  3582,  2149,  ..., 14093,  1037,   102],
        ...,
        [  101, 20904, 26115,  ...,  5936,  2055,   102],
        [  101, 20904, 26115,  ...,  2591,  3425,   102],
        [  101, 20904, 26115,  ...,     0,     0,     0]])

In [39]:
attention_masks

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])

In [41]:
from sklearn.preprocessing import LabelEncoder

# Assuming labels are in a column 'label'
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['political'])

In [42]:
labels = df['encoded_labels'].tolist()

In [43]:
df

Unnamed: 0,news,political,encoded_labels
0,individual investors are piling into vietnam s...,Center,0
1,a recent madras high court ruling has come as ...,Center,0
2,follow us on google news2019 a year that chang...,Center,0
3,nedbank group ltd has committed 50 billion ran...,Center,0
4,west bengal chief minister mamata banerjee on ...,Center,0
...,...,...,...
5825,a love jihad has come from in named firoz lure...,Right,2
5826,udhayanidhi stalin the new competitor to rahul...,Right,2
5827,udhayanidhi stalin the new competitor to rahul...,Right,2
5828,udhayanidhi stalin the new competitor to rahul...,Right,2


In [115]:
df[3000:3010]

Unnamed: 0,news,political,encoded_labels
3000,national conference candidates farooq abdullah...,Left,1
3001,national abdullah friday campaigning of congre...,Left,1
3002,ddugjy that is not gibberish its an acronym wo...,Left,1
3003,ddugjy that is not gibberish its an acronym wo...,Left,1
3004,ddugjy that is not gibberish its an acronym wo...,Left,1
3005,ddugjy is gibberish acronym worth a cool rs th...,Left,1
3006,madhu trehan welcome to candidates 2014 on fac...,Left,1
3007,madhu trehan welcome to candidates 2014 on fac...,Left,1
3008,madhu trehan welcome to candidates 2014 on fac...,Left,1
3009,madhu to candidates on facebook live here i am...,Left,1


In [45]:
#labels

In [46]:
encoded_texts['input_ids'].shape

torch.Size([5830, 512])

In [47]:
encoded_texts['attention_mask'].shape

torch.Size([5830, 512])

In [49]:
from sklearn.model_selection import train_test_split

# Perform stratified split
train_input_ids, test_input_ids, train_attention_masks, test_attention_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42, stratify=labels
)

In [51]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item

    def __len__(self):
        return len(self.labels)


In [52]:
print(input_ids.shape)  # Should be (num_samples, max_length)
print(attention_masks.shape)  # Should be (num_samples, max_length)
print(len(labels))  # Should match the number of samples


torch.Size([5830, 512])
torch.Size([5830, 512])
5830


In [53]:
from torch.utils.data import DataLoader

train_dataset = NewsDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = NewsDataset(test_input_ids, test_attention_masks, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


In [54]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



2024-08-11 17:35:30.476366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-11 17:35:30.476510: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-11 17:35:30.608911: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


fc328ac81913508564c61af3d38ac8f6f7feb4e4              ### wandb api

In [55]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


Epoch,Training Loss,Validation Loss
1,0.8121,0.569055
2,0.3647,0.23306
3,0.1106,0.125571


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


TrainOutput(global_step=1749, training_loss=0.372625967105366, metrics={'train_runtime': 1084.9724, 'train_samples_per_second': 12.896, 'train_steps_per_second': 1.612, 'total_flos': 3681482940850176.0, 'train_loss': 0.372625967105366, 'epoch': 3.0})

In [70]:
results = trainer.evaluate()
print(results)

  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


{'eval_loss': 0.12557117640972137, 'eval_runtime': 18.0064, 'eval_samples_per_second': 64.755, 'eval_steps_per_second': 8.108, 'epoch': 3.0}


In [112]:
# Create a Trainer object
trainer = Trainer(
    model=model,
    args=training_args,  # Define your training arguments here
    eval_dataset=test_dataset,  # Use eval_dataset or test_dataset
    tokenizer=tokenizer
)

# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)

  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


{'eval_loss': 0.12557117640972137, 'eval_runtime': 18.4456, 'eval_samples_per_second': 63.213, 'eval_steps_per_second': 7.915}


In [113]:
import torch
from sklearn.metrics import accuracy_score

# Move model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.eval()  # Set model to evaluation mode

all_predictions = []
all_labels = []

with torch.no_grad():  # Disable gradient calculation
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get predictions
        predictions = torch.argmax(logits, dim=-1)

        # Collect predictions and labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute accuracy
accuracy = accuracy_score(all_labels, all_predictions)
print(f'Accuracy: {accuracy:.4f}')


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


Accuracy: 0.9760


In [109]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define a class mapping (as before)
class_mapping = {0: 'center', 1:'left' , 2: 'right'}

# Prepare lists to collect true labels and predictions
all_predictions = []
all_labels = []

model.eval()  # Set model to evaluation mode

with torch.no_grad():  # Disable gradient calculation
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Append predictions and labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert numerical labels to class names
all_predictions = [class_mapping[prediction] for prediction in all_predictions]
all_labels = [class_mapping[label] for label in all_labels]

# Compute metrics
accuracy = accuracy_score(all_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')

# Print metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


Accuracy: 0.9760
Precision: 0.9762
Recall: 0.9760
F1 Score: 0.9760


In [67]:
def new_clean_text(text):
    # Join the list into a single string
    text = ''.join(text)
    
    # Remove any unwanted characters or patterns (e.g., special characters, multiple spaces)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    return text

In [105]:
# Example new text
new_text = ["""›BJP says Narendra Modi believes in 'simple living, high ideals', rejects Congress allegations
The Economic Times daily newspaper is available online now.
BJP says Narendra Modi believes in 'simple living, high ideals', rejects Congress allegations
SECTIONS
BJP says Narendra Modi believes in 'simple living, high ideals', rejects Congress allegations
PTI
Last Updated: Apr 16, 2019, 09:59:00 PM IST
Rate Story
Comment
Synopsis
The BJP said the Congress has levelled false allegations regarding Modi's property and counter-accused the Gandhi family of having acquired a lot of assets over years with little-known sources.
PTI
NEW DELHI: Asserting that Prime Minister Narendra Modi has often donated his personal savings to public causes, the BJP Tuesday said he has always believed in the principle of "simple living, high ideals", as the saffron party slammed the Congress for accusing that he had given "wrong information" about a plot he owned in Gandhinagar.
The BJP said the Congress has levelled false allegations regarding Modi's property and counter-accused the Gandhi family of having acquired a lot of assets over years with little-known sources.
The Congress has in the past dismissed the BJP's charge of financial irregularities against the Gandhi family as false and has also termed action by probe agencies as political vendetta.
At a press conference, the Congress alleged on Tuesday Modi had declared in an election affidavit in 2007 that he was the sole owner of Plot 411 in Sector 1 of Gandhinagar, but omitted that information in his election affidavits afterwards and gave details of another piece of land.
"In the light of discrepancies in the affidavit, the Congress party demands that the Election Commission of India takes thorough cognisance of the seemingly deliberate omissions in Modi's affidavit and take appropriate action under the Representation of People Act," Congress spokesperson Pawan Khera said.
Putting up a strong defence of Modi, the ruling party said his life is an example of believing in "simple living and high ideals".
The BJP said Modi recently donated Rs 21 lakh from his personal savings to the welfare fund of sanitary employees during the ' Kumbh Mela '.
In 2014, Modi had donated a similar amount of money from his savings for the education of daughters of Gujarat government employees after he quit as the state's chief minister to take up the prime minister's post, it said.
The BJP said that the prime minister had donated USD 200,000, which he received on being conferred with Seoul Peace Prize , to "Namami Gange" project, aimed at cleaning the river.
He had also donated Rs 11.73 crore after auctioning various gifts he had received as prime minister, the BJP said.
Refuting the Congress allegation regarding the Gandhinagar plot, the BJP said four plots purchased by four different persons, including Modi, were amalgamated into one on April 25, 2008 and added that it is obvious that the number of the amalgamated plot is different from the separate individual numbers.
"The caravan of falsehood moves on. There is a bankruptcy of issues in the Congress," it said.
A similar charge is made against Finance Minister Arun Jaitley, the BJP noted, saying, "there is a limit to trivialisation". Jaitley owned one of the four plots merged later.
(You can now subscribe to our Economic Times WhatsApp channel )
Read More News on
"""]
text = new_clean_text(new_text)

In [99]:
# Example new text
new_text1 = ["""In a significant move, Rahul Gandhi has accepted the Congress Working Committee’s decision to become the Leader of the Opposition in the Lok Sabha. This decision was made public on Tuesday, just hours before the commencement of the 18th Lok Sabha. This marks a crucial moment in Indian politics as the opposition, led by Congress, prepares to challenge the ruling BJP-led NDA coalition.

First Major Battle: Speaker Election
The first significant confrontation between the opposition and the NDA is set for the Speaker’s election. Traditionally, the Deputy Speaker position is given to an opposition member. However, lacking assurance on this tradition, Congress and the INDIA bloc have decided to contest the Speaker’s post. They have nominated K Suresh against the BJP’s candidate, Om Birla, who held the position in the previous Lok Sabha.Symbolic Yet Significant Contest
Although the election for the Speaker’s post might be largely symbolic, given the NDA’s simple majority with 293 MPs and additional support from the YSR Congress, it sends a strong message. The opposition aims to signal a shift in parliamentary dynamics compared to 2014 and 2019.

Empowering the Opposition
Appointing Rahul Gandhi, a prominent figure in Congress, as the Leader of the Opposition is a strategic move. This is the first time since 2014 that an opposition party has met the 54-seat threshold required to claim the post. This position, which comes with cabinet rank, will allow Gandhi to effectively raise issues on behalf of the opposition and the INDIA bloc. These issues include the alleged leaks in various examinations, such as NEET-UG, and the controversial Agnipath scheme for Army recruitment.

Role of the Leader of the Opposition
As the Leader of the Opposition, Rahul Gandhi will join key panels alongside the Prime Minister, such as those selecting Election Commissioners and the CBI director. This constitutional role also grants him the opportunity to engage with visiting heads of state, offering his perspective on national matters.

Announcement of the Decision
The INDIA bloc’s floor leaders finalized the decision to appoint Rahul Gandhi during a meeting at Congress President Mallikarjun Kharge’s residence. KC Venugopal, a senior party leader, announced that Sonia Gandhi, the Congress Parliamentary Party chairperson, had written to Pro-Tem Speaker Bhartruhari Mahtab, informing him of Rahul Gandhi’s appointment.

Congress Working Committee Resolution
The Congress Working Committee had resolved on June 8, shortly after the Lok Sabha election results, to appoint Rahul Gandhi as the Leader of the Opposition. The resolution praised his contributions to the party’s campaign, particularly through the Bharat Jodo Yatra and Bharat Jodo Nyay Yatra, which many saw as pivotal in reviving the party’s fortunes.

Reluctance and Acceptance
Although initially hesitant, Rahul Gandhi eventually accepted the role, encouraged by his mother Sonia Gandhi and sister Priyanka Gandhi. Congress President Kharge humorously warned him of disciplinary action if he did not comply with the Committee’s decision.

First Constitutional Role
This will be Rahul Gandhi’s first official constitutional role, despite his long tenure in Parliament since 2004. During the UPA rule from 2004 to 2014, party leaders urged him to take a Cabinet position, but he declined. Gandhi’s public image has evolved significantly, particularly following his extensive Bharat Jodo Yatra, which played a crucial role in boosting the Congress’s seat tally in the latest elections.

Impact on Congress and the Opposition
Rahul Gandhi’s appointment as the Leader of the Opposition will strengthen Congress’s position and the broader opposition alliance. The INDIA bloc, which includes major parties like the Samajwadi Party, Trinamool Congress, and DMK, aims to challenge the BJP’s dominance in the Lok Sabha.

This strategic positioning underscores the opposition’s intent to actively participate and influence parliamentary proceedings, reflecting a more robust and dynamic political landscape in India."""]

In [100]:
text = new_clean_text(new_text1)
text

'in a significant move rahul gandhi has accepted the congress working committees decision to become the leader of the opposition in the lok sabha this decision was made public on tuesday just hours before the commencement of the 18th lok sabha this marks a crucial moment in indian politics as the opposition led by congress prepares to challenge the ruling bjpled nda coalition\n\nfirst major battle speaker election\nthe first significant confrontation between the opposition and the nda is set for the speakers election traditionally the deputy speaker position is given to an opposition member however lacking assurance on this tradition congress and the india bloc have decided to contest the speakers post they have nominated k suresh against the bjps candidate om birla who held the position in the previous lok sabhasymbolic yet significant contest\nalthough the election for the speakers post might be largely symbolic given the ndas simple majority with 293 mps and additional support from 

In [106]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load the model and tokenizer
#model = BertForSequenceClassification.from_pretrained('/kaggle/working/saved_model')
#tokenizer = BertTokenizer.from_pretrained('/kaggle/working/saved_model')

# Move model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenize the new text
encoding = tokenizer(
    text,
    padding=True,
    truncation=True,
    return_tensors='pt',
    max_length=512
)

# Move tensors to the appropriate device
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

# Define the class mapping
class_mapping = {0: 'center', 1: 'left', 2: 'right'}

# Make prediction
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Map predictions to class names
predicted_labels = [class_mapping[prediction.item()] for prediction in predictions]

# Print predictions
print(predicted_labels)


['right']


In [None]:
# Print predictions
print(predicted_labels)


In [74]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [72]:
# Save the model
model.save_pretrained('/kaggle/working/saved_model')

# Save the tokenizer
tokenizer.save_pretrained('/kaggle/working/saved_model')


('/kaggle/working/saved_model/tokenizer_config.json',
 '/kaggle/working/saved_model/special_tokens_map.json',
 '/kaggle/working/saved_model/vocab.txt',
 '/kaggle/working/saved_model/added_tokens.json')