# RL

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from transformers import AutoTokenizer, AutoModel
from flask import Flask, render_template

In [2]:
import pandas as pd


# Input CSV files (absolute, machine-specific paths) and the name of the merged output file.
file1_path = "E:/studies/project_ideatiaon/Mid Term/NDTV.csv"
file2_path = "E:/studies/project_ideatiaon/Mid Term/India Today.csv"
combined_file_path = 'combined_data.csv'

# Read each input file into its own DataFrame.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1_path, file2_path))

# Stack the rows of both frames (a plain row-wise concat, not a key-based merge);
# ignore_index=True gives the result a fresh 0..n-1 index.
df_combined = pd.concat((df1, df2), ignore_index=True)

# Write the merged frame without its index, then read the file back so the
# preview below shows what was actually stored on disk.
df_combined.to_csv(combined_file_path, index=False)
df_combined_loaded = pd.read_csv(combined_file_path)

print(df_combined_loaded)


                                                Title  \
0   Delhi govt postpones odd-even plan as air qual...   
1       Parliament Winter Session to start from Dec 4   
2   Ministry of I&B approves Comprehensive “Digita...   
3   Addressing stubble burning: Indian Agricultura...   
4   Israel and Hamas negotiating for release of ci...   
5   UK’s ICE: Anusha Shah takes over as first Indi...   
6   Israel must protect Palestinians in West Bank ...   
7   इलेक्ट्रॉनिक वस्तुओं की गारंटी या वारंटी की अव...   
8   Defence minister directs swift release of thir...   
9   India's growth outlook soars: Fitch raises med...   
10  E-Summit 2023: NIT Raipur sparks innovation an...   
11  PAK vs ENG: Pakistan captain explains how they...   
12  ICC suspends Sri Lanka Cricket with immediate ...   
13  This day, that year: South Africa returns to i...   
14  'Are you doing anything?': Green Tribunal to P...   
15  76 years ago, Gandhi told India and Pakistan h...   
16  AUS vs BAN Live: BAN lose T

In [3]:
# Preview the first five rows of the frame that was read back from the combined CSV.
df_combined_loaded.head(n=5)

Unnamed: 0,Title,Article_Content
0,Delhi govt postpones odd-even plan as air qual...,“The AQI which was 450+ has now reached around...
1,Parliament Winter Session to start from Dec 4,
2,Ministry of I&B approves Comprehensive “Digita...,
3,Addressing stubble burning: Indian Agricultura...,
4,Israel and Hamas negotiating for release of ci...,


## TEXT PREPROCESSING WITH TRANSFORMER-BASED NLP

In [4]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch

# Loading the pre-trained transformer model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Reading the combined CSV file into a Pandas DataFrame
combined_file_path = 'combined_data.csv'
df_combined = pd.read_csv(combined_file_path)

# Missing article bodies are read back from the CSV as NaN. Calling str() on NaN
# yields the literal text "nan", which would then be embedded as if it were the
# article. Replace missing bodies with an empty string before tokenising.
# NOTE(review): substituting the article's Title for a missing body may give more
# useful embeddings than an empty string — confirm which is wanted.
article_texts = df_combined['Article_Content'].fillna('').astype(str)

# One embedding vector per article, in row order.
embeddings = []

for article_content in article_texts:
    # Tokenize and encode the text, truncating to at most 512 tokens
    tokens = tokenizer(article_content, return_tensors='pt', max_length=512, truncation=True)

    # Forward pass without gradient tracking (inference only)
    with torch.no_grad():
        output = model(**tokens)

    # Hidden state at sequence position 0, squeezed to a 1-D vector and
    # converted to a NumPy array.
    cls_embedding = output['last_hidden_state'][:, 0, :].squeeze().detach().numpy()
    embeddings.append(cls_embedding)

df_combined['embeddings'] = embeddings

# NOTE(review): to_csv stores each embedding array as text, so this file may not
# round-trip back into numeric arrays; the next cell uses the in-memory column.
df_combined.to_csv('combined_data_with_embeddings.csv', index=False)

print(df_combined)

                                                Title  \
0   Delhi govt postpones odd-even plan as air qual...   
1       Parliament Winter Session to start from Dec 4   
2   Ministry of I&B approves Comprehensive “Digita...   
3   Addressing stubble burning: Indian Agricultura...   
4   Israel and Hamas negotiating for release of ci...   
5   UK’s ICE: Anusha Shah takes over as first Indi...   
6   Israel must protect Palestinians in West Bank ...   
7   इलेक्ट्रॉनिक वस्तुओं की गारंटी या वारंटी की अव...   
8   Defence minister directs swift release of thir...   
9   India's growth outlook soars: Fitch raises med...   
10  E-Summit 2023: NIT Raipur sparks innovation an...   
11  PAK vs ENG: Pakistan captain explains how they...   
12  ICC suspends Sri Lanka Cricket with immediate ...   
13  This day, that year: South Africa returns to i...   
14  'Are you doing anything?': Green Tribunal to P...   
15  76 years ago, Gandhi told India and Pakistan h...   
16  AUS vs BAN Live: BAN lose T

## Reinforcement Learning for Clustering

In [5]:
import numpy as np
import random


# The number of clusters is fixed at 5 to keep the example simple.
num_clusters = 5

# Q-learning hyperparameters.
alpha, gamma, epsilon = 0.1, 0.9, 0.1  # learning rate, discount factor, exploration probability

# States: one entry per article, holding that article's embedding.
state_space = df_combined['embeddings'].to_numpy()

# Actions: the cluster ids 0..num_clusters-1; taking an action means assigning
# the current article to that cluster.
action_space = [cluster_id for cluster_id in range(num_clusters)]

# Q-table with one row per state and one column per action, all zeros to start.
num_states, num_actions = len(state_space), len(action_space)
Q = np.zeros((num_states, num_actions))

# Training the Q-learning model

def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a random member of ``action_space`` is
    returned (exploration). Otherwise the index of the largest value in
    ``Q[state]`` is returned (exploitation); ``np.argmax`` picks the first
    index when several values tie.

    Reads the module-level ``epsilon``, ``action_space`` and ``Q``.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return random.choice(action_space)

    greedy_action = np.argmax(Q[state])
    return greedy_action

# Training loop
num_episodes = 1000

# Seed the stdlib RNG, which drives the start states, transitions, exploration
# and termination below, so that re-running the notebook reproduces the same
# cluster assignments.
random.seed(42)


def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors (0.0 if either has zero norm)."""
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / norm_product)


for episode in range(num_episodes):
    state = random.randint(0, len(state_space) - 1)  # start with a random state

    while True:
        action = choose_action(state)

        # Simulate assigning the article to the chosen cluster (action) and
        # moving on to another randomly chosen article.
        next_state = random.randint(0, len(state_space) - 1)

        # Reward: cosine similarity between the two articles' embeddings. The
        # dot product is divided by both norms; a raw dot product would scale
        # with embedding magnitude and is not a cosine similarity.
        # NOTE(review): the reward does not depend on `action`, so the learned
        # Q-values cannot tell clusters apart by content — revisit the reward
        # design if meaningful clusters are required.
        reward = _cosine_similarity(state_space[state], state_space[next_state])

        # Q-value update: move Q[state, action] towards the TD target by alpha.
        td_target = reward + gamma * np.max(Q[next_state])
        Q[state, action] = Q[state, action] + alpha * (td_target - Q[state, action])

        state = next_state

        # End the episode with probability 0.1 after each step (a simple
        # termination condition for illustration).
        if random.uniform(0, 1) < 0.1:
            break

# Each article is assigned to the action (cluster) with the highest Q-value in its row.
cluster_assignments = np.argmax(Q, axis=1)

# Adding the cluster assignments to the DataFrame
df_combined['cluster'] = cluster_assignments

# Saving the DataFrame with cluster assignments to a new CSV file
df_combined.to_csv('combined_data_with_clusters.csv', index=False)
print(df_combined)

                                                Title  \
0   Delhi govt postpones odd-even plan as air qual...   
1       Parliament Winter Session to start from Dec 4   
2   Ministry of I&B approves Comprehensive “Digita...   
3   Addressing stubble burning: Indian Agricultura...   
4   Israel and Hamas negotiating for release of ci...   
5   UK’s ICE: Anusha Shah takes over as first Indi...   
6   Israel must protect Palestinians in West Bank ...   
7   इलेक्ट्रॉनिक वस्तुओं की गारंटी या वारंटी की अव...   
8   Defence minister directs swift release of thir...   
9   India's growth outlook soars: Fitch raises med...   
10  E-Summit 2023: NIT Raipur sparks innovation an...   
11  PAK vs ENG: Pakistan captain explains how they...   
12  ICC suspends Sri Lanka Cricket with immediate ...   
13  This day, that year: South Africa returns to i...   
14  'Are you doing anything?': Green Tribunal to P...   
15  76 years ago, Gandhi told India and Pakistan h...   
16  AUS vs BAN Live: BAN lose T

## News Categorization

In [6]:
import pandas as pd

def categorize_news(cluster):
    """Translate a cluster id into its news-category label.

    Args:
        cluster: Cluster id to look up; ids 0 through 4 have a label.

    Returns:
        The label for ``cluster``, or ``'Uncategorized'`` when the id has
        no entry in the mapping.
    """
    # A label's position in this list is the cluster id it belongs to.
    labels_by_cluster = dict(enumerate([
        'General',
        'Politics',
        'Technology',
        'Finance',
        'Entertainment',
    ]))

    if cluster in labels_by_cluster:
        return labels_by_cluster[cluster]
    return 'Uncategorized'

# Look up the category label for every row's cluster id and store it in a new column.
category_labels = df_combined['cluster'].apply(categorize_news)
df_combined['Category'] = category_labels

# Show only the columns relevant to the categorisation result.
columns_to_show = ['Title', 'Article_Content', 'Category']
print(df_combined[columns_to_show])


                                                Title  \
0   Delhi govt postpones odd-even plan as air qual...   
1       Parliament Winter Session to start from Dec 4   
2   Ministry of I&B approves Comprehensive “Digita...   
3   Addressing stubble burning: Indian Agricultura...   
4   Israel and Hamas negotiating for release of ci...   
5   UK’s ICE: Anusha Shah takes over as first Indi...   
6   Israel must protect Palestinians in West Bank ...   
7   इलेक्ट्रॉनिक वस्तुओं की गारंटी या वारंटी की अव...   
8   Defence minister directs swift release of thir...   
9   India's growth outlook soars: Fitch raises med...   
10  E-Summit 2023: NIT Raipur sparks innovation an...   
11  PAK vs ENG: Pakistan captain explains how they...   
12  ICC suspends Sri Lanka Cricket with immediate ...   
13  This day, that year: South Africa returns to i...   
14  'Are you doing anything?': Green Tribunal to P...   
15  76 years ago, Gandhi told India and Pakistan h...   
16  AUS vs BAN Live: BAN lose T

In [7]:
# Write the frame, which now carries cluster ids and category labels, to the
# final CSV file (index omitted).
final_output_path = 'final_categorized_data.csv'
df_combined.to_csv(final_output_path, index=False)

# Print the title, article text and assigned category for each row.
summary_columns = ['Title', 'Article_Content', 'Category']
print(df_combined[summary_columns])

                                                Title  \
0   Delhi govt postpones odd-even plan as air qual...   
1       Parliament Winter Session to start from Dec 4   
2   Ministry of I&B approves Comprehensive “Digita...   
3   Addressing stubble burning: Indian Agricultura...   
4   Israel and Hamas negotiating for release of ci...   
5   UK’s ICE: Anusha Shah takes over as first Indi...   
6   Israel must protect Palestinians in West Bank ...   
7   इलेक्ट्रॉनिक वस्तुओं की गारंटी या वारंटी की अव...   
8   Defence minister directs swift release of thir...   
9   India's growth outlook soars: Fitch raises med...   
10  E-Summit 2023: NIT Raipur sparks innovation an...   
11  PAK vs ENG: Pakistan captain explains how they...   
12  ICC suspends Sri Lanka Cricket with immediate ...   
13  This day, that year: South Africa returns to i...   
14  'Are you doing anything?': Green Tribunal to P...   
15  76 years ago, Gandhi told India and Pakistan h...   
16  AUS vs BAN Live: BAN lose T