In [None]:
# Install required libraries
!pip install torch torchvision torchaudio transformers scikit-learn pandas




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Sarcasm Headlines Dataset
df = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)

# Basic preprocessing
df = df[['headline', 'is_sarcastic']]
df.columns = ['text', 'label']  # Renaming for consistency
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Checking the first few rows
train_df.head()


Unnamed: 0,text,label
4724,"how trump really feels about queer people, exp...",0
26559,egyptian death sentence for soccer fans puts p...,0
24613,stripper failing school she's working self thr...,1
9048,texas attorney general ken paxton indicted,0
69,congresswoman fights for gun control because s...,0


In [None]:
from transformers import RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the data
def tokenize_data(data):
    return tokenizer(data['text'].tolist(),
                     padding=True,
                     truncation=True,
                     max_length=128,
                     return_tensors='pt')

train_encodings = tokenize_data(train_df)
test_encodings = tokenize_data(test_df)

# PyTorch dataset class
class SarcasmDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SarcasmDataset(train_encodings, train_df['label'].values)
test_dataset = SarcasmDataset(test_encodings, test_df['label'].values)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [None]:
from transformers import RobertaForSequenceClassification

# Load pre-trained model for sarcasm detection
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Move the model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
from transformers import RobertaForSequenceClassification

# Load pre-trained model for sarcasm detection
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Move the model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

# Prepare DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    epoch_loss = 0
    for batch in tqdm(train_loader):
        batch = {key: val.to(device) for key, val in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Average training loss: {avg_loss:.4f}")


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 1431/1431 [04:48<00:00,  4.96it/s]


Average training loss: 0.2810
Epoch 2/3


100%|██████████| 1431/1431 [04:51<00:00,  4.90it/s]


Average training loss: 0.1368
Epoch 3/3


100%|██████████| 1431/1431 [04:52<00:00,  4.90it/s]

Average training loss: 0.0518





In [None]:
# Save the trained model and tokenizer
model.save_pretrained('sarcasm_detection_roberta')
tokenizer.save_pretrained('sarcasm_detection_roberta')


('sarcasm_detection_roberta/tokenizer_config.json',
 'sarcasm_detection_roberta/special_tokens_map.json',
 'sarcasm_detection_roberta/vocab.json',
 'sarcasm_detection_roberta/merges.txt',
 'sarcasm_detection_roberta/added_tokens.json')

In [None]:
# Load the trained model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('sarcasm_detection_roberta')
tokenizer = RobertaTokenizer.from_pretrained('sarcasm_detection_roberta')
model.to(device)
model.eval()

# Test with a sarcastic sentence
def classify_sarcasm(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).item()
    class_labels = ['Not Sarcastic', 'Sarcastic']
    return class_labels[predicted_class]

# Example sentence
sentence = "thirtysomething scientists unveil doomsday clock of hair loss"
prediction = classify_sarcasm(sentence)
print(f'Input Sentence: "{sentence}"')
print(f'Predicted Class: {prediction}')


Input Sentence: "thirtysomething scientists unveil doomsday clock of hair loss"
Predicted Class: Sarcastic


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Initialize variables for accuracy calculation
all_labels = []
all_predictions = []

for batch in tqdm(test_loader):
    batch = {key: val.to(device) for key, val in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    all_labels.extend(batch['labels'].cpu().numpy())
    all_predictions.extend(predictions.cpu().numpy())

# Calculate accuracy and print classification report
accuracy = accuracy_score(all_labels, all_predictions)
print(f'Accuracy: {accuracy:.4f}')

# Detailed classification report
report = classification_report(all_labels, all_predictions, target_names=['Not Sarcastic', 'Sarcastic'])
print(report)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 358/358 [00:37<00:00,  9.53it/s]


Accuracy: 0.9310
               precision    recall  f1-score   support

Not Sarcastic       0.92      0.96      0.94      2995
    Sarcastic       0.95      0.90      0.93      2729

     accuracy                           0.93      5724
    macro avg       0.93      0.93      0.93      5724
 weighted avg       0.93      0.93      0.93      5724



In [None]:
!pip install Flask gunicorn


Collecting gunicorn
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Downloading gunicorn-23.0.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gunicorn
Successfully installed gunicorn-23.0.0


In [None]:
!pip install Flask
!pip install flask-ngrok


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
!pip install flask-ngrok
!pip install pyngrok




In [None]:
!ngrok authtoken 2ms5BfTAfGrFt7yiJC4BOanZ5sX_6DsXJWSQocEFZFmYC8UMN


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import os

# Create a 'templates' directory
os.makedirs('templates', exist_ok=True)


In [None]:
# Create the templates folder and save the HTML file
import os

# Create 'templates' directory
os.makedirs('templates', exist_ok=True)

# Save the HTML content into index.html
html_content = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sarcasm Detection</title>
    <style>
        body {
            font-family: 'Arial', sans-serif;
            background: linear-gradient(to right, #6a11cb, #2575fc);
            color: #ffffff;
            text-align: center;
            margin: 0;
            padding: 0;
            height: 100vh;
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
            animation: fadeIn 1s ease-in;
        }

        @keyframes fadeIn {
            from { opacity: 0; }
            to { opacity: 1; }
        }

        h1 {
            font-size: 2.5em;
            margin-bottom: 20px;
            text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3);
            animation: slideIn 0.5s ease-in;
        }

        @keyframes slideIn {
            from { transform: translateY(-50px); opacity: 0; }
            to { transform: translateY(0); opacity: 1; }
        }

        form {
            background-color: rgba(255, 255, 255, 0.9);
            border-radius: 12px;
            padding: 30px;
            box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2);
            width: 100%;
            max-width: 400px;
            margin: 0 auto;
            animation: bounceIn 0.8s ease-in-out;
        }

        @keyframes bounceIn {
            from { transform: scale(0.9); opacity: 0; }
            to { transform: scale(1); opacity: 1; }
        }

        input[type="text"] {
            width: calc(100% - 22px);
            padding: 12px;
            margin: 12px 0;
            border: 2px solid #007bff;
            border-radius: 5px;
            font-size: 18px;
            transition: border-color 0.3s ease;
        }

        input[type="text"]:focus {
            border-color: #0056b3;
            animation: pulse 0.5s infinite;
        }

        @keyframes pulse {
            0% { transform: scale(1); }
            50% { transform: scale(1.05); }
            100% { transform: scale(1); }
        }

        button {
            padding: 12px;
            margin-top: 10px;
            border-radius: 5px;
            border: none;
            background-color: #007bff;
            color: white;
            font-size: 18px;
            cursor: pointer;
            transition: background-color 0.3s ease, transform 0.3s ease;
            width: 100%;
        }

        button:hover {
            background-color: #0056b3;
            transform: translateY(-2px);
            animation: shake 0.5s ease-in-out;
        }

        @keyframes shake {
            0% { transform: translate(0); }
            25% { transform: translate(-2px, 2px); }
            50% { transform: translate(2px, -2px); }
            75% { transform: translate(-2px, -2px); }
            100% { transform: translate(0); }
        }

        .prediction-result {
            margin-top: 20px;
            padding: 20px;
            border-radius: 10px;
            background-color: rgba(40, 167, 69, 0.9);
            color: #fff;
            display: inline-block;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
            animation: fadeIn 0.5s ease-in;
            border: 2px solid #28a745; /* Border color for the result */
            transition: transform 0.2s ease;
        }

        .prediction-result:hover {
            transform: scale(1.05); /* Scale effect on hover */
        }

        .input-text {
            font-weight: bold; /* Make input text bold */
            font-size: 1.2em; /* Increase font size */
        }
    </style>
</head>
<body>
    <h1>Sarcasm Detection System</h1>
    <form method="POST" action="/predict">
        <label for="input_text">Enter a sentence to check:</label><br>
        <input type="text" id="input_text" name="input_text" placeholder="Enter sentence" required><br>
        <button type="submit">Predict</button>
    </form>

    {% if prediction %}
    <div class="prediction-result">
        <h2>Prediction: {{ prediction }}</h2>
        <h3 class="input-text">Input Sentence: "{{ input_text }}"</h3>
    </div>
    {% endif %}
</body>
</html>
'''

# Write the HTML content to the file
with open('templates/index.html', 'w') as file:
    file.write(html_content)

print("HTML file created successfully!")


HTML file created successfully!


In [None]:
!pip install Flask pyngrok transformers torch




In [None]:
!python app.py

python3: can't open file '/content/app.py': [Errno 2] No such file or directory


In [None]:
from flask import Flask, render_template, request
from pyngrok import ngrok
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
# Initialize the Flask app
app = Flask(__name__)

# Load the pre-trained model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('sarcasm_detection_roberta')
tokenizer = RobertaTokenizer.from_pretrained('sarcasm_detection_roberta')

# Move the model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()

# Sarcasm detection function
def classify_sarcasm(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).item()
    class_labels = ['Not Sarcastic', 'Sarcastic']
    return class_labels[predicted_class]

# Define routes
@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    input_text = request.form['input_text']
    prediction = classify_sarcasm(input_text)
    return render_template('index.html', prediction=prediction, input_text=input_text)

# Start ngrok tunnel
public_url = ngrok.connect(5000)
print(f" * ngrok tunnel: {public_url}")

# Run the Flask app
if __name__ == '__main__':
    app.run()


 * ngrok tunnel: NgrokTunnel: "https://8700-34-143-164-252.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 14:52:42] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 14:52:43] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 14:52:53] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 14:55:32] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 14:55:41] "POST /predict HTTP/1.1" 200 -
