# Librerie e Pacchetti

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk import word_tokenize
import numpy as np
from google.colab import drive
import os
import json
import csv
import pandas as pd
from google.colab import files
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Carico il Dataset da Drive e lo faccio salvare in CSV

In [None]:
# Mount Google Drive so that its content is reachable under /content/drive
drive.mount('/content/drive')

def search_file(name, path):
    """Recursively look for files called ``name`` below ``path``.

    While walking, the visited directory and up to five of its file names
    are printed as progress/debug output.

    Args:
        name: exact file name to look for (no wildcard matching).
        path: directory where the recursive walk starts.

    Returns:
        List with the full path of every match, in ``os.walk`` order;
        empty when nothing was found.
    """
    matches = []
    for current_dir, _subdirs, filenames in os.walk(path):
        if name in filenames:
            matches.append(os.path.join(current_dir, name))
        # Debug output: which directory is being scanned and a short preview
        # (first five entries) of the files it contains.
        print(f"Cercando in: {current_dir}")
        print(f"File trovati in questa cartella: {filenames[:5]}...")
    return matches

# Search parameters
file_name = "Sarcasm_Headlines_Dataset.json"
drive_path = "/content/drive"

print(f"Inizio ricerca di '{file_name}' in {drive_path}")
print("Questo processo potrebbe richiedere alcuni minuti...")

found_files = search_file(file_name, drive_path)

if not found_files:
    print(f"\nNessun file '{file_name}' trovato in {drive_path}")
else:
    print(f"\nFile trovati ({len(found_files)}):")
    for match in found_files:
        print(match)

    # Check that every match can actually be opened, showing its first two lines
    print("\nVerifica accesso al file:")
    for match in found_files:
        try:
            with open(match, 'r') as handle:
                print(f"Accesso riuscito a: {match}")
                print("Prime righe del file:")
                for _ in range(2):
                    print(handle.readline())
        except Exception as err:
            # Best-effort check: report the problem and continue with the next match
            print(f"Errore nell'accesso al file {match}: {err!s}")

# Additional information about the environment
print("\nInformazioni sul sistema:")
cwd_entries = os.listdir('.')
print(f"Contenuto della directory corrente: {cwd_entries}")
drive_root_entries = os.listdir('/content/drive/MyDrive')[:10]  # first 10 entries only
print(f"Contenuto della root di Google Drive: {drive_root_entries}...")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Inizio ricerca di 'Sarcasm_Headlines_Dataset.json' in /content/drive
Questo processo potrebbe richiedere alcuni minuti...
Cercando in: /content/drive
File trovati in questa cartella: []...
Cercando in: /content/drive/.shortcut-targets-by-id
File trovati in questa cartella: []...
Cercando in: /content/drive/.shortcut-targets-by-id/1vgyWF6ogZ0n58LNovF6EPvxuUOzwsviK
File trovati in questa cartella: []...
Cercando in: /content/drive/.shortcut-targets-by-id/1vgyWF6ogZ0n58LNovF6EPvxuUOzwsviK/Progetto Deep Learning
File trovati in questa cartella: ['weekendUpdates.xlsx', 'Copia di Sentiment_analysis_in_lacrime', 'accuracy.png', 'model.png', 'Weekend_updates']...
Cercando in: /content/drive/.shortcut-targets-by-id/1vgyWF6ogZ0n58LNovF6EPvxuUOzwsviK/Progetto Deep Learning/.ipynb_checkpoints
File trovati in questa cartella: []...
Cercando in: /content/drive/.shortcut-ta

In [None]:
# Path of the JSON file: the first match returned by the Drive search
# (replace it with an explicit path if a different copy should be used).
# Fail early with a readable message instead of an opaque IndexError when
# the search did not find anything.
if not found_files:
    raise FileNotFoundError(
        "The Drive search returned no dataset file; upload it to Google Drive "
        "and re-run the search cell before continuing."
    )
json_file_path = found_files[0]

# Read a JSON Lines file (one JSON object per line)
def read_json_line_by_line(file_path):
    """Parse a file that stores one JSON document per line.

    Blank lines are skipped silently. Lines that are not valid JSON are
    reported on stdout and skipped, so a few corrupt records do not abort
    the whole load.

    Args:
        file_path: path of the JSON Lines file to read.

    Returns:
        List with the decoded value of every valid line, in file order.
    """
    data = []
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # An empty line (e.g. a trailing newline) is not a record and not an error.
                continue
            try:
                data.append(json.loads(stripped))
            except json.JSONDecodeError:
                print(f"Errore nel decodificare la riga: {line}")
    return data

# Read the data
print("Lettura del file JSON...")
data = read_json_line_by_line(json_file_path)

# Convert to a DataFrame (one row per decoded JSON record)
print("Conversione in DataFrame...")
df = pd.DataFrame(data)

# Save as CSV, without writing the DataFrame index as an extra column
csv_file_path = "/content/Sarcasm_Headlines_Dataset.csv"
df.to_csv(csv_file_path, index=False)
print(f"File CSV salvato in: {csv_file_path}")

# Show the first rows of the converted data
print("\nPrime 5 righe del CSV:")
print(df.head().to_string())

# Dataset information
print("\nInformazioni sul dataset:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly: wrapping it in print() added a stray "None" line.
df.info()


Lettura del file JSON...
Conversione in DataFrame...
File CSV salvato in: /content/Sarcasm_Headlines_Dataset.csv

Prime 5 righe del CSV:
   is_sarcastic                                                                         headline                                                                                 article_link
0             1                    thirtysomething scientists unveil doomsday clock of hair loss  https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205
1             0  dem rep. totally nails why congress is falling short on gender, racial equality    https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207
2             0                                eat your veggies: 9 deliciously different recipes                https://www.huffingtonpost.com/entry/eat-your-veggies-9-delici_b_8899742.html
3             1                             inclement weather prevents liar from getting to work   http

# Analisi del Dataset

In [None]:
# Reload the dataset from the CSV file; the path is relative, so it is
# resolved against the current working directory.
df = pd.read_csv("Sarcasm_Headlines_Dataset.csv")
# Last expression of the cell: displays the first rows
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
# Number of headlines (rows) in the dataset
len(df['headline'].values)

28619

In [None]:
# Number of rows for each value of the is_sarcastic label
df['is_sarcastic'].value_counts()

Unnamed: 0_level_0,count
is_sarcastic,Unnamed: 1_level_1
0,14985
1,13634


# Preprocessing del Dataset



In [None]:
def remove_emojis(text):
    """Delete every run of characters that falls in the emoji/symbol class.

    Note that some of the ranges are very broad (see the comments below), so
    more than pictographic emojis is removed.

    Args:
        text: string to clean.

    Returns:
        ``text`` with all matching characters removed.
    """
    # Members of the character class; ranges overlap and one is repeated,
    # which is harmless inside a regex character class.
    symbol_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing up to miscellaneous symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U00002702-\U000027B0"  # dingbats (repeated range)
        "\U000024C2-\U0001F251"  # very broad span: includes e.g. CJK and Hangul characters
        "\U0001f926-\U0001f937"  # gesture emojis
        "\U00010000-\U0010ffff"  # every code point outside the Basic Multilingual Plane
        "\u2640-\u2642"  # gender symbols
        "\u2600-\u2B55"  # miscellaneous symbols
        "\u200d"  # zero width joiner
        "\u23cf"  # eject symbol
        "\u23e9"  # fast-forward symbol
        "\u231a"  # watch
        "\ufe0f"  # variation selector-16
        "\u3030"  # wavy dash
    )
    emoji_pattern = re.compile("[" + symbol_ranges + "]+", flags=re.UNICODE)

    # Matched runs are dropped without inserting any replacement character
    return emoji_pattern.sub('', text)

In [None]:
def remove_emails_hashtags_mentions(text):
    """Strip e-mail addresses, ``#hashtags`` and ``@mentions`` from ``text``.

    The removed tokens are replaced by nothing, so the whitespace that
    surrounded them is left in place.

    Args:
        text: string to clean.

    Returns:
        ``text`` without e-mail addresses, hashtags and mentions.
    """
    email = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    hashtag = r'(#\w+)'
    mention = r'(@\w+)'

    # One alternation of the three token kinds
    combined = '|'.join((email, hashtag, mention))
    return re.sub(combined, '', text)

In [None]:
def preprocess_text(text):
    """Normalize a headline to lowercase ASCII letters separated by single spaces.

    Steps, in order: convert the input to ``str``; remove e-mails, hashtags
    and mentions; remove emojis; replace every character that is not an
    ASCII letter with a space; lowercase; collapse whitespace runs and trim.

    Args:
        text: value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    without_tokens = remove_emails_hashtags_mentions(str(text))
    without_emojis = remove_emojis(without_tokens)

    # Digits, punctuation and any non-ASCII character become spaces
    letters_only = re.sub('[^a-zA-Z]', ' ', without_emojis).lower()

    # Squeeze the whitespace produced by the substitutions above
    return re.sub(r'\s+', ' ', letters_only).strip()

In [None]:
# Sanity check: show the preprocessing result for the headline with index label 0
print(preprocess_text(df['headline'][0]))

thirtysomething scientists unveil doomsday clock of hair loss


In [None]:
# Add a column with the preprocessed version of every headline
df['cleaned_headlines'] = df['headline'].apply(preprocess_text)

In [None]:
# (rows, columns) of the DataFrame after adding the cleaned_headlines column
df.shape

(28619, 4)

In [None]:
def balance_df(df, text, target, random_state=42):
    """Balance the classes of ``df`` with random oversampling.

    Args:
        df: DataFrame containing at least the ``text`` and ``target`` columns.
        text: name of the column holding the input texts.
        target: name of the column holding the class labels.
        random_state: seed forwarded to ``RandomOverSampler`` so the
            resampling is reproducible across runs; pass ``None`` to get a
            different random draw on every call.

    Returns:
        A new DataFrame with exactly the two columns ``text`` and ``target``
        holding the resampled data. Other columns of ``df`` are not carried
        over.
    """
    ros = RandomOverSampler(random_state=random_state)
    # The sampler expects a 2-D feature matrix, so the single text column is
    # reshaped to (n_samples, 1). Labels are passed as a plain 1-D array.
    features = np.array(df[text]).reshape(-1, 1)
    labels = np.array(df[target])
    resampled_x, resampled_y = ros.fit_resample(features, labels)

    # Column 0 of the resampled matrix is the text column again
    return pd.DataFrame({text: resampled_x[:, 0], target: resampled_y})


# Class-balanced copy of the dataset (cleaned text and label columns only).
# The former empty-DataFrame initialisation was a dead assignment: it was
# overwritten immediately by this call.
sarcasm_df = balance_df(df, 'cleaned_headlines', 'is_sarcastic')

In [None]:
# Split BEFORE oversampling. Random oversampling balances the classes by
# duplicating rows, so splitting an already-resampled frame can place copies
# of the same headline in both the training and the test set, which inflates
# the evaluation scores.
X = df["cleaned_headlines"]
y = df["is_sarcastic"]

X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance only the training portion; the test set keeps the original class
# distribution and contains no resampled rows.
train_balanced = balance_df(
    pd.DataFrame({"cleaned_headlines": X_train_raw, "is_sarcastic": y_train_raw}),
    "cleaned_headlines",
    "is_sarcastic",
)
# Shuffle the balanced rows so that the order produced by the resampling step
# does not leak into the order of the training batches.
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

X_train = train_balanced["cleaned_headlines"]
y_train = train_balanced["is_sarcastic"]

In [None]:
# Sizes of the training and test inputs/labels
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((23976,), (5994,), (23976,), (5994,))

# Modello RoBERTa

In [None]:
# Tokenizer and sequence-classification model are both loaded from the same
# pretrained checkpoint.
MODEL_CHECKPOINT = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFRobertaForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [None]:
# Upper bound for the tokenized sequence length.
# The longest cleaned headline is measured in CHARACTERS, which is not a token
# count and can be larger than what the model accepts. It is therefore capped
# at the tokenizer's own limit, so that truncation never lets an over-long
# sequence reach the model.
longest_headline_chars = max(len(text) for text in df['cleaned_headlines'])
max_length = min(longest_headline_chars, tokenizer.model_max_length)
max_length

911

In [None]:
# Tokenize the training texts with padding and truncation enabled; max_length is the length limit
train_encodings = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=max_length)

In [None]:
# Tokenize the test texts with the same settings used for the training texts
test_encodings = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=max_length)

In [None]:
# Build tf.data datasets of (features, label) pairs: the tokenizer output is
# converted to a plain dict of features and sliced together with the labels.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))

In [None]:
# Adam optimizer with learning rate 5e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is given raw scores, not probabilities; labels are integer class ids
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Track accuracy in addition to the loss during training and validation
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
# Batching is done on the tf.data pipeline (.batch(16)). Keras' `batch_size`
# argument must not be combined with a Dataset input, so the contradictory
# `batch_size=8` was removed; the effective batch size is 16.
history = model.fit(
    train_dataset.shuffle(1000).batch(16),
    epochs=5,
    validation_data=test_dataset.batch(16)
)

Epoch 1/5
  95/1499 [>.............................] - ETA: 14:28:59 - loss: 0.6548 - accuracy: 0.6224

In [None]:
# Run the model on the batched test set. Despite its name, y_probs is the
# prediction output object whose `.logits` attribute is read below.
y_probs = model.predict(test_dataset.batch(16))
# Predicted class id = index of the largest logit of each sample
y_pred = np.argmax(y_probs.logits, axis=1)

## Valutazione RoBERTa

In [None]:
# Training vs validation loss per epoch, read from the History object
# returned by model.fit
fig, ax = plt.subplots()
for metric_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title("Train and Validation Loss")
ax.legend()  # legend entries come from the labels given to plot()
plt.show()

In [None]:
# Confusion matrix of the test predictions: rows are true labels, columns are
# predicted labels.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8,6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()
# Label 0 = not sarcastic, label 1 = sarcastic (fixes the 'Nor Sarcastic' typo)
class_names = ['Not Sarcastic', 'Sarcastic']
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# Cells above half of the maximum count are drawn in dark blue, so their
# number is written in white to stay readable; the others use black.
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, str(cm[i, j]), ha='center', va='center',
                 color='white' if cm[i, j] > thresh else 'black')

plt.xlabel("predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

In [None]:
# Note: despite its name, `accuracy` holds the whole text report returned by
# classification_report, not a single accuracy value.
accuracy = classification_report(y_test, y_pred)
print(accuracy)

## Salvataggio Modello RoBERTa

In [None]:
# Save the fine-tuned model and the tokenizer; each goes to its own
# directory (relative paths), so both are needed to reload them later.
model.save_pretrained("roBERTa_sarcasm_detection")
tokenizer.save_pretrained("roBERTa_sarcasm_detection_tokenizer")