In [None]:
import requests
import re
from goose3 import Goose
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import torch
import os
import shutil
import time
import pandas as pd

In [None]:
def overall(folder_path, output_dir='/content/drive/MyDrive/Article_project/processed_with_summary/'):
    """Scrape, summarize and sentiment-score every link of every JSON file in a folder.

    For each ".json" file returned by ``get_json_files(folder_path)`` (processed in
    reverse order), the file is read into a DataFrame, every distinct value of its
    ``Link`` column is scraped, summarized (texts shorter than 470 characters are
    kept as-is) and scored. The results are stored in the ``summary`` and
    ``sentiment_score`` columns, the DataFrame is written to
    ``<output_dir>/summary_<file>`` and the source file is handed to
    ``move_file_to_nested_folder``.

    Args:
        folder_path (str): Folder containing the JSON files to process.
        output_dir (str, optional): Folder receiving the processed files. It is
            created when missing. Defaults to the original Google Drive location.

    Raises:
        RuntimeError: If either model could not be loaded.
    """
    # Load both models up front. The loaders return None on failure, so check
    # before unpacking instead of failing later with an opaque TypeError.
    summarizer = load_summarizing_model()
    sentiment = load_sentiment_analysis_model()
    if summarizer is None or sentiment is None:
        raise RuntimeError("The summarizing and/or sentiment analysis model could not be loaded.")
    summarize_tokenizer, summarizer_model = summarizer
    sentiment_tokenizer, sentiment_model = sentiment

    os.makedirs(output_dir, exist_ok=True)

    json_files = get_json_files(folder_path)
    json_files.reverse()
    print(f"Found {len(json_files)} JSON files ")
    for file in json_files:
        file_path = os.path.join(folder_path, file)

        # Try reading the file; skip it when it has no 'Link' column
        try:
            df = pd.read_json(file_path)
            link = df['Link']
        except KeyError:
            print("The column 'Link' does not exist in the DataFrame.")
            continue

        # Results are written to every row sharing the URL, so each distinct
        # URL only needs to be scraped once.
        for url in link.drop_duplicates():
            # Scrape the url
            long_text = scraping_web_page(url)

            # Summarize the text (short texts are used unchanged)
            if len(long_text) < 470:
                summary = long_text
            else:
                summary = summarize_text(summarize_tokenizer, summarizer_model, long_text)

            # Compute the sentiment score; a failed computation is stored as 0
            score = computing_sentiment_score(sentiment_tokenizer, sentiment_model, summary)
            if score is None:
                score = 0

            # Assign summary and sentiment score to the matching rows
            same_url = df['Link'] == url
            df.loc[same_url, "summary"] = summary
            df.loc[same_url, "sentiment_score"] = score

        # Save the processed DataFrame, then move the source file away
        df.to_json(os.path.join(output_dir, "summary_" + file), orient='records')
        move_file_to_nested_folder(file_path)
        print('')
        print('µµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµ***')
        print(f"****Finished processing {file}*****")
        print('µµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµ***')
        print('')

In [None]:
def computing_sentiment_score(tokenizer, model, text):
    """Compute a sentiment score for a text.

    The text is encoded with ``tokenizer``, passed through ``model`` and the index
    of the largest logit, plus one, is returned.

    Args:
        tokenizer: Tokenizer exposing ``encode(text, return_tensors=..., ...)``.
        model: Callable returning an object with a ``logits`` tensor.
        text (str): Text to score.

    Returns:
        int | None: ``argmax(logits) + 1``, or None when the computation failed.
    """
    try:
        # Truncate to 512 tokens (same limit as used for the summarizer input)
        # so that long texts are scored instead of failing inside the model.
        tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            result = model(tokens)
        return int(torch.argmax(result.logits)) + 1
    # The specific handler must come first: RuntimeError is a subclass of
    # Exception, so the reverse order made this clause unreachable.
    except RuntimeError as e:
        print('A problem occurs when computing the sentiment:', e)
        return None
    except Exception as e:
        print('a problem occurs when computing the sentiment:', e)
        return None


In [None]:
def load_sentiment_analysis_model(model_name='nlptown/bert-base-multilingual-uncased-sentiment'):
    """Load the tokenizer and the sequence-classification model used for sentiment.

    Args:
        model_name (str, optional): Name or path passed to ``from_pretrained``.
            Defaults to the checkpoint originally hard-coded here.

    Returns:
        tuple | None: ``(tokenizer, model)``, or None when loading failed (the
        error is printed).
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        return tokenizer, model
    except Exception as e:
        # Report the cause instead of discarding it.
        print('a problem occurs when loading the sentiment analysis model:', e)
        return None

In [None]:
def add_label(folder_path, output_dir='processed_with_label'):
    """Add a binary ``label`` column to every JSON file of a folder.

    A row gets label 0 when its ``summary`` is one of the error messages returned
    by the scraper (or an empty string), and 1 otherwise. Each processed file is
    written to ``<output_dir>/summary_<file>``. A file that cannot be processed is
    reported and skipped.

    Args:
        folder_path (str): Folder containing the JSON files to label.
        output_dir (str, optional): Folder receiving the labelled files. It is
            created when missing. Defaults to "processed_with_label".
    """
    json_files = get_json_files(folder_path)
    print(f"Found {len(json_files)} JSON files ")

    # Summaries that mean "no usable article text". These must stay identical
    # to the strings returned by scraping_web_page.
    error_summaries = [
        "An Errorrr happen wheb trying to reach the website",
        "Access denied Errorrr",
        "Network Errorrr. code 400",
        "Extraction Errorrr",
        ""
    ]

    # Without this, every to_json call fails on a fresh working directory.
    os.makedirs(output_dir, exist_ok=True)

    for file in json_files:
        try:
            df = pd.read_json(os.path.join(folder_path, file))

            # Vectorised labelling: 0 for error summaries, 1 for everything else
            df['label'] = (~df['summary'].isin(error_summaries)).astype(int)

            # Save the processed DataFrame to a JSON file
            df.to_json(os.path.join(output_dir, "summary_" + file), orient='records')

            print('')
            print('µµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµ***')
            print(f"****Finished processing {file}*****")
            print('µµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµ***')
            print('')

        except Exception as e:
            print(f"An error happened: {e}")
            print(f"****ERRROR processing {file}*****")
            print('')
            continue

In [None]:
def add_recurrent_label(folder_path, output_dir='processed_with_min_max'):
    """Add the most and least frequent sentiment score to every JSON file of a folder.

    Only rows whose ``label`` equals 1 are counted. The most frequent
    ``sentiment_score`` among them is written to every row of the file in the
    ``most_recurrent_sentiment_score`` column and the least frequent one in
    ``least_recurrent_sentiment_score``. The result is saved to
    ``<output_dir>/summary_<file>``. Files without any countable row are skipped.

    Args:
        folder_path (str): Folder containing the labelled JSON files.
        output_dir (str, optional): Folder receiving the processed files. It is
            created when missing. Defaults to "processed_with_min_max".
    """
    json_files = get_json_files(folder_path)
    print(f"Found {len(json_files)} JSON files ")

    os.makedirs(output_dir, exist_ok=True)

    for file in json_files:
        df = pd.read_json(os.path.join(folder_path, file))

        df_filtered = df[df['label'] == 1]
        if df_filtered.empty:
            print(f"Skipping {file} as it is empty after filtering.")
            continue

        # value_counts ignores missing scores, so the result can be empty even
        # when rows remain; idxmax/idxmin would raise on it.
        sentiment_score_counts = df_filtered['sentiment_score'].value_counts()
        if sentiment_score_counts.empty:
            print(f"Skipping {file} as it has no sentiment score to count.")
            continue

        df['most_recurrent_sentiment_score'] = sentiment_score_counts.idxmax()
        df['least_recurrent_sentiment_score'] = sentiment_score_counts.idxmin()

        # Save the processed DataFrame to a JSON file
        df.to_json(os.path.join(output_dir, "summary_" + file), orient='records')

        print('')
        print(f"****Finished processing {file}*****")
        print('')


In [None]:
def add_recurrent_summaries(folder_path, type_column="sentiment_score", output_dir='processed_with_recurrent_summaries'):
    """Add cumulative summaries for the most and least recurrent score of each file.

    For every JSON file of ``folder_path``, rows with ``label`` 0 are ignored. The
    summaries of the rows whose ``type_column`` equals the file's
    ``most_recurrent_<type_column>`` value are concatenated cumulatively with "."
    into ``most_recurrent_summary_1`` .. ``most_recurrent_summary_5`` (column k
    holds the first k summaries joined; columns beyond the number of available
    summaries hold ""). The same is done for ``least_recurrent_<type_column>``.
    The result is written to ``<output_dir>/<file>``.

    Args:
        folder_path (str): Folder containing the JSON files to process.
        type_column (str, optional): Column holding the score to match. Defaults
            to "sentiment_score".
        output_dir (str, optional): Folder receiving the processed files. It is
            created when missing. Defaults to "processed_with_recurrent_summaries".
    """
    json_files = get_json_files(folder_path)
    print(f"Found {len(json_files)} JSON files")

    # Columns holding the most/least recurrent value; with the default
    # type_column these are the names that used to be hard-coded.
    most_column = f"most_recurrent_{type_column}"
    least_column = f"least_recurrent_{type_column}"
    summary_count = 5

    os.makedirs(output_dir, exist_ok=True)

    for file in json_files:
        # Load JSON data into a DataFrame
        df = pd.read_json(os.path.join(folder_path, file))
        df_filtered = df[df["label"] != 0]

        # Check if the DataFrame is empty after filtering
        if df_filtered.empty:
            print(f"Skipping {file} as it is empty after filtering.")
            continue

        # The recurrent values are the same on every row; read them from the first
        most_value = df[most_column].iloc[0]
        least_value = df[least_column].iloc[0]

        df_summary_most_recurrent_score = df_filtered.loc[df_filtered[type_column] == most_value, "summary"]
        df_summary_least_recurrent_score = df_filtered.loc[df_filtered[type_column] == least_value, "summary"]

        print(len(df_summary_least_recurrent_score))
        print(len(df_summary_most_recurrent_score))

        # Initialize summaries
        most_recurrent_summaries = [""] * summary_count
        least_recurrent_summaries = [""] * summary_count

        # Fill summaries: entry i joins the first i+1 summaries
        for i in range(min(summary_count, len(df_summary_most_recurrent_score))):
            most_recurrent_summaries[i] = ".".join(df_summary_most_recurrent_score.iloc[:i+1])

        for i in range(min(summary_count, len(df_summary_least_recurrent_score))):
            least_recurrent_summaries[i] = ".".join(df_summary_least_recurrent_score.iloc[:i+1])

        # Add the new columns (same value on every row)
        for i in range(summary_count):
            df[f'most_recurrent_summary_{i+1}'] = most_recurrent_summaries[i]
            df[f'least_recurrent_summary_{i+1}'] = least_recurrent_summaries[i]

        # Save the modified DataFrame back to JSON
        df.to_json(os.path.join(output_dir, file), orient='records')

        print('')
        print('µµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµ***')
        print(f"****Finished processing {file}*****")
        print('µµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµµ***')
        print('')


In [None]:
def rename_files(folder_path, prefix_to_remove):
    """Strip a leading prefix from the name of every matching entry of a folder.

    Args:
        folder_path (str): Folder whose entries are inspected.
        prefix_to_remove (str): Prefix to drop; entries whose name does not start
            with it are left untouched.
    """
    for entry_name in os.listdir(folder_path):
        # Nothing to do for names that do not carry the prefix
        if not entry_name.startswith(prefix_to_remove):
            continue

        # Name without the prefix, then the rename itself inside the same folder
        stripped_name = entry_name[len(prefix_to_remove):]
        source_path = os.path.join(folder_path, entry_name)
        target_path = os.path.join(folder_path, stripped_name)
        os.rename(source_path, target_path)
        print(f"Renamed '{entry_name}' to '{stripped_name}'")

In [None]:
def scraping_web_page(url, timeout=360.0):
    """Download a web page and return the article text extracted by Goose.

    The URL is first requested with ``requests`` to detect unreachable pages and
    refused or invalid requests; only then is the article extracted.

    Args:
        url (str): Address of the page to scrape.
        timeout (float, optional): Timeout in seconds used for the initial
            request and passed to Goose as ``http_timeout``. Defaults to 360.0,
            the value previously hard-coded for Goose.

    Returns:
        str: The extracted article text, or one of these error messages:
        "An Errorrr happen wheb trying to reach the website",
        "Access denied Errorrr", "Network Errorrr. code 400" (status 400 or 404)
        or "Extraction Errorrr".
    """
    # NOTE: the returned error strings are matched verbatim by add_label, so
    # they must not be reworded (including the "code 400" text used for 404).
    try:
        # Send a GET request to the URL. A timeout prevents hanging forever,
        # and `Exception` (not a bare except) lets KeyboardInterrupt through.
        try:
            response = requests.get(url, timeout=timeout)
        except Exception:
            print("An Errorrr happen wheb trying to reach the website")
            return "An Errorrr happen wheb trying to reach the website"

        status = response.status_code

        # Access denied
        if status in (401, 403):
            print("Access denied. Website does not allow scraping.")
            return "Access denied Errorrr"

        # Bad request / page not found: log the real status code
        if status in (400, 404):
            print(f"Network Errorrr. code {status}")
            return "Network Errorrr. code 400"

        g = Goose({'strict': True, 'http_timeout': timeout})
        try:
            article = g.extract(url=url)
        finally:
            # Release the resources held by the extractor
            g.close()
        print("******success *****")
        return article.cleaned_text
    except Exception as e:
        print("An exception occurs:", e)
        return "Extraction Errorrr"
    

In [None]:
def load_summarizing_model(model_name='t5-base'):
    '''
    Load the summarizer and return its tokenizer and model.

    Args:
        model_name (str, optional): Name or path passed to ``from_pretrained``.
            Defaults to 't5-base', the checkpoint originally hard-coded here.

    Returns:
        tuple | None: ``(tokenizer, model)``, or None when loading failed (the
        error is printed).
    '''
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, return_dict=True)
        return tokenizer, model
    except Exception as e:
        # Report the cause instead of discarding it.
        print('A problem happen when loading the summarizing model:', e)
        return None

In [None]:
def computing_sentiment_score(tokenizer, model, text):
    """Compute a sentiment score for a text.

    The text is encoded with ``tokenizer``, passed through ``model`` and the index
    of the largest logit, plus one, is returned.

    Note: a function with the same name is defined in an earlier cell of this
    notebook; when the cells run in order, this definition is the one in effect.

    Args:
        tokenizer: Tokenizer exposing ``encode(text, return_tensors=..., ...)``.
        model: Callable returning an object with a ``logits`` tensor.
        text (str): Text to score.

    Returns:
        int | None: ``argmax(logits) + 1``, or None when the computation failed.
    """
    try:
        # Truncate to 512 tokens (same limit as used for the summarizer input)
        # so that long texts are scored instead of failing inside the model.
        tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            result = model(tokens)
        return int(torch.argmax(result.logits)) + 1
    # Specific handler first; the generic one previously named the undefined
    # `exception`, which raised NameError whenever an error had to be handled.
    except RuntimeError as e:
        print('A problem occurs when computing the sentiment:', e)
        return None
    except Exception as e:
        print('a problem occurs when computing the sentiment:', e)
        return None


In [None]:
def clean_text(text):
    """Strip punctuation (except ".") from a text and normalise its whitespace.

    Every character that is neither a word character, a whitespace character nor
    a "." is removed; each run of whitespace is then collapsed into one space and
    the result is trimmed at both ends.
    """
    without_symbols = re.sub(r'[^\w\s.]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

In [None]:
def summarize_text(tokenizer, model, long_text):
    """Summarize a text with a T5-style tokenizer/model pair.

    Args:
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        model: Model exposing ``generate``.
        long_text (str): Text to summarize. The encoded input is truncated to
            512 tokens.

    Returns:
        str: The generated summary (between 300 and 512 tokens requested from
        ``generate``), decoded without special tokens and passed through
        ``clean_text``.
    """
    # The task prefix must be spelled exactly "summarize: "; the previous
    # "sumarize: " typo meant the model never received that prefix.
    inputs = tokenizer.encode("summarize: " + long_text, return_tensors='pt', max_length=512, truncation=True)
    # NOTE(review): min_length=300 forces long outputs even for inputs that are
    # only slightly above the caller's 470-character threshold — confirm intent.
    output = model.generate(inputs, min_length=300, max_length=512)

    # Decode the output without special tokens
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    # Clean up the summary
    return clean_text(summary)

In [None]:
def get_json_files(folder):
    """
    Return the names of all entries of `folder` ending with ".json".

    The names (not full paths) are returned sorted alphabetically, so that the
    processing order does not depend on the arbitrary order of ``os.listdir``.
    """
    return sorted(name for name in os.listdir(folder) if name.endswith(".json"))


In [None]:

def move_file_to_nested_folder(file_path, target_dir="processed_json_file"):
    """
    Move a file into a sub-folder of the current working directory.

    The sub-folder is created first when it does not exist yet.

    Args:
        file_path (str): Path of the file to move.
        target_dir (str, optional): Name of the sub-folder, resolved against the
            current working directory. Defaults to "processed_json_file".
    """
    destination_dir = os.path.join(os.getcwd(), target_dir)

    # Create the destination on first use
    destination_missing = not os.path.exists(destination_dir)
    if destination_missing:
        os.makedirs(destination_dir)

    shutil.move(file_path, destination_dir)