# __Generate Sentiments using RoBERTa__

The code aims to split movie reviews into smaller files, analyze and assign sentiments to each review individually, and subsequently merge the divided movie files into a single consolidated file.

- The goal is to reduce the impact of a single record failure, preventing it from affecting the entire set of records.
- By dividing the records among team members, each member only processes their assigned subset of files, which speeds up the overall operation.

__Pre-requisites:__
- Get the assigned zipped file from https://mylambton.sharepoint.com/:f:/r/sites/NLPandSocialMediaAnalytics/Shared%20Documents/General/Split%20Dataset%20(Movie%20Reviews)?csf=1&web=1&e=1EaRaL
- Create a directory named 'split_dataset' within the 'dataset' folder, then extract all files from the assigned zipped file into the newly created 'split_dataset' directory.

In [1]:
import pandas as pd
import numpy as np
import multiprocessing
import contractions
import os
import text_preprocessing as tp

from nltk.corpus import stopwords
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
from functools import partial

#### Initialize folders

In [6]:
# Base name (without extension) of the source CSV and prefix of every split file
filename = 'reviews'
source_data_directory = "./dataset/"
split_dataset_directory = './dataset/split_dataset/'                 # contains the split files
done_split_dataset_directory = './dataset/split_dataset/done/'       # contains split files that were done processing
sentiment_directory = './dataset/split_dataset/sentiment/'           # contains files with sentiments and the combined file
# Trailing slash added for consistency with the other directory constants
done_sentiment_directory = './dataset/split_dataset/sentiment/done/' # contains sentiment files that were already merged

#### Splitting the movie review records into several smaller files

In [3]:
def split_and_save_csv(input_file, output_prefix, chunk_size):
    """Split a CSV file into numbered part files of at most ``chunk_size`` rows.

    Args:
        input_file: Path of the CSV file to read.
        output_prefix: Path prefix of the part files; part ``k`` (1-based) is
            written to ``"<output_prefix>_<k>.csv"``.
        chunk_size: Maximum number of records per part file.
    """
    frame = pd.read_csv(input_file)

    # Walk over the frame in consecutive windows of chunk_size rows
    row_starts = range(0, len(frame), chunk_size)
    for part_number, start in enumerate(row_starts, start=1):
        piece = frame[start:start + chunk_size]
        destination = f"{output_prefix}_{part_number}.csv"

        # The row index is not part of the data, so it is not written out
        piece.to_csv(destination, index=False)
        print(f"Saved {len(piece)} records to {destination}")


# Source file to split and the common prefix shared by all generated part files
input_csv = f"{source_data_directory}{filename}.csv"
output_prefix = f"{split_dataset_directory}{filename}"
chunk_size = 100  # number of records per split file

# The destination folder has to exist before the part files are written
if not os.path.exists(split_dataset_directory):
    os.makedirs(split_dataset_directory)

split_and_save_csv(input_csv, output_prefix, chunk_size)


#### Assigning sentiment labels on each movie review record using RoBERTa

In [4]:
# Checkpoint identifier passed to both from_pretrained() calls below
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# The tokenizer and the TensorFlow sequence-classification model are loaded once
# when this cell runs and are shared as globals by polarity_score_roberta()
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)

# Initialize the English stop words list
# (kept as a set and handed to tp.remove_stopwords() in data_preprocessing())
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally - confirm
list_of_stopwords = set(stopwords.words('english'))


def data_preprocessing(text):
    """Run the basic cleaning pipeline on one review and return the result.

    The text is passed, in this order, through the ``text_preprocessing``
    helpers ``remove_email_address``, ``remove_hyperlink``,
    ``replace_whitespace`` and ``remove_stopwords`` (the latter with the
    module-level English stop word set).
    NOTE(review): the exact behaviour of each helper lives in the
    ``text_preprocessing`` module and is only assumed from its name here.
    """
    without_emails = tp.remove_email_address(text)
    without_links = tp.remove_hyperlink(without_emails)
    normalized_spacing = tp.replace_whitespace(without_links)
    return tp.remove_stopwords(normalized_spacing, list_of_stopwords)


def polarity_score_roberta(data):
    """Score a single review with the RoBERTa sentiment model.

    Args:
        data: Raw review text.

    Returns:
        A dict with the keys ``roberta_neg``, ``roberta_neu`` and
        ``roberta_pos`` holding the softmax-normalised scores taken from
        positions 0, 1 and 2 of the model output respectively.
    """
    # Do basic data pre-processing. The cleaned text has to be re-assigned:
    # the helper returns a new string, so discarding its result would leave
    # the model scoring the raw, uncleaned review.
    data = data_preprocessing(data)

    # Specify the maximum sequence length
    max_length = 512  # Adjust this based on the model's maximum sequence length

    # Tokenize and truncate/pad the input text
    encoded_text = tokenizer(data, return_tensors='tf', max_length=max_length, truncation=True, padding=True)

    output = model(**encoded_text)
    # output[0][0]: raw scores of the first (and only) sequence in the batch
    scores = softmax(output[0][0].numpy())

    return {
        "roberta_neg": scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    }

# Define the function to generate labels
# Define the function to generate labels
def generate_roberta_labels(data, positive_threshold=0.5, negative_threshold=0.5, strong_threshold=0.8):
    """Turn the RoBERTa scores of one review into a sentiment label.

    The class with the highest score wins. A winning positive or negative
    class is reported as "Strongly ..." when its score reaches
    ``strong_threshold``.

    Args:
        data: Raw review text.
        positive_threshold: Currently unused; kept so existing callers that
            pass it keep working.
        negative_threshold: Currently unused; kept so existing callers that
            pass it keep working.
        strong_threshold: Minimum winning score for the "Strongly" variants.
            Defaults to 0.8, the cutoff that used to be hard-coded.

    Returns:
        One of "Strongly Positive", "Positive", "Strongly Negative",
        "Negative" or "Neutral".
    """
    # Get roberta scores
    scores = polarity_score_roberta(data)

    # Order matters: max() keeps the first entry among equal scores, so ties
    # resolve as positive, then negative, then neutral.
    sentiment_results = (
        ('positive', scores['roberta_pos']),
        ('negative', scores['roberta_neg']),
        ('neutral', scores['roberta_neu']),
    )
    top_label, top_score = max(sentiment_results, key=lambda item: item[1])

    if top_label == 'neutral':
        return "Neutral"

    is_strong = top_score >= strong_threshold
    if top_label == 'positive':
        return "Strongly Positive" if is_strong else "Positive"
    return "Strongly Negative" if is_strong else "Negative"


def process_dataframe(df):
    """Add a ``roberta_sentiment`` column to ``df`` and return it.

    Every value of the ``review_detail`` column is labelled with
    ``generate_roberta_labels``. The frame is modified in place; the same
    object is returned.
    """
    review_texts = df['review_detail']
    df['roberta_sentiment'] = review_texts.apply(generate_roberta_labels)
    return df


def parallel_processing(df, func, num_threads=3):
    """Apply ``func`` to row-wise partitions of ``df`` using a thread pool.

    The frame is cut into ``num_threads`` consecutive partitions whose sizes
    differ by at most one row (the larger ones come first). Each partition is
    an independent copy, so ``func`` may add columns to it without touching
    ``df``. The partition results are concatenated in their original order.

    Args:
        df: DataFrame to process.
        func: Callable taking a DataFrame partition and returning a DataFrame.
        num_threads: Number of partitions and of worker threads.

    Returns:
        The concatenated results with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``num_threads`` is smaller than 1.
    """
    # NOTE: Change the number of threads depending on device's CPU core
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")

    # Split the DataFrame into chunks for parallel processing.
    # Explicit iloc slicing is used instead of np.array_split, which relies on
    # the deprecated DataFrame.swapaxes when it is given a DataFrame.
    base_size, remainder = divmod(len(df), num_threads)
    chunks = []
    start = 0
    for position in range(num_threads):
        stop = start + base_size + (1 if position < remainder else 0)
        # Copy so that each worker owns its rows and can modify them safely
        chunks.append(df.iloc[start:stop].copy())
        start = stop

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(func, chunks))

    # Concatenate the results
    return pd.concat(results, ignore_index=True)


def list_files(directory):
    """Return the paths of the regular files located directly in ``directory``.

    Sub-directories are left out and not descended into. Each returned path
    is ``directory`` joined with the entry name, in ``os.listdir`` order.
    """
    entry_paths = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [path for path in entry_paths if os.path.isfile(path)]
   

if __name__ == "__main__":

    # Only CSV files are valid inputs. Anything else that ended up in the
    # folder is skipped up front instead of being read and reported as an
    # error. Sorting makes the processing order reproducible.
    files_in_directory = sorted(
        path for path in list_files(split_dataset_directory)
        if path.lower().endswith('.csv')
    )

    # Create 'done' directory inside the 'split_dataset' directory
    if not os.path.exists(done_split_dataset_directory):
        print("Creating 'done' directory...")
        os.makedirs(done_split_dataset_directory)

    # Create 'sentiment' directory inside the 'split_dataset' directory
    if not os.path.exists(sentiment_directory):
        print("Creating 'sentiment' directory...")
        os.makedirs(sentiment_directory)

    for input_filepath in files_in_directory:

        # Each file is handled in its own try block so that one failing file
        # is reported and left in place while the remaining files still run.
        try:
            base_filename = os.path.basename(input_filepath)
            no_ext_base_filename = os.path.splitext(base_filename)[0]
            output_filepath = os.path.join(sentiment_directory, no_ext_base_filename + "_sentiment.csv")

            print(f"Reading source the file: {input_filepath}")
            movie_reviews_df = pd.read_csv(input_filepath)

            print("Parallel processing of movie reviews...")
            movie_reviews_df = parallel_processing(movie_reviews_df, process_dataframe)

            # Save to csv file
            movie_reviews_df.to_csv(output_filepath, index=False)

            print(f"\nSuccessfully saved the file with sentiments in {output_filepath}")

            # Move the file that was processed to the done folder, so that a
            # re-run only picks up the files that are still pending
            destination_file = os.path.join(done_split_dataset_directory, base_filename)
            os.rename(input_filepath, destination_file)

            print(f"\nMove source file to done folder: {destination_file}")
        except Exception as err:
            print(f"ERROR: {err}")
            print(f"File: {input_filepath}")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Reading source the file: ./dataset/split_dataset/reviews_4201.csv
Parallel processing of movie reviews...


#### Merging several movie review files into a single file

In [None]:
# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(sentiment_directory) if file.endswith('.csv')]

if not csv_files:
    print("No CSV files found in the specified directory.")
else:
    # The 'done' folder has to exist before files can be moved into it;
    # otherwise every os.rename below fails.
    os.makedirs(done_sentiment_directory, exist_ok=True)

    # Collect the frames and concatenate once at the end instead of growing
    # a DataFrame inside the loop (which re-copies all rows on every step).
    frames = []
    for file in csv_files:
        file_path = os.path.join(sentiment_directory, file)
        try:
            frames.append(pd.read_csv(file_path))

            # Move the file that was processed to the done folder
            destination_file = os.path.join(done_sentiment_directory, file)
            os.rename(file_path, destination_file)

            print(f"Move source file to done folder: {destination_file}")
        except Exception as err:
            # A file that cannot be read or moved is reported; the remaining
            # files are still merged.
            print(f"ERROR: {err}")
            print(f"Filename: {file_path}")

    # pd.concat rejects an empty list, so fall back to an empty frame when
    # no file could be read
    combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the combined DataFrame to a new CSV file
    combined_data.to_csv(os.path.join(sentiment_directory, 'combined_reviews.csv'), index=False)

    print("CSV files combined successfully.")