# Analyzing Trends in AI Research Publication:
# From Silver To Gold
---


# Prepare Environment

## Import Packages

In [None]:
import subprocess
import sys

try:
    # Install NLTK into the interpreter that runs this notebook (sys.executable),
    # not whichever `python` happens to be first on PATH.
    subprocess.run([sys.executable, "-m", "pip", "install", "nltk"], check=True)

    # Download the stopword corpus used by the text-analysis cells below.
    subprocess.run([sys.executable, "-m", "nltk.downloader", "stopwords"], check=True)

except (subprocess.CalledProcessError, OSError) as e:
    # check=True turns a non-zero exit status into CalledProcessError, so a
    # failed install or download is reported instead of passing silently.
    print(f"Error:{str(e)}")

else:
    # Only reached when both commands exited with status 0.
    print("Installation successful.")

In [None]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F

from nltk.corpus import stopwords

### Arxiv Database in Hive Metastore

In [None]:
# Create the 'arxiv' database on the first run; later runs find it already present
arxiv_db_missing = not spark.catalog.databaseExists("arxiv")
if arxiv_db_missing:
    spark.sql("CREATE DATABASE arxiv")

# Make 'arxiv' the current database for the rest of the notebook
spark.sql("USE arxiv")

## Constants

In [None]:
# Prefix under which the per-layer Delta table paths are built
BASE_PATH = "/mnt/arxiv/"
# Directory listed by load_latest_parquet() when looking for ingested Parquet files
INGESTION_PATH = f"{BASE_PATH}bronze/api"

## Functions

In [None]:
def load_latest_parquet():
    """
    Load the most recent Parquet file from INGESTION_PATH into a DataFrame.

    "Most recent" means the entry whose name sorts last lexicographically
    among those ending in '.parquet' (presumably the file names embed a
    sortable timestamp -- verify against the ingestion job).

    Returns:
        DataFrame: The result of spark.read.parquet on the selected file.

    Raises:
        FileNotFoundError: If INGESTION_PATH contains no '.parquet' entries.
    """
    parquet_files = [
        entry.name
        for entry in dbutils.fs.ls(INGESTION_PATH)
        if entry.name.endswith('.parquet')
    ]
    if not parquet_files:
        raise FileNotFoundError(f"No .parquet files found in '{INGESTION_PATH}'")

    latest_file = max(parquet_files)

    # INGESTION_PATH has no trailing slash, so the separator is added here;
    # plain concatenation would yield '.../bronze/api<file>' instead of
    # '.../bronze/api/<file>'. rstrip keeps this correct if a slash is added later.
    return spark.read.parquet(f"{INGESTION_PATH.rstrip('/')}/{latest_file}")


In [None]:
def delta_table_exists(layer, table_name):
    """
    Check if a Delta table exists for the given layer and table name.

    The check lists the directory `{BASE_PATH}{layer}/delta/{table_name}/_delta_log/`;
    the table is considered to exist when that listing succeeds.

    Args:
        layer (str): Layer directory name used to build the path.
        table_name (str): Name of the Delta table.

    Returns:
        bool: True if the _delta_log directory can be listed, False otherwise.
    """
    table_path = f"{BASE_PATH}{layer}/delta/{table_name}/_delta_log/"
    try:
        # Listing fails when the _delta_log directory is absent
        dbutils.fs.ls(table_path)
    except Exception:
        # Any listing failure is treated as "table does not exist".
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return False
    return True


In [None]:
def create_or_update_delta(layer, table_name, data_source=None,
                           join_on=None, recreate=False):
    """
    Create, append to, or recreate a Delta table in the specified layer,
    register newly created tables in the Hive metastore, and display the
    Delta table.

    Behaviour depends on whether the table already exists (as reported by
    delta_table_exists) and on `recreate`:

    * Table missing: the data is written and the table is registered.
    * Table exists, recreate=True: the metastore entry and the files are
      removed, then the data is written and the table is registered again.
    * Table exists, recreate=False: only rows whose `join_on` values are not
      already present in the table are appended (left anti join). Rows whose
      keys already exist are skipped, not updated -- so for an aggregate table
      keyed by e.g. "category", previously stored counts are NOT refreshed;
      pass recreate=True to rebuild such a table.

    Args:
        layer (str): The layer (silver or gold) in which to create/append/recreate the Delta table.
        table_name (str): The name of the Delta table.
        data_source (DataFrame, optional): The Spark DataFrame to be loaded.
                                           If None, the latest Parquet file from the ingestion path is used.
        join_on (list, optional): List of columns to join on when deduplicating data.
                                  Default (None) means ["id", "last_update_date"].
        recreate (bool, optional): If True, drop and recreate the existing Delta table. Default is False.

    """
    # None sentinel instead of a list default, which would be shared across calls
    if join_on is None:
        join_on = ["id", "last_update_date"]

    delta_path = f"{BASE_PATH}{layer}/delta/{table_name}/"

    def load_data():
        """Return the provided DataFrame, or the latest ingested Parquet file."""
        if data_source is None:
            print("Loading data from the latest Parquet file...")
            return load_latest_parquet()
        print("Using provided DataFrame as data source...")
        return data_source  # Assuming data_source is a DataFrame

    def write_and_register(df):
        """Write df to delta_path (overwrite) and register it in the metastore."""
        df.write.format("delta").mode("overwrite").save(delta_path)

        # The table is registered under the current database; it points at the
        # files just written rather than copying them.
        spark.sql(f"""
        CREATE TABLE {table_name}
        USING DELTA
        LOCATION '{delta_path}'
        """)

    if not delta_table_exists(layer, table_name):
        print(f"The Delta table '{table_name}' does not exist. Creating a new table...")

        # This is the first run
        write_and_register(load_data())

        print(f"The Delta table '{table_name}' has been created.")
    elif recreate:
        print(f"The Delta table '{table_name}' already exists.")
        print(f"Recreating the Delta table '{table_name}'...")

        # Drop the metastore entry first, then remove the table's files
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
        dbutils.fs.rm(delta_path, recurse=True)

        # Rebuild the table from the data source and register it again
        write_and_register(load_data())

        print(f"The Delta table '{table_name}' has been recreated.")
    else:
        print(f"The Delta table '{table_name}' already exists.")
        print(f"Appending new data to the existing Delta table '{table_name}'...")

        new_data = load_data()
        existing_data = spark.read.format("delta").load(delta_path)

        # Keep only rows whose join_on values are absent from the existing table
        new_data = new_data.join(existing_data, join_on, "left_anti")

        new_data.write.format("delta").mode("append").save(delta_path)

        print(f"New data has been appended to the Delta table '{table_name}'.")

    # NOTE(review): the message says "first five rows", but no limit is applied
    # here -- the whole table is handed to display(). Confirm which is intended.
    print("Displaying the first five rows of the Delta table...")
    display(spark.read.format("delta").load(delta_path))


# Gold Layer

## Create DataFrame from Preprocessed table

In [None]:
# Read the 'preprocessed' table and keep rows with published_date >= 2010-01-01
published_since_2010 = F.col("published_date") >= "2010-01-01"
preprocessed_df = spark.table("preprocessed").filter(published_since_2010)
display(preprocessed_df)

## Group by Category

### Explode Categories

In [None]:
# Produce one row per element of 'categories', keeping every original column
# and adding the element as 'category'
exploded_category = F.explode(F.col("categories")).alias("category")
categories_exploded_df = preprocessed_df.select("*", exploded_category)

# Count how many different category values occur
distinct_categories = categories_exploded_df.select("category").distinct()
num_unique_categories = distinct_categories.count()

print(f"Numbers of unique categories in dataset: {num_unique_categories}")

display(categories_exploded_df)


### Filter by AI-related categories only

In [None]:
# arXiv category codes kept for the AI-focused analysis
selected_categories = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'cs.MA', 'cs.NE', 'cs.RO']

# Keep only the exploded rows whose category is one of the selected codes
is_selected_category = F.col('category').isin(selected_categories)
categories_filtered_df = categories_exploded_df.filter(is_selected_category)

display(categories_filtered_df)

### Number of publications by category

#### Unfiltered

In [None]:
# Row count per category over all exploded rows, largest count first
category_counts = categories_exploded_df.groupBy("category").count()
publications_by_category_unfiltered_df = category_counts.orderBy(F.col("count").desc())

create_or_update_delta(
    "gold",
    "publications_by_category_unfiltered",
    data_source=publications_by_category_unfiltered_df,
    join_on=["category"],
)

#### Filtered

In [None]:
# arXiv category code -> human-readable label
category_mapping = {
    'cs.AI': "Artificial Intelligence",
    'cs.CL': "Computational Linguistics",
    'cs.CV': "Computer Vision",
    'cs.LG': "Machine Learning",
    'cs.MA': "Multiagent Systems",
    'cs.NE': "Neural and Evolutionary Computing",
    'cs.RO': "Robotics"
}

# Build a single CASE WHEN chain: each code maps to its label, and any value
# that is not a key of the mapping is left unchanged
mapping_items = list(category_mapping.items())
first_code, first_label = mapping_items[0]
category_label_expr = F.when(F.col("category") == first_code, first_label)
for code, label in mapping_items[1:]:
    category_label_expr = category_label_expr.when(F.col("category") == code, label)
category_label_expr = category_label_expr.otherwise(F.col("category"))

# Replace the codes in 'category' with their labels
categories_filtered_df = categories_filtered_df.withColumn("category", category_label_expr)

# Row count per (renamed) category, largest count first
filtered_category_counts = categories_filtered_df.groupBy("category").count()
publications_by_category_filtered_df = filtered_category_counts.orderBy(F.col("count").desc())

create_or_update_delta(
    "gold",
    "publications_by_category_filtered",
    data_source=publications_by_category_filtered_df,
    join_on=["category"],
)

## Group by Author

### Explode Authors

In [None]:
# Produce one row per element of 'authors', carrying along 'id' and 'categories'
exploded_author = F.explode(F.col("authors")).alias("author")
authors_exploded_df = preprocessed_df.select("id", "categories", exploded_author)

# Count how many different author values occur
distinct_authors = authors_exploded_df.select("author").distinct()
num_unique_authors = distinct_authors.count()

print(f"Number of unique authors in dataset: {num_unique_authors}")

### Number of publications by author

In [None]:

# Row count per author over the exploded rows, largest count first
author_counts = authors_exploded_df.groupBy("author").count()
publications_by_author_df = author_counts.orderBy(F.col("count").desc())

create_or_update_delta(
    "gold",
    "publications_by_author",
    data_source=publications_by_author_df,
    join_on=["author"],
)

## Group by Publication Date

### Number of publications by date

In [None]:
# Row count per published_date, sorted by date ascending
date_counts = preprocessed_df.groupBy("published_date").count()
publications_by_date_df = date_counts.sort("published_date")

create_or_update_delta(
    "gold",
    "publications_by_date",
    data_source=publications_by_date_df,
    join_on=["published_date"],
)

### Number of publications by category by date

In [None]:
# Row count per (category, published_date) pair from the filtered categories,
# sorted by date and then by category
category_date_counts = categories_filtered_df.groupBy('category', 'published_date').count()
publications_by_category_by_date_df = category_date_counts.sort('published_date', 'category')

create_or_update_delta(
    "gold",
    "publications_by_category_by_date",
    data_source=publications_by_category_by_date_df,
    join_on=["category", "published_date"],
)

## Text Analysis

### Word Clouds

In [None]:
# NLTK's English stopword list, converted to a set for the word-frequency filters
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

#### On Title

In [None]:
# Lower-case each title, split it on runs of non-word characters (so punctuation
# is discarded), and emit one row per resulting token in a 'word' column
title_tokens = F.split(F.lower(F.col('title')), '\\W+')
words_df = preprocessed_df.withColumn('word', F.explode(title_tokens))

# Drop stopwords, then drop tokens shorter than two characters
title_word_is_stopword = F.col('word').isin(stop_words)
title_word_long_enough = F.length(F.col('word')) > 1
filtered_words_df = words_df.filter(~title_word_is_stopword).filter(title_word_long_enough)

# Occurrences per word, most frequent first
title_word_counts = filtered_words_df.groupBy('word').count()
word_freq_title_df = title_word_counts.orderBy(F.col('count').desc())

create_or_update_delta(
    "gold",
    "word_freq_title",
    data_source=word_freq_title_df,
    join_on=["word"],
)

#### On Summary

In [None]:
# Lower-case each summary, split it on runs of non-word characters (so punctuation
# is discarded), and emit one row per resulting token in a 'word' column
summary_tokens = F.split(F.lower(F.col('summary')), '\\W+')
words_df = preprocessed_df.withColumn('word', F.explode(summary_tokens))

# Drop stopwords, then drop tokens shorter than two characters
summary_word_is_stopword = F.col('word').isin(stop_words)
summary_word_long_enough = F.length(F.col('word')) > 1
filtered_words_df = words_df.filter(~summary_word_is_stopword).filter(summary_word_long_enough)

# Occurrences per word, most frequent first
summary_word_counts = filtered_words_df.groupBy('word').count()
word_freq_summary_df = summary_word_counts.orderBy(F.col('count').desc())

create_or_update_delta(
    "gold",
    "word_freq_summary",
    data_source=word_freq_summary_df,
    join_on=["word"],
)