# Analyzing Trends in AI Research Publication:
# Use Case Analysis
---


# Prepare Environment

## Import Packages

In [None]:
import os
import subprocess
import sys

try:
    # Every command is run through the interpreter executing this notebook
    # (sys.executable), so packages land in the environment that will import them.
    # check=True makes a non-zero exit status raise CalledProcessError; os.system
    # only returned the status, so failures used to be reported as success.
    install_commands = [
        # spaCy and its small English model
        [sys.executable, "-m", "pip", "install", "spacy"],
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        # Wordcloud
        [sys.executable, "-m", "pip", "install", "wordcloud"],
        # NetworkX
        [sys.executable, "-m", "pip", "install", "networkx"],
        # bokeh
        [sys.executable, "-m", "pip", "install", "bokeh"],
        # Colorcet
        [sys.executable, "-m", "pip", "install", "colorcet"],
    ]
    for command in install_commands:
        subprocess.run(command, check=True)

    # Only reached when every command exited with status 0
    print("Installation successful.")

except (subprocess.CalledProcessError, OSError) as e:
    # Display the error (failed command / exit status, or failure to launch it)
    print(f"Error:{str(e)}")

Installation successful.


In [None]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, udf
from pyspark.sql.types import ArrayType, StringType, IntegerType, FloatType, StructField, StructType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.clustering import LDA

from itertools import combinations

import pandas as pd
import numpy as np

from wordcloud import WordCloud
import spacy
import networkx as nx

import colorcet as cc
import matplotlib.colors
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
from bokeh.embed import file_html 
from bokeh.models import Circle, MultiLine, Plot, Range1d, HoverTool, LinearColorMapper
from bokeh.models.graphs import NodesAndLinkedEdges
from bokeh.models.tools import WheelZoomTool, BoxZoomTool, ResetTool, PanTool
from bokeh.palettes import Greys256 as colors
from bokeh.plotting import from_networkx
from bokeh.resources import CDN

### Arxiv Database in Hive Metastore

In [None]:
# Create the 'arxiv' database when the catalog does not have it yet
arxiv_db_present = spark.catalog.databaseExists("arxiv")
if not arxiv_db_present:
    spark.sql("CREATE DATABASE arxiv")

# Make 'arxiv' the current database for the statements that follow
spark.sql("USE arxiv")

DataFrame[]

## Constants

In [None]:
# Root of the arXiv storage area. Delta table paths are built as
# f"{BASE_PATH}{layer}/delta/{table_name}/", so the trailing slash is required.
BASE_PATH = "/mnt/arxiv/"
# Directory listed by load_latest_parquet() to find ingested Parquet files
# (note: no trailing slash).
INGESTION_PATH = "/mnt/arxiv/bronze/api"

## Functions

In [None]:
def load_latest_parquet():
    """
    Load the most recent Parquet file from INGESTION_PATH into a DataFrame.

    "Most recent" is decided by name: among the entries whose name ends with
    '.parquet', the one that sorts last lexicographically is read.

    Returns:
        DataFrame: The contents of the selected Parquet file.

    Raises:
        FileNotFoundError: If INGESTION_PATH contains no '.parquet' entry.
    """
    parquet_files = [
        entry.name
        for entry in dbutils.fs.ls(INGESTION_PATH)
        if entry.name.endswith('.parquet')
    ]
    if not parquet_files:
        raise FileNotFoundError(f"No Parquet files found in '{INGESTION_PATH}'.")

    latest_file = max(parquet_files)

    # INGESTION_PATH carries no trailing slash, so the separator has to be added
    # explicitly; plain concatenation produced ".../apifile.parquet".
    return spark.read.parquet(f"{INGESTION_PATH.rstrip('/')}/{latest_file}")


In [None]:
def delta_table_exists(layer, table_name):
    """
    Check if Delta table exists.

    The table is considered to exist when its '_delta_log/' directory under
    f"{BASE_PATH}{layer}/delta/{table_name}/" can be listed.

    Args:
        layer (str): Storage layer the table lives in.
        table_name (str): The name of the Delta table.

    Returns:
        bool: True if listing the '_delta_log/' directory succeeds, False if it
        raises any Exception.
    """
    table_path = f"{BASE_PATH}{layer}/delta/{table_name}/_delta_log/"
    try:
        dbutils.fs.ls(table_path)  # Listing raises when the directory is missing
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as "no table".
        return False
    return True


In [None]:
def _write_and_register_delta(table_name, delta_path, data):
    """Overwrite the Delta files at delta_path with data and register table_name at that location."""
    data.write.format("delta").mode("overwrite").save(delta_path)

    # Register the Delta table in the metastore (current database)
    spark.sql(f"""
    CREATE TABLE {table_name}
    USING DELTA
    LOCATION '{delta_path}'
    """)


def create_or_update_delta(layer, table_name, data_source=None, 
                           join_on=["id", "last_update_date"], recreate=False):
    """
    Create, append, or recreate a Delta table in the specified layer, 
    register the table in the Hive metastore, and display the 
    first five rows of the Delta table.

    Behaviour depends on whether the table's '_delta_log/' directory exists:
      * missing            -> write the data and register the table;
      * present, recreate  -> drop the table, delete its files, write and register again;
      * present, otherwise -> append only the rows of the new data that have no
                              match in the existing table on the `join_on` columns.

    Args:
        layer (str): The layer (silver or gold) in which to create/append/recreate the Delta table.
        table_name (str): The name of the Delta table.
        data_source (DataFrame, optional): The Spark DataFrame to be loaded. 
                                           If None, the latest Parquet file from the ingestion path is used.
        join_on (list, optional): List of columns to join on when deduplicating data. Default is ["id", "last_update_date"].
                                  The default list is only read, never mutated.
        recreate (bool, optional): If True, drop and recreate the existing Delta table. Default is False.

    Returns:
        None. Progress messages are printed and the rows are shown with display().
    """
    delta_path = f"{BASE_PATH}{layer}/delta/{table_name}/"

    def load_data():
        # Resolve the data to write: the caller's DataFrame, or the latest ingested Parquet file
        if data_source is None:
            print("Loading data from the latest Parquet file...")
            return load_latest_parquet()
        print("Using provided DataFrame as data source...")
        return data_source  # Assuming data_source is a DataFrame

    if not delta_table_exists(layer, table_name):
        print(f"The Delta table '{table_name}' does not exist. Creating a new table...")

        # This is the first run
        _write_and_register_delta(table_name, delta_path, load_data())

        print(f"The Delta table '{table_name}' has been created.")
    else:
        print(f"The Delta table '{table_name}' already exists.")

        if recreate:
            print(f"Recreating the Delta table '{table_name}'...")

            # Drop the metastore entry, then remove the table's files
            spark.sql(f"DROP TABLE IF EXISTS {table_name}")
            dbutils.fs.rm(delta_path, recurse=True)

            # Write the data afresh and register the table again
            _write_and_register_delta(table_name, delta_path, load_data())

            print(f"The Delta table '{table_name}' has been recreated.")
        else:
            print(f"Appending new data to the existing Delta table '{table_name}'...")

            new_data = load_data()
            existing_data = spark.read.format("delta").load(delta_path)

            # left_anti keeps only the new rows without a match on `join_on`
            # in the existing table, so already-stored rows are not appended again
            new_data = new_data.join(existing_data, join_on, "left_anti")
            new_data.write.format("delta").mode("append").save(delta_path)

            print(f"New data has been appended to the Delta table '{table_name}'.")

    # Display the first five rows of the Delta table. The limit makes the output
    # match the message and the docstring; previously the whole table was passed
    # to display().
    print("Displaying the first five rows of the Delta table...")
    display(spark.read.format("delta").load(delta_path).limit(5))


## Create DataFrame from Preprocessed table

In [None]:
# Read the 'preprocessed' table and keep papers published on or after 2010-01-01
preprocessed_df = spark.table("preprocessed").where(F.col("published_date") >= "2010-01-01")
display(preprocessed_df)

id,title,summary,authors,categories,published_date,published_time,last_update_date,last_update_time
704.0954,Sensor Networks with Random Links: Topology Design for Distributed  Consensus,"In a sensor network, in practice, the communication among sensors is subject to:(1) errors or failures at random times; (3) costs; and(2) constraints since sensors and networks operate under scarce resources, such as power, data rate, or communication. The signal-to-noise ratio (SNR) is usually a main factor in determining the probability of error (or of communication failure) in a link. These probabilities are then a proxy for the SNR under which the links operate. The paper studies the problem of designing the topology, i.e., assigning the probabilities of reliable communication among sensors (or of link failures) to maximize the rate of convergence of average consensus, when the link communication costs are taken into account, and there is an overall communication budget constraint. To consider this problem, we address a number of preliminary issues: (1) model the network as a random topology; (2) establish necessary and sufficient conditions for mean square sense (mss) and almost sure (a.s.) convergence of average consensus when network links fail; and, in particular, (3) show that a necessary and sufficient condition for both mss and a.s. convergence is for the algebraic connectivity of the mean graph describing the network topology to be strictly positive. With these results, we formulate topology design, subject to random link failures and to a communication cost constraint, as a constrained convex optimization problem to which we apply semidefinite programming techniques. We show by an extensive numerical study that the optimal design improves significantly the convergence speed of the consensus algorithm and can achieve the asymptotic performance of a non-random network at a fraction of the communication cost.","List(Soummya Kar, Jose M. F. Moura)","List(cs.IT, cs.LG, math.IT)",2007-04-06,21:58:52,2007-04-06,21:58:52
704.0985,Architecture for Pseudo Acausal Evolvable Embedded Systems,"Advances in semiconductor technology are contributing to the increasing complexity in the design of embedded systems. Architectures with novel techniques such as evolvable nature and autonomous behavior have engrossed lot of attention. This paper demonstrates conceptually evolvable embedded systems can be characterized basing on acausal nature. It is noted that in acausal systems, future input needs to be known, here we make a mechanism such that the system predicts the future inputs and exhibits pseudo acausal nature. An embedded system that uses theoretical framework of acausality is proposed. Our method aims at a novel architecture that features the hardware evolability and autonomous behavior alongside pseudo acausality. Various aspects of this architecture are discussed in detail along with the limitations.","List(Mohd Abubakr, R. M. Vinay)","List(cs.NE, cs.AI)",2007-04-07,13:40:49,2007-04-07,13:40:49
704.1198,A Doubly Distributed Genetic Algorithm for Network Coding,"We present a genetic algorithm which is distributed in two novel ways: along genotype and temporal axes. Our algorithm first distributes, for every member of the population, a subset of the genotype to each network node, rather than a subset of the population to each. This genotype distribution is shown to offer a significant gain in running time. Then, for efficient use of the computational resources in the network, our algorithm divides the candidate solutions into pipelined sets and thus the distribution is in the temporal domain, rather that in the spatial domain. This temporal distribution may lead to temporal inconsistency in selection and replacement, however our experiments yield better efficiency in terms of the time to convergence without incurring significant penalties.","List(Minkyu Kim, Varun Aggarwal, Una-May O'Reilly, Muriel Medard)","List(cs.NE, cs.NI)",2007-04-10,13:36:44,2007-04-10,13:36:44
704.1783,Unicast and Multicast Qos Routing with Soft Constraint Logic Programming,"We present a formal model to represent and solve the unicast/multicast routing problem in networks with Quality of Service (QoS) requirements. To attain this, first we translate the network adapting it to a weighted graph (unicast) or and-or graph (multicast), where the weight on a connector corresponds to the multidimensional cost of sending a packet on the related network link: each component of the weights vector represents a different QoS metric value (e.g. bandwidth, cost, delay, packet loss). The second step consists in writing this graph as a program in Soft Constraint Logic Programming (SCLP): the engine of this framework is then able to find the best paths/trees by optimizing their costs and solving the constraints imposed on them (e.g. delay < 40msec), thus finding a solution to QoS routing problems. Moreover, c-semiring structures are a convenient tool to model QoS metrics. At last, we provide an implementation of the framework over scale-free networks and we suggest how the performance can be improved.","List(Stefano Bistarelli, Ugo Montanari, Francesca Rossi, Francesco Santini)","List(cs.LO, cs.AI, cs.NI)",2007-04-13,15:53:44,2008-04-21,17:25:06
704.2092,A Note on the Inapproximability of Correlation Clustering,"We consider inapproximability of the correlation clustering problem defined as follows: Given a graph $G = (V,E)$ where each edge is labeled either ""+"" (similar) or ""-"" (dissimilar), correlation clustering seeks to partition the vertices into clusters so that the number of pairs correctly (resp. incorrectly) classified with respect to the labels is maximized (resp. minimized). The two complementary problems are called MaxAgree and MinDisagree, respectively, and have been studied on complete graphs, where every edge is labeled, and general graphs, where some edge might not have been labeled. Natural edge-weighted versions of both problems have been studied as well. Let S-MaxAgree denote the weighted problem where all weights are taken from set S, we show that S-MaxAgree with weights bounded by $O(|V|^{1/2-\delta})$ essentially belongs to the same hardness class in the following sense: if there is a polynomial time algorithm that approximates S-MaxAgree within a factor of $\lambda = O(\log{|V|})$ with high probability, then for any choice of S', S'-MaxAgree can be approximated in polynomial time within a factor of $(\lambda + \epsilon)$, where $\epsilon > 0$ can be arbitrarily small, with high probability. A similar statement also holds for $S-MinDisagree. This result implies it is hard (assuming $NP \neq RP$) to approximate unweighted MaxAgree within a factor of $80/79-\epsilon$, improving upon a previous known factor of $116/115-\epsilon$ by Charikar et. al. \cite{Chari05}.",List(Jinsong Tan),"List(cs.LG, cs.DS)",2007-04-17,03:52:41,2009-03-23,03:22:02
704.2644,Joint universal lossy coding and identification of stationary mixing  sources,"The problem of joint universal source coding and modeling, treated in the context of lossless codes by Rissanen, was recently generalized to fixed-rate lossy coding of finitely parametrized continuous-alphabet i.i.d. sources. We extend these results to variable-rate lossy block coding of stationary ergodic sources and show that, for bounded metric distortion measures, any finitely parametrized family of stationary sources satisfying suitable mixing, smoothness and Vapnik-Chervonenkis learnability conditions admits universal schemes for joint lossy source coding and identification. We also give several explicit examples of parametric sources satisfying the regularity conditions.",List(Maxim Raginsky),"List(cs.IT, cs.LG, math.IT)",2007-04-20,01:25:22,2007-04-20,01:25:22
704.2668,Supervised Feature Selection via Dependence Estimation,"We introduce a framework for filtering features that employs the Hilbert-Schmidt Independence Criterion (HSIC) as a measure of dependence between the features and the labels. The key idea is that good features should maximise such dependence. Feature selection for various supervised learning problems (including classification and regression) is unified under this framework, and the solutions can be approximated using a backward-elimination algorithm. We demonstrate the usefulness of our method on both artificial and real world datasets.","List(Le Song, Alex Smola, Arthur Gretton, Karsten Borgwardt, Justin Bedo)",List(cs.LG),2007-04-20,08:26:29,2007-04-20,08:26:29
704.2725,Exploiting Heavy Tails in Training Times of Multilayer Perceptrons: A  Case Study with the UCI Thyroid Disease Database,"The random initialization of weights of a multilayer perceptron makes it possible to model its training process as a Las Vegas algorithm, i.e. a randomized algorithm which stops when some required training error is obtained, and whose execution time is a random variable. This modeling is used to perform a case study on a well-known pattern recognition benchmark: the UCI Thyroid Disease Database. Empirical evidence is presented of the training time probability distribution exhibiting a heavy tail behavior, meaning a big probability mass of long executions. This fact is exploited to reduce the training time cost by applying two simple restart strategies. The first assumes full knowledge of the distribution yielding a 40% cut down in expected time with respect to the training without restarts. The second, assumes null knowledge, yielding a reduction ranging from 9% to 23%.","List(Manuel Cebrian, Ivan Cantador)",List(cs.NE),2007-04-20,15:58:04,2007-12-07,03:06:49
704.3395,General-Purpose Computing on a Semantic Network Substrate,"This article presents a model of general-purpose computing on a semantic network substrate. The concepts presented are applicable to any semantic network representation. However, due to the standards and technological infrastructure devoted to the Semantic Web effort, this article is presented from this point of view. In the proposed model of computing, the application programming interface, the run-time program, and the state of the computing virtual machine are all represented in the Resource Description Framework (RDF). The implementation of the concepts presented provides a practical computing paradigm that leverages the highly-distributed and standardized representational-layer of the Semantic Web.",List(Marko A. Rodriguez),"List(cs.AI, cs.PL)",2007-04-25,15:37:52,2010-06-06,05:29:22
704.3662,An Automated Evaluation Metric for Chinese Text Entry,"In this paper, we propose an automated evaluation metric for text entry. We also consider possible improvements to existing text entry evaluation metrics, such as the minimum string distance error rate, keystrokes per character, cost per correction, and a unified approach proposed by MacKenzie, so they can accommodate the special characteristics of Chinese text. Current methods lack an integrated concern about both typing speed and accuracy for Chinese text entry evaluation. Our goal is to remove the bias that arises due to human factors. First, we propose a new metric, called the correction penalty (P), based on Fitts' law and Hick's law. Next, we transform it into the approximate amortized cost (AAC) of information theory. An analysis of the AAC of Chinese text input methods with different context lengths is also presented.","List(Mike Tian-Jian Jiang, James Zhan, Jaimie Lin, Jerry Lin, Wen-Lien Hsu)","List(cs.HC, cs.CL)",2007-04-27,05:34:10,2007-04-27,05:34:10


# Authors Collaboration Analysis

In [None]:
# Step 1: One row per (paper, author) - explode the authors array into an 'author' column
authors_exploded_df = preprocessed_df.select(
    "id",
    "authors",
    F.explode(F.col("authors")).alias("author"),
)

# Step 2: Count rows per author and keep the authors with more than 30
papers_per_author_df = authors_exploded_df.groupBy('author').agg(F.count('*').alias('num_papers'))
prolific_authors_df = papers_per_author_df.filter('num_papers > 30')

# Step 3: Generate all possible co-author pairs for each publication
def generate_pairs(authors_list):
    """
    Return every unordered pair of distinct authors from one paper's author list.

    Args:
        authors_list (list[str] | None): Author names of a single paper.

    Returns:
        list[list[str]]: One two-element list per pair, each sorted alphabetically
        so the same two authors always produce the same pair. Empty when the
        input is None or has fewer than two distinct names.
    """
    if authors_list is None:
        # A null authors array would otherwise raise TypeError inside the UDF
        return []

    # Drop repeated names (keeping first-seen order) so that an author listed
    # twice on a paper does not yield a self-pair such as ["A", "A"]
    unique_authors = list(dict.fromkeys(authors_list))
    return [sorted(pair) for pair in combinations(unique_authors, 2)]

pairs_udf = F.udf(generate_pairs, ArrayType(ArrayType(StringType())))
pairs_df = preprocessed_df.withColumn("pairs", pairs_udf(F.col("authors")))

# Step 4: One row per pair occurrence, then count occurrences per pair
author_pairs_df = pairs_df.select(F.explode(F.col("pairs")).alias('pair'))
author_collaborations_df = (
    author_pairs_df
    .groupBy('pair')
    .agg(F.count('*').alias('num_collaborations'))
)

# Step 5: Keep pairs whose two members are both prolific (inner joins against the
# broadcast prolific-author list) and that collaborated more than 5 times
first_member_is_prolific = F.col("collabs.pair")[0] == F.col("prolific1.author")
second_member_is_prolific = F.col("collabs.pair")[1] == F.col("prolific2.author")
prolific_authors_collaborations_df = (
    author_collaborations_df.alias("collabs")
    .join(F.broadcast(prolific_authors_df.alias("prolific1")), first_member_is_prolific, "inner")
    .join(F.broadcast(prolific_authors_df.alias("prolific2")), second_member_is_prolific, "inner")
    .select("collabs.*")
    .filter(F.col('num_collaborations') > 5)
)

# Step 6: Expose the two members of 'pair' as the columns 'author1' and 'author2'
prolific_authors_collaborations_df = (
    prolific_authors_collaborations_df
    .withColumn('author1', F.col('pair')[0])
    .withColumn('author2', F.col('pair')[1])
)

# Step 7: Attach each member's paper count from the prolific-author list
prolific_authors_collaborations_df = (
    prolific_authors_collaborations_df
    .join(prolific_authors_df.alias("pa1"), F.col("author1") == F.col("pa1.author"), "left")
    .join(prolific_authors_df.alias("pa2"), F.col("author2") == F.col("pa2.author"), "left")
    .select(
        "pair",
        "author1",
        "author2",
        "num_collaborations",
        F.col("pa1.num_papers").alias("num_papers_author1"),
        F.col("pa2.num_papers").alias("num_papers_author2"),
    )
)

create_or_update_delta(
    "gold",
    "prolific_authors_collaborations",
    data_source=prolific_authors_collaborations_df,
    join_on=["pair"],
)

The Delta table 'prolific_authors_collaborations' does not exist. Creating a new table...
Using provided DataFrame as data source...
The Delta table 'prolific_authors_collaborations' has been created.
Displaying the first five rows of the Delta table...


pair,author1,author2,num_collaborations,num_papers_author1,num_papers_author2
"List(Ivan Laptev, Josef Sivic)",Ivan Laptev,Josef Sivic,28,81,58
"List(Liang Lin, Ruimao Zhang)",Liang Lin,Ruimao Zhang,12,242,55
"List(Bernt Schiele, Thomas Brox)",Bernt Schiele,Thomas Brox,6,178,109
"List(Baolin Peng, Jianfeng Gao)",Baolin Peng,Jianfeng Gao,35,49,263
"List(Jianfeng Gao, Sungjin Lee)",Jianfeng Gao,Sungjin Lee,7,263,36
"List(Ning Xu, Scott Cohen)",Ning Xu,Scott Cohen,7,75,33
"List(Manolis Savva, Thomas Funkhouser)",Manolis Savva,Thomas Funkhouser,10,45,62
"List(Chunhua Shen, Wei Liu)",Chunhua Shen,Wei Liu,6,355,344
"List(Michael W. Mahoney, Zhewei Yao)",Michael W. Mahoney,Zhewei Yao,21,126,43
"List(Kurt Keutzer, Michael W. Mahoney)",Kurt Keutzer,Michael W. Mahoney,19,127,126


In [None]:
# Stack both sides of every collaboration into a single 'author' column
df1, df2 = (
    prolific_authors_collaborations_df.select(F.col(side).alias("author"))
    for side in ("author1", "author2")
)
combined_df = df1.union(df2)

# Number of collaboration rows each author appears in; keep authors with more than 5
author_collab_count = combined_df.groupBy("author").agg(F.count("*").alias("collab_count"))
authors_multi_collab_df = author_collab_count.filter(F.col("collab_count") > 5)

# Prolific authors restricted to those retained above
prolifics_authors_filtered_df = prolific_authors_df.join(authors_multi_collab_df, "author", "inner")

create_or_update_delta(
    "gold",
    "prolific_authors",
    data_source=prolifics_authors_filtered_df,
    join_on=["author"],
)

## Graph visualization

In [None]:
from bokeh.palettes import Greys256


def _rescale_to_display_range(values, low, high, floor=5, span=20):
    """Linearly map values from [low, high] onto [floor, floor + span]."""
    value_range = high - low
    if value_range == 0:
        # Every value is identical: fall back to the smallest display value
        # instead of dividing by zero
        return [float(floor)] * len(values)
    return [span * (value - low) / value_range + floor for value in values]


# Keep only collaborations whose two authors both appear in prolifics_authors_filtered_df
collaborations_filtered_df = (
    prolific_authors_collaborations_df
    .join(prolifics_authors_filtered_df.withColumnRenamed("author", "author1").select("author1"), "author1", "inner")
    .join(prolifics_authors_filtered_df.withColumnRenamed("author", "author2").select("author2"), "author2", "inner")
)

# Build the graph on the driver: one edge per collaboration, weighted by its count
G = nx.Graph()
for row in collaborations_filtered_df.rdd.collect():
    G.add_edge(row.author1, row.author2, weight=row.num_collaborations)
for row in prolifics_authors_filtered_df.rdd.collect():
    if row.author in G.nodes:
        G.nodes[row.author]['num_papers'] = row.num_papers

# Relabel nodes with integer ids; the names are kept in id_to_author for the tooltips
author_to_id = {name: i for i, name in enumerate(G.nodes())}
id_to_author = {i: name for name, i in author_to_id.items()}
G = nx.relabel_nodes(G, author_to_id)

degree_dict = dict(G.degree(G.nodes()))
nx.set_node_attributes(G, degree_dict, 'degree')

min_degree = min(degree_dict.values())
max_degree = max(degree_dict.values())
min_papers = min(G.nodes[node_id]['num_papers'] for node_id in G.nodes)
max_papers = max(G.nodes[node_id]['num_papers'] for node_id in G.nodes)

graph_renderer = from_networkx(G, nx.spring_layout, scale=1.1, center=(0,0), k=0.4, iterations=100)

# Edge weights on a log scale, rescaled to the range 5..25
edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
edge_weights = [max(w, 1) for w in edge_weights]  # guard the logarithm against weights below 1
log_edge_weights = np.log(edge_weights)
min_log_weight = np.min(log_edge_weights)
max_log_weight = np.max(log_edge_weights)
normalized_log_weights = _rescale_to_display_range(log_edge_weights, min_log_weight, max_log_weight)

node_palette = [matplotlib.colors.to_hex(c) for c in cc.cm.blues(np.linspace(0, 1, 256))]
# NOTE(review): edge_palette is built but the edge mapper below uses the grey palette - confirm intent
edge_palette = [matplotlib.colors.to_hex(c) for c in cc.cm.CET_L19(np.linspace(0, 1, 256))]
# Reverse the grey palette starting from the pristine Greys256 import.
# `colors = colors[::-1]` flipped the palette again on every re-run of this cell.
colors = Greys256[::-1]
node_mapper = LinearColorMapper(palette=node_palette, low=min_degree, high=max_degree)
edge_mapper = LinearColorMapper(palette=colors, low=np.min(normalized_log_weights), high=np.max(normalized_log_weights))

# Per-node columns used by the glyphs and the hover tooltips
node_data = graph_renderer.node_renderer.data_source.data
node_data['author_name'] = [id_to_author[node_id] for node_id in node_data['index']]
node_data['num_papers'] = [G.nodes[node_id]['num_papers'] for node_id in node_data['index']]
node_data['node_size'] = _rescale_to_display_range(node_data['num_papers'], min_papers, max_papers)
node_data['degree'] = [G.nodes[node_id]['degree'] for node_id in node_data['index']]

# Per-edge columns used by the glyphs and the hover tooltips
edge_data = graph_renderer.edge_renderer.data_source.data
edge_data['collaborators'] = [f"{id_to_author[u]} & {id_to_author[v]}" for u, v in G.edges()]
edge_data['num_collaborations'] = edge_weights

graph_renderer.edge_renderer.data_source.data['normalized_log_weights'] = normalized_log_weights

# Node size follows the paper count, node colour the degree, edge colour the log weight
graph_renderer.node_renderer.glyph = Circle(size='node_size', fill_color={'field': 'degree', 'transform': node_mapper})
graph_renderer.edge_renderer.glyph = MultiLine(line_color={'field': 'normalized_log_weights', 'transform': edge_mapper}, line_alpha=0.8, line_width=2)

graph_renderer.node_renderer.hover_glyph = Circle(size=5, fill_color='#fdca40')
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color='#fdca40', line_width=4)

graph_renderer.inspection_policy = NodesAndLinkedEdges()

plot = Plot(width=800, height=800, x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))

node_hover_tool = HoverTool(tooltips=[("Author", "@author_name"), 
                                      ("Number of Papers", "@num_papers"),
                                      ("Number of Collaborations", "@degree")],
                           renderers=[graph_renderer.node_renderer])

edge_hover_tool = HoverTool(tooltips=[("Collaborators", "@collaborators"),
                                      ("Number of Collaborations", "@num_collaborations")],
                           renderers=[graph_renderer.edge_renderer])

plot.add_tools(node_hover_tool, edge_hover_tool, BoxZoomTool(), WheelZoomTool(), PanTool(), ResetTool())
plot.toolbar.active_inspect = [node_hover_tool]

plot.renderers.append(graph_renderer)

# Render as standalone HTML (BokehJS loaded from the CDN) and show it in the notebook
html = file_html(plot, CDN, "My interactive plot")
displayHTML(html)

# Advanced Text Analysis

## Lemmatized Word Clouds

In [None]:
# Load the spaCy model 'en_core_web_sm' once at module level;
# lemmatize_udf below references this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Using a Pandas UDF for lemmatization
@pandas_udf(ArrayType(StringType()))
def lemmatize_udf(tokens: pd.Series) -> pd.Series:
    """
    Lemmatize a batch of token lists with the module-level spaCy pipeline `nlp`.

    Args:
        tokens (pd.Series): One sequence of string tokens per row.

    Returns:
        pd.Series: One list of lemmas per row, in the same row order; lemmas of
        length 1 or less are dropped.
    """
    # Each row's tokens are joined into one text so spaCy can process it as a document
    texts = (" ".join(token_list) for token_list in tokens)

    # nlp.pipe streams the texts through the pipeline in batches and yields the
    # docs in input order, instead of one nlp() call per row
    lemmatized_tokens = [
        [token.lemma_ for token in doc if len(token.lemma_) > 1]
        for doc in nlp.pipe(texts)
    ]
    return pd.Series(lemmatized_tokens)

### On Title

In [None]:
# Tokenize the titles and strip stop words with Spark ML transformers
tokenizer_title = Tokenizer(inputCol="title", outputCol="raw_tokens_title")
stopwords_remover_title = StopWordsRemover(
    inputCol="raw_tokens_title",
    outputCol="filtered_tokens_title",
)

# Run both transformers over the preprocessed papers
tokenized_title_df = tokenizer_title.transform(preprocessed_df)
filtered_title_df = stopwords_remover_title.transform(tokenized_title_df)

# Lemmatize the remaining tokens with the pandas UDF
lemmatized_title_df = filtered_title_df.withColumn(
    'lemmatized_title',
    lemmatize_udf(F.col('filtered_tokens_title')),
)

# Word frequencies: one row per lemma, keep lemmas of at least 2 characters made
# only of letters, digits and hyphens, then take the 1,000 most frequent
title_words_df = lemmatized_title_df.withColumn('word', F.explode(F.col('lemmatized_title')))
title_word_is_valid = (F.length(F.col('word')) >= 2) & (F.col('word').rlike("^[a-zA-Z0-9-]+$"))
lemmatized_word_freq_title_df = (
    title_words_df
    .filter(title_word_is_valid)
    .groupBy('word')
    .count()
    .orderBy(F.col('count').desc())
    .limit(1000)
)

# Persist in the gold layer, deduplicating on 'word'
create_or_update_delta(
    "gold",
    "lemmatized_word_freq_title",
    data_source=lemmatized_word_freq_title_df,
    join_on=["word"],
)

display(lemmatized_word_freq_title_df)


### On Summary

In [None]:
# Tokenize the summaries and strip stop words with Spark ML transformers
tokenizer_summary = Tokenizer(inputCol="summary", outputCol="raw_tokens_summary")
stopwords_remover_summary = StopWordsRemover(
    inputCol="raw_tokens_summary",
    outputCol="filtered_tokens_summary",
)

# Run both transformers over the preprocessed papers
tokenized_summary_df = tokenizer_summary.transform(preprocessed_df)
filtered_summary_df = stopwords_remover_summary.transform(tokenized_summary_df)

# Lemmatize the remaining tokens with the pandas UDF
lemmatized_summary_df = filtered_summary_df.withColumn(
    'lemmatized_summary',
    lemmatize_udf(F.col('filtered_tokens_summary')),
)

# Word frequencies: one row per lemma, keep lemmas of at least 2 characters made
# only of letters, digits and hyphens, then take the 1,000 most frequent
summary_words_df = lemmatized_summary_df.withColumn('word', F.explode(F.col('lemmatized_summary')))
summary_word_is_valid = (F.length(F.col('word')) >= 2) & (F.col('word').rlike("^[a-zA-Z0-9-]+$"))
lemmatized_word_freq_summary_df = (
    summary_words_df
    .filter(summary_word_is_valid)
    .groupBy('word')
    .count()
    .orderBy(F.col('count').desc())
    .limit(1000)
)

# Persist in the gold layer, deduplicating on 'word'
create_or_update_delta(
    "gold",
    "lemmatized_word_freq_summary",
    data_source=lemmatized_word_freq_summary_df,
    join_on=["word"],
)

display(lemmatized_word_freq_summary_df)


# Topic Modelling (Categorization)

## LDA Analysis

### On Title

In [None]:
# Step 1 - Text Preprocessing

## Remove unwanted characters: everything except ASCII letters and whitespace.
## The pattern is a raw string because "\s" is an invalid escape sequence in a
## normal Python string literal; the pattern text passed to Spark is unchanged.
LDA_df = preprocessed_df.withColumn("title", F.regexp_replace(F.col("title"), r"[^a-zA-Z\s]", ""))

## Convert titles to tokens
tokenizer = Tokenizer(inputCol="title", outputCol="tokens")
df_tokens = tokenizer.transform(LDA_df)

## Remove stop words
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
df_tokens_filtered = remover.transform(df_tokens)

## Remove empty strings from each token array
df_tokens_filtered = df_tokens_filtered.withColumn("filtered_tokens", F.array_remove(df_tokens_filtered.filtered_tokens, ''))

In [None]:
# Step 2 - Vectorization

## Fit a CountVectorizer on the filtered tokens (vocabSize=20000, minDF=2) and
## turn every document into a vector of token counts
cv = CountVectorizer(
    inputCol="filtered_tokens",
    outputCol="raw_features",
    vocabSize=20000,
    minDF=2,
)
cv_model = cv.fit(df_tokens_filtered)
df_counts = cv_model.transform(df_tokens_filtered)

## Re-weight the raw counts with IDF to obtain the final 'features' column
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(df_counts)
df_features = idf_model.transform(df_counts)


In [None]:
# Step 3 - Topic Modeling

## Vocabulary learned by the CountVectorizer (indexable by term index)
vocab = cv_model.vocabulary

## Fit an LDA model with 20 topics on the feature vectors (fixed seed)
num_topics = 20
lda = LDA(seed=1, maxIter=100, k=num_topics)
lda_model = lda.fit(df_features)

## Describe every topic by its 10 highest-weighted terms
topics = lda_model.describeTopics(10)


In [None]:
# Step 4 - Display Topics

## Build a UDF that translates term indices into vocabulary words
def indices_to_terms(vocab):
    """Return a Spark UDF mapping an array of term indices to the matching entries of vocab."""
    def lookup_terms(term_indices):
        words = []
        for position in term_indices:
            words.append(vocab[position])
        return words

    return F.udf(lookup_terms, ArrayType(StringType()))

## Add a 'terms' column holding the words behind each topic's term indices
term_lookup_udf = indices_to_terms(vocab)
topics = topics.withColumn("terms", term_lookup_udf(F.col("termIndices")))

## Order the three parallel lists by descending weight
def sort_terms_weights(termIndices, termWeights, terms):
    """
    Sort termIndices, termWeights and terms together by weight, highest first.

    Returns a list of three lists [indices, weights, terms] in the new order,
    or an empty list when the inputs are empty.
    """
    rows = list(zip(termIndices, termWeights, terms))
    rows.sort(key=lambda row: row[1], reverse=True)
    return [list(column) for column in zip(*rows)]

sort_terms_weights_udf = F.udf(sort_terms_weights, ArrayType(ArrayType(StringType())))

## Sort termIndices, termWeights and terms
topics = topics.withColumn("sorted_terms_weights", sort_terms_weights_udf(F.col("termIndices"), F.col("termWeights"), F.col("terms")))

topics = topics.withColumn("termIndices", F.col("sorted_terms_weights")[0])\
               .withColumn("termWeights", F.col("sorted_terms_weights")[1])\
               .withColumn("terms", F.col("sorted_terms_weights")[2])\
               .drop("sorted_terms_weights")

topics.show(50, truncate=False)

In [None]:
# Step 5 - Categorize Papers

## Score every paper against the fitted LDA model (adds "topicDistribution")
transformed = lda_model.transform(df_features)


def _dominant_topic(distribution):
    """Return the position of the largest entry in a topic distribution."""
    return int(np.argmax(distribution))


## UDF that extracts the topic with the highest weight
argmax_udf = F.udf(_dominant_topic, IntegerType())

## Label each paper with its dominant topic, then list topics from most to least populated
labeled = transformed.withColumn('topic', argmax_udf(F.col('topicDistribution')))
papers_per_topic = labeled.groupBy('topic').count()
papers_per_topic.orderBy(F.col('count').desc()).show()

In [None]:
# Step 6 - Visualize

## Aggregate in Spark, then pull the small per-topic result into pandas for plotting
topic_counts = labeled.groupBy('topic').count().toPandas()

## Bar chart: one bar per topic, bar height = number of papers
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=topic_counts, x='topic', y='count', ax=ax)
ax.set_title('Number of Papers by Topic')
fig.tight_layout()
plt.show()

### On Summary

In [None]:
# Step 1 - Text Preprocessing

## Remove unwanted characters: keep only ASCII letters and whitespace in the summary.
## The pattern is a raw string so the regex class `\s` reaches Spark verbatim instead of
## depending on Python tolerating an unrecognised string escape (which emits a warning).
LDA_summary_df = preprocessed_df.withColumn("summary", F.regexp_replace(F.col("summary"), r"[^a-zA-Z\s]", ""))

## Convert summaries to tokens
tokenizer = Tokenizer(inputCol="summary", outputCol="tokens")
df_tokens = tokenizer.transform(LDA_summary_df)

## Remove stop words (no custom list is passed, so the remover's defaults apply)
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
df_tokens_filtered = remover.transform(df_tokens)

## Remove empty strings so that '' never ends up as a token in "filtered_tokens"
df_tokens_filtered = df_tokens_filtered.withColumn("filtered_tokens", F.array_remove(F.col("filtered_tokens"), ''))


# Step 2 - Vectorization

## Learn a vocabulary from the filtered summary tokens and encode every document as a
## vector of token counts
cv = CountVectorizer(
    inputCol="filtered_tokens",
    outputCol="raw_features",
    vocabSize=20000,
    minDF=2,
)
cv_model = cv.fit(df_tokens_filtered)
df_counts = cv_model.transform(df_tokens_filtered)

## Re-weight the raw counts with IDF; the result lands in the "features" column
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(df_counts)
df_features = idf_model.transform(df_counts)


# Step 3 - Topic Modeling

## Vocabulary learned by the CountVectorizer (position in the list == term index)
vocab = cv_model.vocabulary

## Fit LDA with a fixed seed so repeated runs use the same random initialisation
num_topics = 20
lda = LDA(seed=1, maxIter=100, k=num_topics)
lda_model = lda.fit(df_features)

## Describe every topic by its 10 highest-weighted terms
topics = lda_model.describeTopics(maxTermsPerTopic=10)

# Step 4 - Display Topics

## Factory for a UDF that translates vocabulary indices into their words
def indices_to_terms(vocab):
    """Return a Spark UDF mapping an array of term indices to an array of words.

    `vocab` is captured by the closure; every index is resolved as `vocab[index]`.
    """
    def lookup_terms(term_ids):
        return [vocab[term_id] for term_id in term_ids]

    return F.udf(lookup_terms, ArrayType(StringType()))

## Attach a "terms" column holding the words behind each topic's termIndices
terms_udf = indices_to_terms(vocab)
topics = topics.withColumn("terms", terms_udf(F.col("termIndices")))

## Function to sort termIndices, termWeights and terms
def sort_terms_weights(termIndices, termWeights, terms):
    """Order one topic's parallel term arrays by descending weight.

    Args:
        termIndices: vocabulary indices of the topic's terms.
        termWeights: weight of each term, aligned with termIndices.
        terms: the words themselves, aligned with termIndices.

    Returns:
        A list of three lists [indices, weights, terms], each reordered so the
        highest-weighted term comes first. Empty input yields three empty lists.
    """
    ranked = sorted(zip(termIndices, termWeights, terms), key=lambda entry: entry[1], reverse=True)
    if not ranked:
        # zip(*[]) yields nothing at all, which would leave the three result slots unfilled
        return [[], [], []]
    return [list(column) for column in zip(*ranked)]

## Return a struct with one typed field per array. Declaring the result as
## array<array<string>> makes Spark stringify the integer indices and the floating-point
## weights; reusing the column types already present on `topics` keeps them intact.
sorted_terms_schema = StructType([
    StructField("termIndices", topics.schema["termIndices"].dataType),
    StructField("termWeights", topics.schema["termWeights"].dataType),
    StructField("terms", ArrayType(StringType())),
])

## The UDF hands Spark a tuple so the three lists map positionally onto the struct fields
sort_terms_weights_udf = F.udf(
    lambda indices, weights, words: tuple(sort_terms_weights(indices, weights, words)),
    sorted_terms_schema,
)

## Sort termIndices, termWeights and terms
topics = topics.withColumn("sorted_terms_weights", sort_terms_weights_udf(F.col("termIndices"), F.col("termWeights"), F.col("terms")))

## Overwrite the three columns with their sorted versions and drop the helper struct
topics = topics.withColumn("termIndices", F.col("sorted_terms_weights").getField("termIndices"))\
               .withColumn("termWeights", F.col("sorted_terms_weights").getField("termWeights"))\
               .withColumn("terms", F.col("sorted_terms_weights").getField("terms"))\
               .drop("sorted_terms_weights")

topics.show(50, truncate=False)

### Categorize Papers

In [None]:
# Score every paper against the fitted LDA model (adds "topicDistribution")
transformed = lda_model.transform(df_features)


def _dominant_topic(distribution):
    """Return the position of the largest entry in a topic distribution."""
    return int(np.argmax(distribution))


# UDF that extracts the topic with the highest weight
argmax_udf = F.udf(_dominant_topic, IntegerType())

# Label each paper with its dominant topic, then list topics from most to least populated
labeled = transformed.withColumn('topic', argmax_udf(F.col('topicDistribution')))
papers_per_topic = labeled.groupBy('topic').count()
papers_per_topic.orderBy(F.col('count').desc()).show()

### Visualize Data

#### Topic Occurrences

In [None]:
# Aggregate in Spark, then pull the small per-topic result into pandas for plotting
topic_counts = labeled.groupBy('topic').count().toPandas()

# Bar chart: one bar per topic, bar height = number of papers
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=topic_counts, x='topic', y='count', ax=ax)
ax.set_title('Number of Papers by Topic')
fig.tight_layout()
plt.show()

#### Topic Visualization

In [None]:
# Translate term indices into the words stored at those positions of the global `vocab`
def map_termID_to_words(termIndices):
    """Return the vocabulary word for every entry of `termIndices`.

    Each entry is passed through int() first, so the lookup accepts indices
    delivered either as integers or as numeric strings.
    """
    words = []
    for raw_index in termIndices:
        position = int(raw_index)
        words.append(vocab[position])
    return words

# Wrap the index-to-word mapper as a Spark UDF that returns an array of strings
udf_map_termID_to_words = F.udf(map_termID_to_words, ArrayType(StringType()))

# Add a "topicWords" column with the actual words of each topic
topics_with_words = topics.withColumn("topicWords", udf_map_termID_to_words(F.col("termIndices")))

# Bring the per-topic word lists to the driver: one Python list of words per topic row
topic_rows = topics_with_words.select("topicWords").collect()
topic_words = [row["topicWords"] for row in topic_rows]

# Draw one word cloud per topic, titled with the topic's position in the collected list
for topic_position, words in enumerate(topic_words):
    cloud_text = ' '.join(words)
    word_cloud = WordCloud().generate(cloud_text)
    plt.figure()
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Topic {topic_position}')
    plt.show()
