In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 32 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 46.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=33426a81aab6defcea8433ab2f3f088c98ccef2528341e66185b15799cc2af74
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
# Import the basic spark library
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType, IntegerType
from pyspark.sql.functions import col, split, regexp_replace

# Create an entry point to the PySpark Application
spark = SparkSession.builder \
      .master("local") \
      .appName("SMBUD-Project") \
      .getOrCreate()
# master contains the URL of your remote spark instance or 'local'

In [None]:
schema_paper = StructType([
    StructField("id", StringType(), False),
    StructField("title", StringType(), False),
    StructField("year", IntegerType(), True),
    StructField("n_citation", IntegerType(), True),
    StructField("page_start", IntegerType(), True),
    StructField("page_end", IntegerType(), True),
    StructField("doc_type", StringType(), True),
    StructField("publisher", StringType(), True),
    StructField("volume", IntegerType(), True),
    StructField("issue", IntegerType(), True),
    StructField("doi", StringType(), True),
    StructField("references", StringType(), True),
    StructField("abstract", StringType(), True),
    StructField("venue_id", LongType(), True), 
    StructField("authors_id", StringType(), True),
    StructField("fos_id", StringType(), True)
])

In [None]:
schema_author = StructType([
    StructField("id", StringType(), False),
    StructField("name", StringType(), False),
    StructField("org", StringType(), True)
])

In [None]:
schema_venue = StructType([
    StructField("id", LongType(), False),
    StructField("name", StringType(), False),
])

In [None]:
schema_fos = StructType([
    StructField("id", StringType(), False),
    StructField("name", StringType(), False),
    StructField("weight", FloatType(), True)
])

In [None]:
paper_df = spark.read.option("header", True).option("delimiter", ",").option("quote", '"').option("escape", '"').schema(schema_paper).csv("paper_spark.csv")
author_df = spark.read.option("header", True).option("delimiter", ",").schema(schema_author).csv("author_spark.csv")
venue_df = spark.read.option("header", True).option("delimiter", ",").schema(schema_venue).csv("venue_spark.csv")
fos_df = spark.read.option("header", True).option("delimiter", ",").schema(schema_fos).csv("fos_spark.csv")
paper_df.show()


+----------+--------------------+----+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|        id|               title|year|n_citation|page_start|page_end|doc_type|           publisher|volume|issue|                 doi|          references|            abstract| venue_id|          authors_id|              fos_id|
+----------+--------------------+----+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
| 101421652|The influence of ...|2003|       139|       397|     423| Journal|Society for Infor...|    27|    3|    10.2307/30036539|[1516261653, 1978...|Managers in moder...| 57293258|[1973614237, 2113...|[101369, 102929, ...|
|1015675232|Research-paper re...|2016|       106|       305|     338| Journal|Springer B

In [None]:
paper_df = paper_df.withColumn("references", split(regexp_replace(col("references"), "[\\[\\]]", ""), ","))

paper_df = paper_df.withColumn("authors_id", split(regexp_replace(col("authors_id"), '[\\[\\]]', ""), ","))

paper_df = paper_df.withColumn("fos_id", split(regexp_replace(col("fos_id"), '[\\[\\]]', ""), ","))


paper_df.show() 
paper_df.printSchema()

+----------+--------------------+----+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|        id|               title|year|n_citation|page_start|page_end|doc_type|           publisher|volume|issue|                 doi|          references|            abstract| venue_id|          authors_id|              fos_id|
+----------+--------------------+----+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
| 101421652|The influence of ...|2003|       139|       397|     423| Journal|Society for Infor...|    27|    3|    10.2307/30036539|[1516261653,  197...|Managers in moder...| 57293258|[1973614237,  211...|[101369,  102929,...|
|1015675232|Research-paper re...|2016|       106|       305|     338| Journal|Springer B

In [None]:
author_df.show()

+-------+--------------------+--------------------+
|     id|                name|                 org|
+-------+--------------------+--------------------+
|  92905|Khan M. Iftekharu...|                null|
| 334463|     Manuel Calimlim|                null|
| 475755|     Brian B. Avants|University of Pen...|
|1167003|  Michele Angelaccio|Dipartimento di I...|
|1336878|Francisco Casacub...| Departament d'En...|
|1532108|        Petr Musilek|Department of Ele...|
|1541821|     Jean Tourrilhes|Hewlett Packard L...|
|1923294|      Rachel Karchin|Johns Hopkins Uni...|
|1998877|      Max Wintermark|                null|
|2049838|        Doina Precup|Univ. of Massachu...|
|2272828|     Daniel B. Szyld|Department of Mat...|
|2352948|    Andrzej Tarlecki|Institute of Info...|
|2452435|    Peter Langfelder|Human Genetics, D...|
|2730879| Alexander G. Parlos|                null|
|2791808|   Gregory R. Ganger|Carnegie Mellon U...|
|2808181|        Øystein Sæbø|Department of Inf...|
|2893888|Mic

In [None]:
venue_df.show()

+-------+--------------------+
|     id|                name|
+-------+--------------------+
| 235346|Wireless Communic...|
| 267729|International Tra...|
| 414566| Pattern Recognition|
|1137746|The Artist and Jo...|
|1250476|Journal of Intern...|
|2181302|     Artificial Life|
|2508752|Business Process ...|
|2642811|ACM Transactions ...|
|2793317|IEEE Personal Com...|
|3084184|Computer Networks...|
|3444786|    Neuroinformatics|
|4357572|Multiscale Modeli...|
|4502562|IEEE Transactions...|
|4650632|International Jou...|
|4921394|Synthesis Lecture...|
|5240358|IEEE Transactions...|
|5405189|Autonomous Agents...|
|5437875|Statistics and Co...|
|5743915|Journal of Comput...|
|6068926|Journal of Roboti...|
+-------+--------------------+
only showing top 20 rows



In [None]:
fos_df.show()

+------+--------------------+----------+
|    id|                name|    weight|
+------+--------------------+----------+
|100000|      1-planar graph|  0.705774|
|100001|               2-opt| 0.6810858|
|100002|              32-bit| 0.4352937|
|100003|3D computer graphics| 0.6133576|
|100004|         3D modeling|0.50386775|
|100005|  3D pose estimation|  0.655267|
|100006|       3D projection|0.44360808|
|100007|   3D reconstruction| 0.5311444|
|100008|        3D rendering|  0.561983|
|100009|3D single-object ...| 0.6807582|
|100010|             3G MIMO|0.71666193|
|100011|                3SUM|  0.539558|
|100012|   56-bit encryption|0.64342797|
|100013|            AC power| 0.4781577|
|100014|              AISoy1|0.65707874|
|100015|                AIXI|0.64854145|
|100016|                 AKA|       0.0|
|100017|               ALARM|0.47861043|
|100018|            ALGOL 68|0.54209197|
|100019|            AND gate|0.41407737|
+------+--------------------+----------+
only showing top

# Data Creation/Update Operations

Drop the column 'year' from paper_df

In [None]:
paper_df.drop(paper_df.year) \
  .show()

+----------+--------------------+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|        id|               title|n_citation|page_start|page_end|doc_type|           publisher|volume|issue|                 doi|          references|            abstract| venue_id|          authors_id|              fos_id|
+----------+--------------------+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
| 101421652|The influence of ...|       139|       397|     423| Journal|Society for Infor...|    27|    3|    10.2307/30036539|[1516261653,  197...|Managers in moder...| 57293258|[1973614237,  211...|[101369,  102929,...|
|1015675232|Research-paper re...|       106|       305|     338| Journal|Springer Berlin H...|    17|    4|1

Drop the columns 'page_start' and 'page_end' from paper_df

In [None]:
paper_df.drop("page_start","page_end") \
  .show()

+----------+--------------------+----+----------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|        id|               title|year|n_citation|doc_type|           publisher|volume|issue|                 doi|          references|            abstract| venue_id|          authors_id|              fos_id|
+----------+--------------------+----+----------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
| 101421652|The influence of ...|2003|       139| Journal|Society for Infor...|    27|    3|    10.2307/30036539|[1516261653,  197...|Managers in moder...| 57293258|[1973614237,  211...|[101369,  102929,...|
|1015675232|Research-paper re...|2016|       106| Journal|Springer Berlin H...|    17|    4|10.1007/s00799-01...|[1971040550,  201...|In the last 16 ye...|110615584|[20

Insert an author in paper_df

In [None]:
newRow = spark.createDataFrame([(1882405, "Elon Musk", "Tesla")], ["id","name","org"])

new_df = author_df.union(newRow)
new_df.filter(new_df.name == "Elon Musk").show(truncate = False)

+-------+---------+-----+
|id     |name     |org  |
+-------+---------+-----+
|1882405|Elon Musk|Tesla|
+-------+---------+-----+



Update the title of a paper with a particular id 

In [None]:
from pyspark.sql.functions import col, when

new_value = "New_Title"
new_paper_df = paper_df.withColumn(
    "title",
    when(
        col("id") == 101421652,
        new_value
    )
)

new_paper_df.filter(new_paper_df.id == 101421652).show()

+---------+---------+----+----------+----------+--------+--------+--------------------+------+-----+----------------+--------------------+--------------------+--------+--------------------+--------------------+
|       id|    title|year|n_citation|page_start|page_end|doc_type|           publisher|volume|issue|             doi|          references|            abstract|venue_id|          authors_id|              fos_id|
+---------+---------+----+----------+----------+--------+--------+--------------------+------+-----+----------------+--------------------+--------------------+--------+--------------------+--------------------+
|101421652|New_Title|2003|       139|       397|     423| Journal|Society for Infor...|    27|    3|10.2307/30036539|[1516261653,  197...|Managers in moder...|57293258|[1973614237,  211...|[101369,  102929,...|
+---------+---------+----+----------+----------+--------+--------+--------------------+------+-----+----------------+--------------------+------------------

Update the title of all the papers with more than 100 pages

In [None]:
new_value = "New_Title"
new_paper_df = paper_df.withColumn(
    "title",
    when(
        col("page_end") - col("page_start") > 100,
        new_value
    )
)

new_paper_df.filter(new_paper_df.title == new_value).show()

+----------+---------+----+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+----------+--------------------+--------------------+
|        id|    title|year|n_citation|page_start|page_end|doc_type|           publisher|volume|issue|                 doi|          references|            abstract|  venue_id|          authors_id|              fos_id|
+----------+---------+----+----------+----------+--------+--------+--------------------+------+-----+--------------------+--------------------+--------------------+----------+--------------------+--------------------+
|1595507819|New_Title|1998|       599|         7|     149| Journal|           Routledge|    15|    3|10.1080/07421222....|        [1972098965]|By mid-1998, appr...|   9954729|[2077262051,  166...|[100963,  103181,...|
|1938444229|New_Title|2015|       135|         1|     230| Journal|      now Publishers|     8|    1|  10.1561/2200000048|[19993

# Queries

In [None]:
from pyspark.sql.functions import col, explode, desc, asc, count, collect_list

1 - WHERE, JOIN

> Retrieve the papers which have at least Computer Science among their fos.

In [None]:
paper_df.select(col("id"), col("title"), col("year"), col("n_citation"), col("publisher"), col("references"), col("abstract"), col("venue_id"), col("authors_id"), explode(col("fos_id"))) \
.withColumnRenamed("col", "fos_id") \
.withColumnRenamed("id", "paper_id") \
.join(fos_df, fos_df.id == col("fos_id"), "fullouter").filter(col("name") == "Computer science") \
.select(col("paper_id"), col("title"), col("year"), col("n_citation"), col("publisher"), col("references"), col("venue_id"), col("authors_id"), col("fos_id")) \
.show()

+----------+--------------------+----+----------+--------------------+--------------------+----------+--------------------+------+
|  paper_id|               title|year|n_citation|           publisher|          references|  venue_id|          authors_id|fos_id|
+----------+--------------------+----+----------+--------------------+--------------------+----------+--------------------+------+
|  86453134|IDDQ testing: A r...|1992|       169|         Springer US|                  []| 200807567|[2112919840,  214...|100963|
| 767067438|A review of the l...|2016|       142|            Elsevier|[1982564000,  198...| 205292342|        [2140778852]|100963|
|  69854901|Survey A survey o...|2012|       160|Elsevier North-Ho...|[2003992171,  200...|  63392143|[2119852781,  318...|100963|
| 639708223|Faster R-CNN: Tow...|2017|      2586|                IEEE|[1958328135,  203...| 199944782|[2119543935,  216...|100963|
| 433644524|Review: Intrusion...|2013|       271| Academic Press Ltd.|[2007087405, 

2 - WHERE, LIMIT, LIKE

> Retrieve papers that have more than 400 citations and of any IEEE publisher (i.e. the publisher must be "IEEE", or "IEEE" concatenated with other strings, for example "IEEE TRANSACTIONS ON INDUSTRIAL INFORMATICS"). Then take only the first 15 results.



In [None]:
paper_df.filter((col("n_citation") >= 400) & (col("publisher").like("IEEE%"))) \
.limit(15) \
.select(col("id"), col("title"), col("n_citation"), col("publisher"), col("abstract")) \
.show()

+----------+--------------------+----------+--------------------+--------------------+
|        id|               title|n_citation|           publisher|            abstract|
+----------+--------------------+----------+--------------------+--------------------+
|1510186039|Graspit! A versat...|       521|                IEEE|A robotic graspin...|
|1612997784|ORB-SLAM: A Versa...|       619|                IEEE|This paper presen...|
|1641498739|The Multimodal Br...|       417|                IEEE|In this paper we ...|
|1885185971|Image Super-Resol...|       641|                IEEE|We propose a deep...|
|1910657905|SegNet: A Deep Co...|       551|                IEEE|We present a nove...|
|1963932623|Label Consistent ...|       469|                IEEE|A label consisten...|
|1964402820|Regular and irreg...|       992|                IEEE|We propose a gene...|
|1964793896|OFDM Versus Filte...|       598|                IEEE|As of today, orth...|
|1965497192|Network-Induced C...|       500

3 - WHERE, IN, Nested Query

> Retrieve the papers of conferences since 2003.



In [None]:
paper_df.filter((col("id") \
  .isin(paper_df.filter(col("doc_type") == "Conference") \
  .select(collect_list("id")).collect()[0][0])) & (col("year") > 2003)) \
.select(col("id"), col("title"), col("year"), col("doc_type"), col("abstract"), col("venue_id"), col("authors_id"), col("fos_id")) \
.show(10)

+----------+--------------------+----+----------+--------------------+----------+--------------------+--------------------+
|        id|               title|year|  doc_type|            abstract|  venue_id|          authors_id|              fos_id|
+----------+--------------------+----+----------+--------------------+----------+--------------------+--------------------+
|1485941238|Succinct suffix a...|2005|Conference|A succinet full-t...|1192804102|[2026327678,  213...|[103247,  101525,...|
|1515520450|Data integration ...|2004|Conference|Data integration ...|1134069326|[2781243587,  198...|[101294,  102665,...|
|1516360493|Global Progress i...|2008|Conference|A multiparty sess...|1145706541|[2035671728,  214...|[100963,  101552,...|
|1537747159|Data Clustering: ...|2008|Conference|The practice of c...|2755314191|        [2162010601]|[100763,  105371,...|
|1580065766|Symmetry in 3D Ge...|2013|Conference|The concept of sy...|2754362256|[2136233650,  212...|[100970,  100270,...|
|1594489

4 - GROUP BY, 1 JOIN, AS

> Compute how many papers were published, for each year, on each venue, showing also the venue name. Display results ordered by year in descendent order.

In [None]:
paper_df.join(venue_df, venue_df.id == paper_df.venue_id) \
.groupBy("year", "venue_id", "name") \
.agg(count("name").alias("n_publications")) \
.orderBy(desc("year")) \
.show(20, truncate = False)

+----+----------+--------------------------------------------------------------+--------------+
|year|venue_id  |name                                                          |n_publications|
+----+----------+--------------------------------------------------------------+--------------+
|2018|199944782 |IEEE Transactions on Pattern Analysis and Machine Intelligence|1             |
|2018|160107561 |Siam Review                                                   |1             |
|2017|61310614  |IEEE Transactions on Information Forensics and Security       |1             |
|2017|134177497 |IEEE Transactions on Fuzzy Systems                            |1             |
|2017|76152103  |IEEE Transactions on Systems, Man, and Cybernetics            |2             |
|2017|25538012  |International Journal of Computer Vision                      |1             |
|2017|7560371   |Information Fusion                                            |1             |
|2017|193920097 |Mathematical Programmin

5 - WHERE, GROUP BY

> Compute how many papers were published, for each year, by the IEEE and with a number of citations greater than 500.

In [None]:
paper_df.filter((col("publisher") == "IEEE") & (col("n_citation") > 500)) \
.groupBy("year", "doc_type") \
.agg(count("year").alias("n_years")) \
.orderBy(desc("year")) \
.show(20, truncate = False)

+----+----------+-------+
|year|doc_type  |n_years|
+----+----------+-------+
|2018|Journal   |1      |
|2017|Journal   |2      |
|2016|Journal   |2      |
|2015|Journal   |5      |
|2014|Journal   |14     |
|2013|Journal   |12     |
|2012|Conference|2      |
|2012|Journal   |11     |
|2011|Journal   |23     |
|2011|Conference|1      |
|2010|Journal   |18     |
|2010|Conference|1      |
|2009|Conference|2      |
|2009|Journal   |16     |
|2008|Journal   |14     |
|2008|Conference|2      |
|2007|Journal   |18     |
|2007|Conference|3      |
|2006|Journal   |14     |
|2005|Journal   |14     |
+----+----------+-------+
only showing top 20 rows



6 - GROUP BY, HAVING, AS

> Retrieve the publishers that have more than 100 papers stored in the database.





In [None]:
paper_df.groupBy("publisher") \
.agg(count("id").alias("n_papers")) \
.filter(col("n_papers")>=100) \
.show(truncate = False)

+-----------------------------------------------------------------------------------------+--------+
|publisher                                                                                |n_papers|
+-----------------------------------------------------------------------------------------+--------+
|Society for Information Management and The Management Information Systems Research Center|105     |
|Elsevier                                                                                 |565     |
|Elsevier Science Inc.                                                                    |193     |
|Elsevier Science Publishers Ltd.                                                         |108     |
|ACM                                                                                      |1104    |
|Kluwer Academic Publishers                                                               |277     |
|Springer US                                                                              |

7 - WHERE, GROUP BY, HAVING, AS

> Retrieve the years in which the number of published paper of journals is greater than 150.

In [None]:
paper_df.filter(col("doc_type") == "Journal") \
.groupBy("year") \
.agg(count("id").alias("n_papers")) \
.orderBy(asc("year")) \
.filter(col("n_papers") > 150) \
.show(truncate = False)

+----+--------+
|year|n_papers|
+----+--------+
|1998|158     |
|2000|187     |
|2001|181     |
|2002|231     |
|2003|309     |
|2004|296     |
|2005|421     |
|2006|422     |
|2007|464     |
|2008|485     |
|2009|505     |
|2010|559     |
|2011|574     |
|2012|478     |
|2013|466     |
|2014|323     |
|2015|188     |
+----+--------+



8 - WHERE, Nested Query (i.e., 2-step Queries), GROUP BY

> Compute the total number of citations, for each publisher, of the papers that have a field of study with weight grater than 0.6



In [None]:
paper_df.select(col("id"), col("title"), col("year"), col("n_citation"), col("doi"), col("publisher"), explode(paper_df.fos_id)) \
.withColumnRenamed("col", "fos") \
.filter(col("fos") \
  .isin(fos_df.filter(col("weight") > 0.6) \
  .select(collect_list("id")) \
  .collect()[0][0])) \
.groupBy(col("publisher")) \
.sum("n_citation") \
.withColumnRenamed("sum(n_citation)", "total_citations") \
.show(20, truncate = False)


+------------------------------------------------------------------------------------+---------------+
|publisher                                                                           |total_citations|
+------------------------------------------------------------------------------------+---------------+
|Cambridge University Press                                                          |374            |
|捷頂科技有限公司                                                                    |112            |
|Wiley-Blackwell                                                                     |478            |
|John Wiley & Sons, Ltd.                                                             |345            |
|Elsevier                                                                            |2339           |
|Kluwer Academic Publishers-Plenum Publishers                                        |463            |
|Decision Support Systems                                                        

9 - WHERE, GROUP BY, HAVING, 1 JOIN

> Retrieve the authors whose organization is the Stanford University and that wrote a number of papers greater than 1.

In [None]:
paper_df.select(col("id"), col("title"), col("year"), col("n_citation"), col("page_start"), col("page_end"), col("doc_type"), col("publisher"), col("volume"), col("issue"), col("doi"), col("references"), col("abstract"), col("venue_id"), explode(col("authors_id")), col("fos_id")) \
.withColumnRenamed("col", "authors_id") \
.withColumnRenamed("id", "paper_id") \
.join(author_df, author_df.id == col("authors_id"), "full") \
.filter(col("org") == "Stanford University") \
.groupBy("authors_id") \
.agg(count("paper_id").alias("n_papers")) \
.orderBy(asc("n_papers")) \
.filter(col("n_papers") > 1) \
.show(truncate = False)

+----------+--------+
|authors_id|n_papers|
+----------+--------+
|2149433985|2       |
|201828038 |4       |
|348630313 |4       |
+----------+--------+



10 - WHERE, GROUP BY, HAVING, 2 JOINs

> Retrieve the venues on which an organization has published more than 4 papers, that consist of more than 10 pages.

In [None]:
paper_df.filter((col("page_end") - col("page_start") > 10)) \
.select(col("id").alias("paper_id"), col("venue_id"), col("volume"), explode(paper_df.authors_id).alias("author_id")) \
.join(author_df, author_df.id == col("author_id")).drop("id").withColumnRenamed("name", "author_name") \
.join(venue_df, venue_df.id == paper_df.venue_id).drop("id").withColumnRenamed("name", "venue_name") \
.groupBy(col("venue_id"), col("venue_name"), col("org")) \
.agg(count("paper_id").alias("n_papers")) \
.filter((col("n_papers") >= 4) & (col("org").isNotNull())) \
.sort(desc("n_papers")) \
.show()

+----------+--------------------+--------------------+--------+
|  venue_id|          venue_name|                 org|n_papers|
+----------+--------------------+--------------------+--------+
| 134177497|IEEE Transactions...|Space Control & I...|       9|
| 134177497|IEEE Transactions...|Department of Mat...|       7|
|1127352206|programming langu...|Microsoft Researc...|       5|
| 157921468|ACM Computing Sur...|Columbia Univ., N...|       5|
|1152462849|acm special inter...|University of Ill...|       5|
| 168680287|IEEE Transactions...|Technion - Israel...|       4|
|  57293258|Management Inform...|Fox School of Bus...|       4|
|  90119964|ACM Transactions ...|Univ. of Maryland...|       4|
|  25538012|International Jou...|Stanford Universi...|       4|
|  63459445|IEEE Transactions...|Dept. of Electr. ...|       4|
| 199944782|IEEE Transactions...|Nat. Lab. of Patt...|       4|
|    414566| Pattern Recognition|Department of Com...|       4|
|  93787993|IEEE Transactions...|Departm