# Analysis in PySpark

Source lesson (Codecademy, "Querying PySpark DataFrames"): https://www.codecademy.com/courses/big-data-pyspark/lessons/pyspark-sql-lesson/exercises/querying-pyspark-dataframes

In [10]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("learning_spark_sql")
    .getOrCreate()
)

# Load the Wikipedia unique-visitors CSV: the first row is the header,
# fields are comma-separated, and column types are inferred from the data
wiki_uniq_df = (
    spark.read
    .option('header', True)
    .option('delimiter', ',')
    .option('inferSchema', True)
    .csv("wiki_uniq_march_2022_w_site_type.csv")
)

## Filter data

In [11]:
# Preview the first five rows to check the columns and inferred values
wiki_uniq_df.show(n=5)

+------------------+-------------------+-----------------+-------------------+-------------+---------+
|            domain|uniq_human_visitors|uniq_bot_visitors|total_visitor_count|language_code|site_type|
+------------------+-------------------+-----------------+-------------------+-------------+---------+
|en.m.wikipedia.org|           33261399|          8400247|           41661646|           en|wikipedia|
|  en.wikipedia.org|           17009339|          4851741|           21861080|           en|wikipedia|
|es.m.wikipedia.org|            5668575|          1977289|            7645864|           es|wikipedia|
|ru.m.wikipedia.org|            5816762|          1367179|            7183941|           ru|wikipedia|
|ja.m.wikipedia.org|            5396108|          1325212|            6721320|           ja|wikipedia|
+------------------+-------------------+-----------------+-------------------+-------------+---------+
only showing top 5 rows



In [25]:
# Keep only the rows whose language_code is 'ar', then display them
ar_site_visitors = wiki_uniq_df.where(wiki_uniq_df['language_code'] == 'ar')
ar_site_visitors.show()

+--------------------+-------------------+-----------------+-------------------+-------------+-----------+
|              domain|uniq_human_visitors|uniq_bot_visitors|total_visitor_count|language_code|  site_type|
+--------------------+-------------------+-----------------+-------------------+-------------+-----------+
|  ar.m.wikipedia.org|            1644253|           750620|            2394873|           ar|  wikipedia|
|    ar.wikipedia.org|             212695|            97700|             310395|           ar|  wikipedia|
| ar.m.wikisource.org|              56124|            52885|             109009|           ar| wikisource|
|   ar.wikisource.org|               2134|             4355|               6489|           ar| wikisource|
|  ar.m.wikiquote.org|                776|             3511|               4287|           ar|  wikiquote|
|   ar.wiktionary.org|                262|             2335|               2597|           ar| wiktionary|
| ar.m.wiktionary.org|               

In [26]:
# Narrow the filtered rows down to the domain and human-visitor columns
ar_visitors_slim = ar_site_visitors.select('domain', 'uniq_human_visitors')
ar_visitors_slim.show()

+--------------------+-------------------+
|              domain|uniq_human_visitors|
+--------------------+-------------------+
|  ar.m.wikipedia.org|            1644253|
|    ar.wikipedia.org|             212695|
| ar.m.wikisource.org|              56124|
|   ar.wikisource.org|               2134|
|  ar.m.wikiquote.org|                776|
|   ar.wiktionary.org|                262|
| ar.m.wiktionary.org|                448|
|ar.m.wikiversity.org|                389|
|  ar.m.wikibooks.org|                378|
+--------------------+-------------------+



## Calculate the sum of uniq_human_visitors for each site_type (highest first)

In [33]:
# Total unique human visitors per site type, largest total first
top_visitors_site_type = (
    wiki_uniq_df
    .select('site_type', 'uniq_human_visitors')
    .groupBy('site_type')
    .sum('uniq_human_visitors')
    .orderBy('sum(uniq_human_visitors)', ascending=False)
)
top_visitors_site_type.show()

+-----------+------------------------+
|  site_type|sum(uniq_human_visitors)|
+-----------+------------------------+
|  wikipedia|               116527479|
| wiktionary|                  892193|
|  wikimedia|                  312995|
| wikisource|                  172179|
|   wikidata|                   69744|
|  wikibooks|                   54680|
|  wikiquote|                   38048|
| wikivoyage|                   14648|
|       wiki|                   13067|
|wikiversity|                   12548|
|   wikinews|                    5578|
|   wikitech|                     751|
+-----------+------------------------+

