In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=61a2343ad7935f50d9ac422dfac4dc36f1afc41a6304098acb235df03858344f
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
# NOTE(review): presumably this relies on Google Drive being mounted at ./drive (Colab);
# the mount step is not shown in this notebook — confirm before a fresh run.
filepath = "drive/MyDrive/adv_analytics/all_blocks.csv"

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Get the active SparkSession, or create one, under the application name 'Adv_Analytics'.
spark = (
    SparkSession.builder
    .appName('Adv_Analytics')
    .getOrCreate()
)

In [11]:
# First look at the raw file: read the CSV without passing any reader options.
prev = spark.read.csv(filepath)

In [12]:
# Echo the DataFrame to see its columns: with no options set, every column is read as a
# string and the columns get the default names _c0 ... _c11.
prev

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string]

In [14]:
# Print the first 3 rows: the header line shows up as an ordinary data row and
# missing values appear as literal "?" strings.
prev.show(3)

+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
|  _c0|  _c1|              _c2|         _c3|         _c4|         _c5|    _c6|   _c7|   _c8|   _c9|   _c10|    _c11|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|     cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
|37291|53113|0.833333333333333|           ?|         1.0|           ?|      1|     1|     1|     1|      0|    True|
|39086|47614|                1|           ?|         1.0|           ?|      1|     1|     1|     1|      1|    True|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
only showing top 3 rows



In [None]:
# Schema inference with missing values set to null -> the column names are taken from the header row, the column types are inferred, and the "?" strings are replaced by null values

In [15]:
# Re-read the file, this time treating the first line as the header, mapping "?" to null,
# and letting Spark infer each column's type.
parsed = (
    spark.read
    .option("header", "true")
    .option("nullValue", "?")
    .option("inferSchema", "true")
    .csv(filepath)
)

In [16]:
# Print the schema of the parsed DataFrame to check the column names and inferred types.
parsed.printSchema()

root
 |-- id_1: integer (nullable = true)
 |-- id_2: integer (nullable = true)
 |-- cmp_fname_c1: double (nullable = true)
 |-- cmp_fname_c2: double (nullable = true)
 |-- cmp_lname_c1: double (nullable = true)
 |-- cmp_lname_c2: double (nullable = true)
 |-- cmp_sex: integer (nullable = true)
 |-- cmp_bd: integer (nullable = true)
 |-- cmp_bm: integer (nullable = true)
 |-- cmp_by: integer (nullable = true)
 |-- cmp_plz: integer (nullable = true)
 |-- is_match: boolean (nullable = true)



In [None]:
# Schema inference makes two passes over the data -> one pass to figure out the type of each column, and a second pass to do the actual parsing.
# If you know the schema that you want to use for a file ahead of time, you can create an instance of the pyspark.sql.types.StructType
# class and pass it to the Reader API via the schema function. This can have a significant performance benefit when the dataset is
# very large, since Spark will not need to perform an extra pass over the data to figure out the data type of each column.

In [17]:
#from pyspark.sql.types import *

In [18]:
#schema = StructType([StructField("id_1", IntegerType(), False),
#StructField("id_2", StringType(), False),
#StructField("cmp_fname_c1", DoubleType(), False)])

In [20]:
#parsed2 = spark.read.schema(schema).csv(filepath)

In [None]:
# DataFrames have a number of methods that enable us to read data from the cluster into the PySpark REPL on our client machine.

In [22]:
# Fetch the first record of the parsed DataFrame back to the client as a Row object.
parsed.first()

Row(id_1=37291, id_2=53113, cmp_fname_c1=0.833333333333333, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=0, is_match=True)

In [None]:
# Only when we know the dataset is small should we use toPandas or collect to bring the entire contents of a DataFrame back to the client (as a pandas DataFrame or a list of Row objects, respectively).

In [None]:
# Analyzing Data with the DataFrame API

In [23]:
# Count the records to get a sense of how large the dataset is.
parsed.count()

5749132

In [24]:
# Whenever we ask another question (i.e. run another computation), Spark repeats these same operations again and again, even if we
# have filtered the data down to a small number of records or are working with an aggregated version of the original dataset.
# This isn’t an optimal use of our compute resources. After the data has been parsed once, we’d like to save the data in its parsed
# form on the cluster so that we don’t have to reparse it every time.
parsed.cache()

DataFrame[id_1: int, id_2: int, cmp_fname_c1: double, cmp_fname_c2: double, cmp_lname_c1: double, cmp_lname_c2: double, cmp_sex: int, cmp_bd: int, cmp_bm: int, cmp_by: int, cmp_plz: int, is_match: boolean]

In [25]:
# We want to know the relative fraction of records that were matches versus non-matches:
# count the records per is_match value and list the larger group first.
from pyspark.sql.functions import col

(
    parsed
    .groupBy("is_match")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-------+
|is_match|  count|
+--------+-------+
|   false|5728201|
|    true|  20931|
+--------+-------+



In [None]:
# Beyond count, the DataFrame API's agg method can compute richer aggregations (sums, mins,
# maxes, means, standard deviations) when combined with the aggregation functions from
# pyspark.sql.functions. Here: the mean and standard deviation of the cmp_sex column.
from pyspark.sql.functions import avg, stddev

parsed.agg(
    avg("cmp_sex"),
    stddev("cmp_sex"),
).show()