<a href="https://colab.research.google.com/github/blessymoses/spark/blob/master/pySpark_read_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

## Set up PySpark

In [2]:
# download java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install Apache Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz

!tar xf spark-3.2.1-bin-hadoop2.7.tgz

!pip install -q findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop2.7"

!pip install -q findspark

import findspark

# locate Spark in the system
findspark.init()

# to know where Spark is installed
findspark.find()

'/content/spark-3.2.1-bin-hadoop2.7/python/pyspark'

## Create Spark Session

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build the notebook's Spark session, or reuse one that already exists:
# local master, application name "Colab", Spark UI pinned to port 4050.
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

In [5]:
# Bare expression: the notebook renders the session's repr as the cell output,
# a quick check that the session was created.
spark_session

In [7]:
# Load the sample CSV: the first row supplies the column names and Spark
# infers each column's type from the data instead of reading everything
# as strings.
raw_df = (
    spark_session.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("sample_data/sample.csv")
)

In [8]:
# Print the leading rows of the raw data as a text table (show() prints at
# most 20 rows by default).
raw_df.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+----------+-----------------------+---------------------+------------------+---------------+--------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+--

In [9]:
# Redistribute the rows of raw_df into exactly 2 partitions; raw_df itself is
# left untouched and the result is bound to a new name.
partitioned_raw_df = raw_df.repartition(2)

In [10]:
# Print the leading rows of the repartitioned data; row order may differ from
# raw_df because repartitioning shuffles rows between partitions.
partitioned_raw_df.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+----------+-----------------------+---------------------+------------------+---------------+--------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+--

In [11]:
# Number of respondents younger than 40, per country:
# filter on Age, narrow to the demographic columns, then group and count.
count_df = (
    partitioned_raw_df
    .filter("Age < 40")
    .select("Age", "Gender", "Country", "state")
    .groupBy("Country")
    .count()
)

DataFrame[Country: string, count: bigint]

In [12]:
# show() is an action: it runs the filter/select/groupBy/count pipeline
# defined for count_df and prints the per-country counts.
count_df.show()

+--------------+-----+
|       Country|count|
+--------------+-----+
| United States|    4|
|        Canada|    2|
|United Kingdom|    1|
+--------------+-----+



In [13]:
# Stop the session and its underlying SparkContext to release resources;
# DataFrames created from this session cannot be used afterwards.
spark_session.stop()