<a href="https://colab.research.google.com/github/blessymoses/PySpark/blob/master/pySpark_3_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

## Setting up PySpark in Colab
Ref: https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/

- To find the Apache Spark download URL:
  - Choose the Spark release and package type at https://spark.apache.org/downloads.html
  - Click on the package link to get the download URL

In [2]:
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install Apache Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz

!tar xf spark-3.3.1-bin-hadoop3.tgz

# install findspark to locate Spark on the system and import it as a regular library
!pip install -q findspark

In [3]:
# Point the environment at the Java installation and the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop3",
    }
)

In [4]:
!pip install -q findspark

In [5]:
import findspark

# Locate the Spark installation so that pyspark can be imported in the cells below.
findspark.init()

# Show where Spark was found (the path is displayed as the cell output).
findspark.find()

'/content/spark-3.3.1-bin-hadoop3'

## Create Spark Session

In [6]:
from pyspark.sql import SparkSession

In [7]:
# Build the Spark session used by every example below; getOrCreate() returns
# an already-existing session instead of creating a second one.
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [8]:
# Display the session object to confirm it was created.
spark_session

## Create Single Column Spark DataFrame using a List

In [None]:
ages_list = [10, 13, 18, 34, 49, 56]

# A flat list of plain ints has no row structure, so createDataFrame cannot
# infer a schema and raises TypeError. The error is the point of this cell:
# catch it and print it so that "Restart & Run All" continues past this cell
# instead of stopping here.
try:
    spark_session.createDataFrame(ages_list)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: ignored

In [None]:
# Supplying the element type as a string makes the scalar list usable:
# the result has a single int column (named `value` in the output).
spark_session.createDataFrame(ages_list, "int")

DataFrame[value: int]

In [None]:
from pyspark.sql.types import IntegerType

# Same as the "int" schema string, but passing a type object instead.
spark_session.createDataFrame(ages_list, IntegerType())

DataFrame[value: int]

In [None]:
# A flat list of strings, one value per row.
names_list = ["a1", "b1", "c1", "d1", "e1"]

# As with ints, the element type is given explicitly ("string").
spark_session.createDataFrame(names_list, "string")

DataFrame[value: string]

In [None]:
from pyspark.sql.types import StringType

# Same as the "string" schema string, but passing a type object instead.
spark_session.createDataFrame(names_list, StringType())

DataFrame[value: string]

## Create Multi Column Spark DataFrame using a List

In [None]:
# One-element tuples: each tuple is one row with a single field.
# NOTE: this rebinds ages_list, which previously held plain ints.
ages_list = [(21, ), (23, ), (41, ), (32, )]

In [None]:
# With tuple rows no schema is needed: the column is auto-named `_1`
# and typed bigint (see output).
spark_session.createDataFrame(ages_list)

DataFrame[_1: bigint]

In [None]:
# A "name type" schema string sets both the column name and its type.
spark_session.createDataFrame(ages_list, "age int")

DataFrame[age: int]

In [None]:
# Two-field tuples: each tuple is one row with two columns.
users_list = [(1, "U1"), (2, "U2"), (3, "U3"), (4, "U4")]

In [None]:
# Without a schema the columns are auto-named `_1`, `_2` and their types are inferred.
spark_session.createDataFrame(users_list)

DataFrame[_1: bigint, _2: string]

In [None]:
# Name and type both columns explicitly; keep the result in `df` for the Row examples below.
df = spark_session.createDataFrame(users_list, "user_id int, first_name string")

## Row

In [None]:
# Print the DataFrame contents as a text table.
df.show()

+-------+----------+
|user_id|first_name|
+-------+----------+
|      1|        U1|
|      2|        U2|
|      3|        U3|
|      4|        U4|
+-------+----------+



In [None]:
# Fetch the DataFrame's rows into Python; each row comes back as a Row object.
df.collect()

[Row(user_id=1, first_name='U1'),
 Row(user_id=2, first_name='U2'),
 Row(user_id=3, first_name='U3'),
 Row(user_id=4, first_name='U4')]

In [None]:
# collect() returns a plain Python list (of the Row objects shown above).
type(df.collect())

list

In [None]:
from pyspark.sql import Row

In [None]:
# Row created with positional arguments only (no keyword field names).
r1 = Row("Uname", 11)

In [None]:
# Display the repr of the positionally-built Row.
r1

<Row('Uname', 11)>

In [None]:
# Row created with keyword arguments; the keywords become the field names.
r2 = Row(name="uname2", age=12)

In [None]:
# Display the repr of the keyword-built Row (field names are shown).
r2

Row(name='uname2', age=12)

In [None]:
# Fields of a keyword-built Row can be read as attributes...
r2.name

'uname2'

In [None]:
# ...or by field name using subscript access.
r2["name"]

'uname2'

In [None]:
# Rows given as lists instead of tuples.
# NOTE: this rebinds users_list, which previously held tuples.
users_list = [[1, "u1"], [2, "u2"], [3, "u3"]]

In [None]:
# List rows behave like tuple rows: auto-named columns `_1`, `_2` with inferred types.
spark_session.createDataFrame(users_list)

DataFrame[_1: bigint, _2: string]

In [None]:
# Explicit schema string: sets the column names and types.
spark_session.createDataFrame(users_list, "user_id int, user_name string")

DataFrame[user_id: int, user_name: string]

In [None]:
# Unpack each inner list into positional Row arguments.
users_rows = [Row(*user) for user in users_list]

In [None]:
# Display the list of positionally-built Row objects.
users_rows

[<Row(1, 'u1')>, <Row(2, 'u2')>, <Row(3, 'u3')>]

In [None]:
# Positionally-built Rows carry no column names, so the columns are again `_1`, `_2`.
spark_session.createDataFrame(users_rows)

DataFrame[_1: bigint, _2: string]

In [None]:
# Supply the column names and types through the schema string.
spark_session.createDataFrame(users_rows, "user_id int, user_name string")

DataFrame[user_id: int, user_name: string]

In [9]:
# Sample users, one (user_id, first_name) tuple per row.
users_list = [
    (1, "u1"),
    (2, "u2"),
    (3, "u3"),
    (4, "u4"),
]

In [10]:
# No schema given: columns are auto-named `_1`, `_2` and types are inferred (bigint, string).
spark_session.createDataFrame(users_list)

DataFrame[_1: bigint, _2: string]

In [11]:
# Schema string gives the columns their names and declares user_id as int instead of bigint.
spark_session.createDataFrame(users_list, "user_id int, first_name string")

DataFrame[user_id: int, first_name: string]

In [12]:
from pyspark.sql import Row

In [14]:
# Unpack each (user_id, first_name) tuple into positional Row arguments.
user_rows = [
    Row(*user_fields)
    for user_fields in users_list
]
user_rows

[<Row(1, 'u1')>, <Row(2, 'u2')>, <Row(3, 'u3')>, <Row(4, 'u4')>]

In [15]:
# Positionally-built Rows carry no column names, so the columns are auto-named `_1`, `_2`.
spark_session.createDataFrame(user_rows)

DataFrame[_1: bigint, _2: string]

In [16]:
# Supply the column names and types through the schema string.
spark_session.createDataFrame(user_rows, "user_id int, first_name string")

DataFrame[user_id: int, first_name: string]

In [17]:
# Sample users as dicts keyed by column name (user_id 1..4, first_name "u1".."u4").
# NOTE: this rebinds users_list, which previously held tuples.
users_list = [
    {"user_id": user_id, "first_name": f"u{user_id}"}
    for user_id in range(1, 5)
]

In [18]:
# Dict rows: column names come from the keys. Note that the output lists
# first_name before user_id, unlike the key order in the dicts.
spark_session.createDataFrame(users_list)

DataFrame[first_name: string, user_id: bigint]

In [19]:
# Unpack each dict into keyword Row arguments so the keys become the Row's field names.
user_rows = [
    Row(**user_fields)
    for user_fields in users_list
]
user_rows

[Row(user_id=1, first_name='u1'),
 Row(user_id=2, first_name='u2'),
 Row(user_id=3, first_name='u3'),
 Row(user_id=4, first_name='u4')]

In [20]:
# Keyword-built Rows supply the column names (user_id first, see output); types are still inferred.
spark_session.createDataFrame(user_rows)

DataFrame[user_id: bigint, first_name: string]

In [21]:
# Explicit schema string: declares user_id as int instead of the inferred bigint.
spark_session.createDataFrame(user_rows, "user_id int, first_name string")

DataFrame[user_id: int, first_name: string]

In [22]:
# Stop the Spark session now that the examples are finished.
spark_session.stop()