Part 1: Initialize PySpark

In [None]:
!pip3 install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=483f8c6f09e8451f47b05de9d6545105a7e5443b13a5ac478d07a5892ba609db
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
import pyspark
from pyspark.sql import SparkSession

# Create the session for this tutorial, or reuse one that already exists
# (getOrCreate). The master URL "local[*]" runs Spark on this machine.
spark = (
    SparkSession.builder
    .appName("Tutorial1_CN7030")
    .master("local[*]")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

Part 2: Create PySpark DataFrame with an explicit schema

In [None]:
# Build a small DataFrame whose column names and types are given explicitly
# through a DDL-style schema string instead of being inferred.
student_rows = [
    (1, 87.0, 'Mike', 'math', 'passed'),
    (2, 60.5, 'Mike', 'computing', 'failed'),
    (3, 20.8, 'Mina', 'networking', 'passed'),
    (4, 41.0, 'Emmy', 'math', 'failed'),
    (5, 39.0, 'Alex', 'computing', 'failed'),
    (6, 55.8, 'Alex', 'AI', 'passed'),
    (7, 74.0, 'Emmy', 'AI', 'passed'),
]
student_schema = 'ID int, Mark double, Name string, Lesson string, Status string'

df = spark.createDataFrame(student_rows, schema=student_schema)

# Print the rows as a text table.
df.show()

# Print each column's name, type and nullability.
df.printSchema()

+---+----+----+----------+------+
| ID|Mark|Name|    Lesson|Status|
+---+----+----+----------+------+
|  1|87.0|Mike|      math|passed|
|  2|60.5|Mike| computing|failed|
|  3|20.8|Mina|networking|passed|
|  4|41.0|Emmy|      math|failed|
|  5|39.0|Alex| computing|failed|
|  6|55.8|Alex|        AI|passed|
|  7|74.0|Emmy|        AI|passed|
+---+----+----+----------+------+

root
 |-- ID: integer (nullable = true)
 |-- Mark: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- Lesson: string (nullable = true)
 |-- Status: string (nullable = true)



In [None]:
# A single-row city record.
data = [[295, "South Bend", "Indiana",  101190, 112.9]]
# Column names only; this cell does not pass `columns` to Spark because the
# schema string below already names and types every column.
columns = ["rank", "city", "state",  "population", "price"]

city_schema = "rank LONG, city STRING, state STRING,  population LONG, price DOUBLE"
df1 = spark.createDataFrame(data, schema=city_schema)

# display() shows the DataFrame's repr (column names and types);
# show() prints the actual rows.
display(df1)
df1.show()

DataFrame[rank: bigint, city: string, state: string, population: bigint, price: double]

+----+----------+-------+----------+-----+
|rank|      city|  state|population|price|
+----+----------+-------+----------+-----+
| 295|South Bend|Indiana|    101190|112.9|
+----+----------+-------+----------+-----+



Part 3: Create PySpark DataFrame from Pandas DataFrame

In [None]:
import numpy as np
import pandas as pd

# Fixed seed so that Restart Kernel -> Run All regenerates the same table.
SEED = 42
rng = np.random.default_rng(SEED)

# 100,000 rows x 5 columns (A-E) of integers drawn from [0, 200).
df_pandas = pd.DataFrame(
    rng.integers(0, 200, size=(100000, 5)),
    columns=list('ABCDE'),
)

In [None]:
# Convert the pandas frame to a Spark DataFrame; column types are inferred
# from the pandas data.
df2 = spark.createDataFrame(data=df_pandas)

# Print the first 20 rows (show()'s default row limit).
df2.show(n=20)

# Print the inferred column names and types.
df2.printSchema()

+---+---+---+---+---+
|  A|  B|  C|  D|  E|
+---+---+---+---+---+
| 85| 60| 47|147| 56|
| 73|138|168| 26| 52|
|137|165|172|186| 44|
| 52| 72|198|131|139|
|113| 35|182|140| 69|
| 92|128|119| 88| 78|
|132| 89| 40|120|199|
|191|185| 64|113|139|
| 35| 42| 59|  4|179|
|101|194| 99| 14|157|
| 91| 48| 68|133|151|
| 76|101|118|146| 79|
|128| 77|  8|107|196|
| 49| 51|188| 52|111|
|191|136|150| 14|164|
| 63|170| 57| 69|147|
|132| 43|184| 67| 88|
|145|166|135| 71| 86|
|189| 64|150| 74| 95|
|105| 18| 48|134| 59|
+---+---+---+---+---+
only showing top 20 rows

root
 |-- A: long (nullable = true)
 |-- B: long (nullable = true)
 |-- C: long (nullable = true)
 |-- D: long (nullable = true)
 |-- E: long (nullable = true)



In [None]:
# Total number of rows in the Spark DataFrame built from pandas.
df2_row_count = df2.count()
print(df2_row_count)

100000


In [None]:
# Stack the rows of df underneath the rows of df1.
# union() matches columns by POSITION, not by name: the result keeps df1's
# column names and df's values are placed under them.
print(f"rows in df {df.count()}")
df.show()

print(f"rows in df1 {df1.count()}")
df1.show()

df3 = df1.union(df)
print("rows in df3", df3.count(), sep="\n")
df3.show()

# tail(n) returns the last n rows as a list of Row objects; leaving it as the
# final expression lets the notebook display that list.
print("showing the data frame from the bottom")
df3.tail(7)

rows in df 7
+---+----+----+----------+------+
| ID|Mark|Name|    Lesson|Status|
+---+----+----+----------+------+
|  1|87.0|Mike|      math|passed|
|  2|60.5|Mike| computing|failed|
|  3|20.8|Mina|networking|passed|
|  4|41.0|Emmy|      math|failed|
|  5|39.0|Alex| computing|failed|
|  6|55.8|Alex|        AI|passed|
|  7|74.0|Emmy|        AI|passed|
+---+----+----+----------+------+

rows in df1 1
+----+----------+-------+----------+-----+
|rank|      city|  state|population|price|
+----+----------+-------+----------+-----+
| 295|South Bend|Indiana|    101190|112.9|
+----+----------+-------+----------+-----+

rows in df3
8
+----+----------+-------+----------+------+
|rank|      city|  state|population| price|
+----+----------+-------+----------+------+
| 295|South Bend|Indiana|    101190| 112.9|
|   1|      87.0|   Mike|      math|passed|
|   2|      60.5|   Mike| computing|failed|
|   3|      20.8|   Mina|networking|passed|
|   4|      41.0|   Emmy|      math|failed|
|   5|      39.0

[Row(rank=1, city='87.0', state='Mike', population='math', price='passed'),
 Row(rank=2, city='60.5', state='Mike', population='computing', price='failed'),
 Row(rank=3, city='20.8', state='Mina', population='networking', price='passed'),
 Row(rank=4, city='41.0', state='Emmy', population='math', price='failed'),
 Row(rank=5, city='39.0', state='Alex', population='computing', price='failed'),
 Row(rank=6, city='55.8', state='Alex', population='AI', price='passed'),
 Row(rank=7, city='74.0', state='Emmy', population='AI', price='passed')]

How many partitions? Finding and changing the number of partitions

In [None]:
# Number of partitions df currently has.
Totpartition = df.rdd.getNumPartitions()

# Repartition df into 4 partitions (result bound to a new name).
newpartitiondf = df.repartition(4)

# Repartition by column name: split df1 into 4 partitions using its
# "city" column.
df4 = df1.repartition(4, "city")

# One count per line: df, then newpartitiondf, then df4.
print(
    Totpartition,
    newpartitiondf.rdd.getNumPartitions(),
    df4.rdd.getNumPartitions(),
    sep="\n",
)

# Repartition by multiple columns:
# df5 = df4.repartition("ColumnName1", "ColumnName2")


2
4
4


In [None]:
# Location of the FIFA dataset CSV.
# NOTE(review): "/content" is presumably the Colab working directory; adjust
# the path when running elsewhere.
myfile = "/content/CompleteDataset.csv"

# Read the CSV, taking column names from the first line (header) and letting
# Spark infer each column's type (inferSchema).
df_fifa = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header=True)
    .load(myfile)
)

In [None]:
# Preview the first 20 rows; truncate=True shortens long cell values.
df_fifa.show(n=20, truncate=True)

# List every column with its inferred type.
df_fifa.printSchema()

+---+-----------------+---+--------------------+-----------+--------------------+-------+---------+-------------------+--------------------+------+-----+-------+------------+----------+-------+-------+------------+---------+--------+-----+---------+---------+------------------+---------+-----------+----------+--------------+-----------+----------------+-------------+-------+------------+----------+-------+---------+-----------+---------+-------------+----------+--------------+------------+-------+---------------+--------+------+-------+----+----+----+----+----+------+----+----+----+----+----+----+----+----+----+----+-------------------+----+----+----+----+----+----+----+----+----+----+----+
|_c0|             Name|Age|               Photo|Nationality|                Flag|Overall|Potential|               Club|           Club Logo| Value| Wage|Special|Acceleration|Aggression|Agility|Balance|Ball control|Composure|Crossing|Curve|Dribbling|Finishing|Free kick accuracy|GK diving|GK handling|

In [None]:
# Row count on the first line, column count on the second.
print(df_fifa.count(), len(df_fifa.columns), sep="\n")

17981
75


In [None]:
# Same row count as before, this time computed through the DataFrame's
# underlying RDD.
fifa_rdd = df_fifa.rdd
row_count = fifa_rdd.count()

print(f'The DataFrame has {row_count} rows.')

The DataFrame has 17981 rows.


In [None]:
# Rebind df_fifa to a copy split into 4 partitions, then display the
# resulting partition count as the cell's output.
df_fifa = df_fifa.repartition(numPartitions=4)
df_fifa.rdd.getNumPartitions()


4

In [None]:
# Keep only the players whose Age is strictly greater than 30,
# then preview the first 10 matching rows.
subset_Age = df_fifa.filter(df_fifa["Age"] > 30)
subset_Age.show(10)

+-----+-------------+---+--------------------+-------------+--------------------+-------+---------+--------------------+--------------------+-----+----+-------+------------+----------+-------+-------+------------+---------+--------+-----+---------+---------+------------------+---------+-----------+----------+--------------+-----------+----------------+-------------+-------+------------+----------+-------+---------+-----------+---------+-------------+----------+--------------+------------+-------+---------------+--------+------+-------+----+----+----+----+----+------+----+----+----+----+----+----+----+----+----+----+-------------------+----+----+----+----+----+----+----+----+----+----+----+
|  _c0|         Name|Age|               Photo|  Nationality|                Flag|Overall|Potential|                Club|           Club Logo|Value|Wage|Special|Acceleration|Aggression|Agility|Balance|Ball control|Composure|Crossing|Curve|Dribbling|Finishing|Free kick accuracy|GK diving|GK handling|GK

In [None]:
# Show up to 100 unique (Name, Club) combinations.
(
    df_fifa
    .select("Name", "Club")
    .distinct()
    .show(100)
)

+---------------+--------------------+
|           Name|                Club|
+---------------+--------------------+
|      C. Vargas|Atletico Nacional...|
|Simão Donatinho|Clube Atlético Pa...|
|   L. Coulibaly|          Angers SCO|
|      H. Osorio|Independiente San...|
|  M. Migliorini|            Avellino|
|    D. Petković|          FC Lorient|
|       O. Şahan|         Trabzonspor|
|      G. Donsah|             Bologna|
|      A. Sukhov|              FC Ufa|
|      A. Mawson|        Swansea City|
|      K. Kamara|New England Revol...|
|      P. McGinn|Partick Thistle F.C.|
|        I. Boye|           Örebro SK|
|         S. Old|           Morecambe|
|  M. Villasanti|           Temperley|
|       L. Nolan|  Accrington Stanley|
|     J. Joronen|          AC Horsens|
|    T. McCarron|          Finn Harps|
|       J. Terry|         Aston Villa|
|      F. Kessié|               Milan|
|     S. Ulreich|    FC Bayern Munich|
|      A. Stokes|           Hibernian|
|       G. Torje|Kardemir