<a href="https://colab.research.google.com/github/amien1410/colab-notebooks/blob/main/Colab_Pyspark_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install pyspark
!pip install --upgrade pyspark

# initiate pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

# download sample
!wget -O Mall_Customers.csv "https://raw.githubusercontent.com/Shahadat9886/Mall_Customers.csv/refs/heads/main/Thu%20H%C3%A0%20Tr%E1%BA%A7n%20-%20Mall_Customers.csv"

# load csv
df = spark.read.csv("/content/Mall_Customers.csv", header=True, inferSchema=True)
df

In [None]:
# use spark.sql
df2 = spark.sql("select 'spark' as hello ")
df2.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [None]:
!wget -O movies.csv "https://raw.githubusercontent.com/Apress/applied-data-science-using-pyspark/refs/heads/main/Ch02/Chapter2_Data/movie_data_part1.csv"
df = spark.read.csv("movies.csv", header=True, sep='|', inferSchema=False)
df.show(10)

In [None]:
# Importing the SparkContext module from PySpark
from pyspark import SparkContext

# Creating a SparkContext object
sc = SparkContext.getOrCreate()

# Importing the random module
import random
NUM_SAMPLES = 100000000 # Define the samples

# Function to check if a point lies inside
def inside(p):
  x, y = random.random(), random.random()
  return x*x + y*y < 1

# parallelize the computation
count = sc.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()

# calculate the estimate value of pi
pi = 4 * count / NUM_SAMPLES
print("Pi is roughly", pi)

Pi is roughly 3.14192564


In [None]:
df.printSchema()

root
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)



In [None]:
df.dtypes

[('belongs_to_collection', 'string'),
 ('budget', 'string'),
 ('id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('vote_average', 'string')]

In [None]:
print('The total number of records in the movie dataset is '+str(df.count()))

The total number of records in the movie dataset is 43998


In [None]:
#@title Subset Columns

# Defining a list to subset the required columns
select_columns=['id', 'budget', 'popularity', 'release_date', 'revenue','title']

# Subsetting the required columns from the DataFrame
df=df.select(*select_columns)

# The following command displays the data; by default it shows top 20 rows
df.show()

+-----+-------+------------------+------------+-------+--------------------+
|   id| budget|        popularity|release_date|revenue|               title|
+-----+-------+------------------+------------+-------+--------------------+
|43000|      0|             2.503|  1962-05-23|      0|The Elusive Corporal|
|43001|      0|              5.51|  1962-11-12|      0|  Sundays and Cybele|
|43002|      0|              5.62|  1962-05-24|      0|Lonely Are the Brave|
|43003|      0|             7.159|  1975-03-12|      0|          F for Fake|
|43004| 500000|             3.988|  1962-10-09|      0|Long Day's Journe...|
|43006|      0|             3.194|  1962-03-09|      0|           My Geisha|
|43007|      0|             2.689|  1962-10-31|      0|Period of Adjustment|
|43008|      0|             6.537|  1959-03-13|      0|    The Hanging Tree|
|43010|      0|             4.297|  1962-01-01|      0|Sherlock Holmes a...|
|43011|      0|             4.417|  1962-01-01|      0|  Sodom and Gomorrah|

In [5]:
#@title Missing Values

from pyspark.sql.functions import *

df = spark.read.csv("movies.csv", header=True, sep='|', inferSchema=False)
df.filter((df['popularity']=='')|df['popularity'].isNull()|isnan(df['popularity'])).count()

215

In [None]:
df.select([count(when((col(c)=='') | col(c).isNull() |isnan(c), c)).alias(c)
for c in df.columns]).show()

+---------------------+------+---+-----------------+--------------+--------+----------+--------------------+--------------------+------------+-------+-------+------+-------+-----+------------+
|belongs_to_collection|budget| id|original_language|original_title|overview|popularity|production_companies|production_countries|release_date|revenue|runtime|status|tagline|title|vote_average|
+---------------------+------+---+-----------------+--------------+--------+----------+--------------------+--------------------+------------+-------+-------+------+-------+-----+------------+
|                38313|   125|125|              125|           125|     749|       215|                 215|                 283|         221|    215|    304|   304|  23967|  304|         304|
+---------------------+------+---+-----------------+--------------+--------+----------+--------------------+--------------------+------------+-------+-------+------+-------+-----+------------+



In [None]:
df.groupBy(df['title']).count().show()

+--------------------+-----+
|               title|count|
+--------------------+-----+
|   The Corn Is Green|    1|
|Meet The Browns -...|    1|
|Morenita, El Esca...|    1|
| Father Takes a Wife|    1|
|The Werewolf of W...|    1|
|My Wife Is a Gang...|    1|
|Depeche Mode: Tou...|    1|
|  A Woman Is a Woman|    1|
|History Is Made a...|    1|
|      Colombian Love|    1|
|        Ace Attorney|    1|
|     Not Like Others|    1|
|40 Guns to Apache...|    1|
|          Middle Men|    1|
|         It's a Gift|    1|
|    La Vie de Bohème|    1|
|Rasputin: The Mad...|    1|
|The Ballad of Jac...|    1|
|         How to Deal|    1|
|             Freaked|    1|
+--------------------+-----+
only showing top 20 rows



In [None]:
df.groupby(df['title']).count().sort(desc("count")).show(10, False)

+--------------------+-----+
|title               |count|
+--------------------+-----+
|NULL                |304  |
|Les Misérables      |8    |
|The Three Musketeers|8    |
|Cinderella          |8    |
|A Christmas Carol   |7    |
|The Island          |7    |
|Dracula             |7    |
|Hamlet              |7    |
|Frankenstein        |7    |
|Cleopatra           |6    |
+--------------------+-----+
only showing top 10 rows



In [None]:
# Subsetting and creating a temporary DataFrame to eliminate any missing values
df_temp = df.filter((df['title']!='')&(df['title'].isNotNull()) & (~isnan(df['title'])))

# Group by 'title', count occurrences, filter titles occurring more than four times, sort in descending order, and display top results
df_temp.groupby(df_temp['title']).count().filter("`count` >4").sort(col("count").desc()).show(10,False)

+--------------------+-----+
|title               |count|
+--------------------+-----+
|Les Misérables      |8    |
|The Three Musketeers|8    |
|Cinderella          |8    |
|A Christmas Carol   |7    |
|The Island          |7    |
|Frankenstein        |7    |
|Dracula             |7    |
|Hamlet              |7    |
|Treasure Island     |6    |
|The Lost World      |6    |
+--------------------+-----+
only showing top 10 rows



In [None]:
# The following command is to find the number of titles that are repeated four times or more
df_temp.groupby(df_temp['title']).count().filter("`count` >=4").sort(col("count").desc()).count()

111

In [None]:
# The following command is to delete any temporary DataFrames that we created in the process
del df_temp

In [None]:
#@title Casting Variables

#Casting
df = df.withColumn('budget',df['budget'].cast("float"))

#After Casting
df.dtypes

[('belongs_to_collection', 'string'),
 ('budget', 'float'),
 ('id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('vote_average', 'string')]

In [6]:
#@title Casting Multiple Columns

#Importing necessary libraries
from pyspark.sql.types import *

#Identifying and assigning lists of variables
int_vars=['id']
float_vars=['budget', 'popularity', 'revenue']
date_vars=['release_date']

#Converting integer variables
for column in int_vars:
  df = df.withColumn(column,df[column].cast(IntegerType()))

# Converting float variables
for column in float_vars:
  df=df.withColumn(column,df[column].cast(FloatType()))

# Converting float variables
for column in date_vars:
  df=df.withColumn(column,df[column].cast(DateType()))

df.dtypes

[('belongs_to_collection', 'string'),
 ('budget', 'float'),
 ('id', 'int'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'float'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'date'),
 ('revenue', 'float'),
 ('runtime', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('vote_average', 'string')]

In [None]:
df.show(10, False)

+---------------------+--------+-----+-----------------+------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------

In [None]:
# Descriptive Statics
df.describe().show()

+-------+---------------------+--------------------+------------------+-----------------+-----------------------------------+--------------------+-----------------+--------------------+--------------------+-------------------+------------------+--------+------------------+--------------------+------------------+
|summary|belongs_to_collection|              budget|                id|original_language|                     original_title|            overview|       popularity|production_companies|production_countries|            revenue|           runtime|  status|           tagline|               title|      vote_average|
+-------+---------------------+--------------------+------------------+-----------------+-----------------------------------+--------------------+-----------------+--------------------+--------------------+-------------------+------------------+--------+------------------+--------------------+------------------+
|  count|                 5685|               43873|      

In [None]:
#Since unknown values in budget are marked to be 0, let's filter out those values before calculating the median
df_temp = df.filter((df['budget']!=0)&(df['budget'].isNotNull()) & (~isnan(df['budget'])))

#Here the second parameter indicates the median value, which is 0.5; you can also try adjusting the value to calculate other percentiles
median = df.approxQuantile('budget',[0.5],0.1)

#Printing the Value
print ('The median of budget is '+str(median))

The median of budget is [0.0]


In [None]:
#@title Unique/Distinct Values and Counts

# Counts the distinct occurances of titles
df.agg(countDistinct(col("title")).alias("count")).show()

+-----+
|count|
+-----+
|41138|
+-----+



In [None]:
# Counts the distinct occurrences of titles
df.select('title').distinct().show(10,False)

+---------------------------------------------+
|title                                        |
+---------------------------------------------+
|The Corn Is Green                            |
|Meet The Browns - The Play                   |
|Morenita, El Escandalo                       |
|Father Takes a Wife                          |
|The Werewolf of Washington                   |
|My Wife Is a Gangster                        |
|Depeche Mode: Touring the Angel Live in Milan|
|A Woman Is a Woman                           |
|History Is Made at Night                     |
|Colombian Love                               |
+---------------------------------------------+
only showing top 10 rows



In [None]:
# Extracting year from the release date
df_temp = df.withColumn('release_year',year('release_date'))

# Extracting month
df_temp=df_temp.withColumn('release_month',month('release_date'))

# Extracting day of month
df_temp=df_temp.withColumn('release_day',dayofmonth('release_date'))

# Calculating the distinct counts by the year
df_temp.groupBy("release_year").agg(countDistinct("title")).show(10,False)

+------------+---------------------+
|release_year|count(DISTINCT title)|
+------------+---------------------+
|1959        |271                  |
|1990        |496                  |
|1975        |365                  |
|1977        |415                  |
|1924        |19                   |
|2003        |1199                 |
|2007        |1896                 |
|2018        |4                    |
|1974        |434                  |
|2015        |13                   |
+------------+---------------------+
only showing top 10 rows



In [None]:
#@title Filtering

df.filter(df['title'].like('Meet%')).show(10,False)

+--------------------------------------------------------------------------------------+---------+-----+-----------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-----------------------------------

In [None]:
# find the titles that do not end with an “s”
df.filter(~df['title'].like('%s')).show(10,False)

+---------------------+--------+-----+-----------------+------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------

In [7]:
# find any title that contains “ove”
df.filter(df['title'].rlike('\w*ove')).show(10,False)

+---------------------+------+-----+-----------------+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
df.filter(df.title.contains('ove')).show()

+---------------------+------+-----+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+------------+-------+--------+--------------------+--------------------+------------+
|belongs_to_collection|budget|   id|original_language|      original_title|            overview|popularity|production_companies|production_countries|release_date|     revenue|runtime|  status|             tagline|               title|vote_average|
+---------------------+------+-----+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+------------+-------+--------+--------------------+--------------------+------------+
|                 NULL|   0.0|43100|               it|Il generale Della...|The Gestapo force...|     7.252|[{'id': 22396, 'l...|[{'iso_3166_1': '...|  1959-10-07|         0.0|    132|Released|                NULL|General Della Rovere|         7.6|
|       

In [9]:
# identify or subset the columns by a particular prefix or suffix
df.select(df.colRegex("`re\w*`")).printSchema()

root
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)



In [10]:
# identify variables that end with a particular suffix
df.select(df.colRegex("`\w*e`")).printSchema()

root
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)
 |-- runtime: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)



In [12]:
#@title Creating New Columns

# Calculate the mean popularity
mean_pop = df.agg({'popularity': 'mean'}).collect()[0]['avg(popularity)']
count_obs = df.count()

df = df.withColumn('mean_popularity', lit(mean_pop))
df.show(10)

+---------------------+--------+-----+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+-------+-------+--------+--------------------+--------------------+------------+-----------------+
|belongs_to_collection|  budget|   id|original_language|      original_title|            overview|popularity|production_companies|production_countries|release_date|revenue|runtime|  status|             tagline|               title|vote_average|  mean_popularity|
+---------------------+--------+-----+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+-------+-------+--------+--------------------+--------------------+------------+-----------------+
|                 NULL|     0.0|43000|               fr|  Le Caporal épinglé|The story serves ...|     2.503|[{'id': 16059, 'l...|[{'iso_3166_1': '...|  1962-05-23|    0.0|     90|Released|                NULL|T

In [13]:
# Calculate the variance of the 'popularity' column
df = df.withColumn('varaiance',pow((df['popularity']-df['mean_popularity']),2))
variance_sum = df.agg({'varaiance': 'sum'}).collect()[0]['sum(varaiance)']

variance_population = variance_sum/(count_obs-1)
variance_population

37.85868805766293

In [14]:
# create user-defined function
def new_cols(budget,popularity):
  if budget<10000000: budget_cat='Small'
  elif budget<100000000: budget_cat='Medium'
  else: budget_cat='Big'
  if popularity<3: ratings='Low'
  elif popularity<5: ratings='Mid'
  else: ratings='High'
  return budget_cat,ratings

In [16]:
# Apply the user-defined function on the DataFrame
udfB = udf(new_cols,StructType([StructField("budget_cat", StringType(), True),StructField("ratings", StringType(), True)]))

# pass an user-defined function with two input columns: budget and popularity
temp_df=df.select('id','budget','popularity').withColumn("newcat",udfB("budget","popularity"))

# Unbundle the struct type columns into individual columns and drop the struct type
df_with_newcols = temp_df.select('id', 'budget','popularity', 'newcat').withColumn('budget_cat', temp_df.newcat.getItem('budget_cat')).withColumn('ratings', temp_df.newcat.getItem('ratings')).drop('newcat')
df_with_newcols.show(15, False)

+-----+---------+----------+----------+-------+
|id   |budget   |popularity|budget_cat|ratings|
+-----+---------+----------+----------+-------+
|43000|0.0      |2.503     |Small     |Low    |
|43001|0.0      |5.51      |Small     |High   |
|43002|0.0      |5.62      |Small     |High   |
|43003|0.0      |7.159     |Small     |High   |
|43004|500000.0 |3.988     |Small     |Mid    |
|43006|0.0      |3.194     |Small     |Mid    |
|43007|0.0      |2.689     |Small     |Low    |
|43008|0.0      |6.537     |Small     |High   |
|43010|0.0      |4.297     |Small     |Mid    |
|43011|0.0      |4.417     |Small     |Mid    |
|43012|7000000.0|4.722     |Small     |Mid    |
|43013|0.0      |2.543     |Small     |Low    |
|43014|0.0      |4.303     |Small     |Mid    |
|43015|0.0      |3.493     |Small     |Mid    |
|43016|0.0      |2.851     |Small     |Low    |
+-----+---------+----------+----------+-------+
only showing top 15 rows



In [20]:
# using when function
df_with_newcols = df.select('id', 'budget','popularity').\
withColumn('budget_cat', when(df['budget']<10000000,'Small').when(df['budget']<100000000,'Medium').otherwise('Big')).\
withColumn('ratings', when(df['popularity']<3,'Low').
when(df['popularity']<5,'Mid').otherwise('High'))

df_with_newcols.show(15, False)

+-----+---------+----------+----------+-------+
|id   |budget   |popularity|budget_cat|ratings|
+-----+---------+----------+----------+-------+
|43000|0.0      |2.503     |Small     |Low    |
|43001|0.0      |5.51      |Small     |High   |
|43002|0.0      |5.62      |Small     |High   |
|43003|0.0      |7.159     |Small     |High   |
|43004|500000.0 |3.988     |Small     |Mid    |
|43006|0.0      |3.194     |Small     |Mid    |
|43007|0.0      |2.689     |Small     |Low    |
|43008|0.0      |6.537     |Small     |High   |
|43010|0.0      |4.297     |Small     |Mid    |
|43011|0.0      |4.417     |Small     |Mid    |
|43012|7000000.0|4.722     |Small     |Mid    |
|43013|0.0      |2.543     |Small     |Low    |
|43014|0.0      |4.303     |Small     |Mid    |
|43015|0.0      |3.493     |Small     |Mid    |
|43016|0.0      |2.851     |Small     |Low    |
+-----+---------+----------+----------+-------+
only showing top 15 rows



In [21]:
#@title Delete and Rename Columns

# drop a column or columns using the drop function.
columns_to_drop = ['budget_cat']
df_with_newcols = df_with_newcols.drop(*columns_to_drop)

df_with_newcols.printSchema()

root
 |-- id: integer (nullable = true)
 |-- budget: float (nullable = true)
 |-- popularity: float (nullable = true)
 |-- ratings: string (nullable = false)



In [22]:
# Rename columns
df_with_newcols = df_with_newcols.withColumnRenamed('id','film_id').withColumnRenamed('ratings','film_ratings')
df_with_newcols.printSchema()

root
 |-- film_id: integer (nullable = true)
 |-- budget: float (nullable = true)
 |-- popularity: float (nullable = true)
 |-- film_ratings: string (nullable = false)



In [23]:
# change multiple columns name

# You can define all the variable changes in the list
new_names = [('budget','film_budget'),('popularity','film_popularity')]

# Applying the alias function
df_with_newcols_renamed = df_with_newcols.select(list(map(lambda old,new:col(old).alias(new),*zip(*new_names))))
df_with_newcols_renamed.printSchema()

root
 |-- film_budget: float (nullable = true)
 |-- film_popularity: float (nullable = true)



In [2]:
#Pandas on spark dataframe
import pyspark.pandas as ps
df_pd_distributed = ps.read_csv("movies.csv",sep="|")
df_pd_distributed[['budget','original_title','popularity']].head()



PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.