In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('instance').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 21:02:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql import Row
import datetime
import pandas as pd

users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "gender": "male",
        "current_city": "Dallas",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "gender": "male",
        "current_city": "Huston",
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "gender": "female",
        "current_city": "",
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "gender": "male",
        "current_city": "San Francisco",
        "phone_numbers": Row(mobile=None, home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "gender": "female",
        "current_city": None,
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]
users_df = spark.createDataFrame(pd.DataFrame(users));
users_df.show(truncate=False)
users_df.printSchema()

                                                                                

+---+----------+------------+-------------------------+------+-------------+----------------------------------+-------+-----------+-----------+-------------+-------------------+
|id |first_name|last_name   |email                    |gender|current_city |phone_numbers                     |courses|is_customer|amount_paid|customer_from|last_updated_ts    |
+---+----------+------------+-------------------------+------+-------------+----------------------------------+-------+-----------+-----------+-------------+-------------------+
|1  |Corrie    |Van den Oord|cvandenoord0@etsy.com    |male  |Dallas       |{+1 234 567 8901, +1 234 567 8911}|[1, 2] |true       |1000.55    |2021-01-15   |2021-02-10 01:15:00|
|2  |Nikolaus  |Brewitt     |nbrewitt1@dailymail.co.uk|male  |Huston       |{+1 234 567 8923, 1 234 567 8934} |[3]    |true       |900.0      |2021-02-14   |2021-02-18 03:33:00|
|3  |Orelie    |Penney      |openney2@vistaprint.com  |female|             |{+1 714 512 9752, +1 714 512 6601}

## Overview of sorting a Spark Data Frame
* sort data using ascending order by a specific column
* sort data using descending order by a specific column
* dealing with nulls while sorting the data (having the nulls values at the beginning or at the end
* sorting the data using multiple columns (coposite sorting), mixed - ascending on one column and descending on the other column, end vice versa.
* perform prioritetizing sorting, e.g. we want to have USA first and then the other by their respective names

In [8]:
from pyspark.sql.functions import col
# sort users data in ascending order by first name
users_df.sort('first_name').show()
users_df.sort(col('first_name')).show()
users_df.sort(users_df.first_name).show()
users_df.sort(users_df['first_name']).show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|  male|San Francisco|        {NULL, NULL}|     []|      false|        NaN|         NULL|2021-04-10 17:45:30|
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  5|      Kurt|        Rome|krome4@shutterfly...|female|         NULL|{+1 817 934 7142,...|     []|      false|        NaN|         NULL|2021-04-02 00:55:18|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma

In [9]:
users_df.sort(col('customer_from')).show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  5|      Kurt|        Rome|krome4@shutterfly...|female|         NULL|{+1 817 934 7142,...|     []|      false|        NaN|         NULL|2021-04-02 00:55:18|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|  male|San Francisco|        {NULL, NULL}|     []|      false|        NaN|         NULL|2021-04-10 17:45:30|
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  3|    Orelie|      Penney|openney2@vistapri

In [20]:
#sort data in ascending order by numer of enrolled courses
from pyspark.sql.functions import size

users_df.sort(size('courses')).show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|  male|San Francisco|        {NULL, NULL}|     []|      false|        NaN|         NULL|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome4@shutterfly...|female|         NULL|{+1 817 934 7142,...|     []|      false|        NaN|         NULL|2021-04-02 00:55:18|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|       Huston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  1|    Corrie|Van den Oord|cvandenoord0@etsy

## Sort Spark Data Frame by column in descending order

In [30]:
users_df.sort('first_name', ascending=False).show()
users_df.sort(users_df.first_name.desc()).show()

from pyspark.sql.functions import col
users_df.sort(col('first_name').desc()).show()

from pyspark.sql.functions import desc
users_df.sort(desc('first_name')).show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  3|    Orelie|      Penney|openney2@vistapri...|female|             |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|       Huston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  5|      Kurt|        Rome|krome4@shutterfly...|female|         NULL|{+1 817 934 7142,...|     []|      false|        NaN|         NULL|2021-04-02 00:55:18|
|  1|    Corrie|Van den Oord|cvandenoord0@etsy

In [24]:
users_df.sort(users_df['customer_from'].desc()).show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|       Huston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|             |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  4|     Ashby|    Maddocks|  amaddocks3@home

In [25]:
users_df.sort(desc(size('courses'))).show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  3|    Orelie|      Penney|openney2@vistapri...|female|             |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|       Huston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  4|     Ashby|    Maddocks|  amaddocks3@home

In [26]:
users_df.sort(desc(size('courses'))).show()
users_df. \
    withColumn('num_of_courses', size('courses')). \
    sort(col('num_of_courses').desc()). \
    show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|             |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|       Huston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  4|     Ashby|    Maddocks|  amaddocks3@home

## Dealing with NULL's while sorting Spark Data Frame

In [37]:
# by default: asc - nulls first, desc - nulls last

from pyspark.sql.functions import col, asc_nulls_last, desc_nulls_first

users_df.orderBy(col('customer_from')).show()
users_df.orderBy(col('customer_from').asc_nulls_last()).show()
users_df.orderBy(asc_nulls_last('customer_from')).show()
users_df.orderBy(desc_nulls_first('customer_from')).show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|  male|San Francisco|        {NULL, NULL}|     []|      false|        NaN|         NULL|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome4@shutterfly...|female|         NULL|{+1 817 934 7142,...|     []|      false|        NaN|         NULL|2021-04-02 00:55:18|
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  3|    Orelie|      Penney|openney2@vistapri

## Composite sorting of a Spark Data Frame

In [13]:
courses = [{'course_id': 1,
  'course_name': '2020 Complete Python Bootcamp: From Zero to Hero in Python',
  'suitable_for': 'Beginner',
  'enrollment': 1100093,
  'stars': 4.6,
  'number_of_ratings': 318066},
 {'course_id': 4,
  'course_name': 'Angular - The Complete Guide (2020 Edition)',
  'suitable_for': 'Intermediate',
  'enrollment': 422557,
  'stars': 4.6,
  'number_of_ratings': 129984},
 {'course_id': 12,
  'course_name': 'Automate the Boring Stuff with Python Programming',
  'suitable_for': 'Advanced',
  'enrollment': 692617,
  'stars': 4.6,
  'number_of_ratings': 70508},
 {'course_id': 10,
  'course_name': 'Complete C# Unity Game Developer 2D',
  'suitable_for': 'Advanced',
  'enrollment': 364934,
  'stars': 4.6,
  'number_of_ratings': 78989},
 {'course_id': 5,
  'course_name': 'Java Programming Masterclass for Software Developers',
  'suitable_for': 'Advanced',
  'enrollment': 502572,
  'stars': 4.6,
  'number_of_ratings': 123798},
 {'course_id': 15,
  'course_name': 'Learn Python Programming Masterclass',
  'suitable_for': 'Advanced',
  'enrollment': 240790,
  'stars': 4.5,
  'number_of_ratings': 58677},
 {'course_id': 3,
  'course_name': 'Machine Learning A-Z™: Hands-On Python & R In Data Science',
  'suitable_for': 'Intermediate',
  'enrollment': 692812,
  'stars': 4.5,
  'number_of_ratings': 132228},
 {'course_id': 14,
  'course_name': 'Modern React with Redux [2020 Update]',
  'suitable_for': 'Intermediate',
  'enrollment': 203214,
  'stars': 4.7,
  'number_of_ratings': 60835},
 {'course_id': 8,
  'course_name': 'Python for Data Science and Machine Learning Bootcamp',
  'suitable_for': 'Intermediate',
  'enrollment': 387789,
  'stars': 4.6,
  'number_of_ratings': 87403},
 {'course_id': 6,
  'course_name': 'React - The Complete Guide (incl Hooks, React Router, Redux)',
  'suitable_for': 'Intermediate',
  'enrollment': 304670,
  'stars': 4.6,
  'number_of_ratings': 90964},
 {'course_id': 18,
  'course_name': 'Selenium WebDriver with Java -Basics to Advanced+Frameworks',
  'suitable_for': 'Advanced',
  'enrollment': 148562,
  'stars': 4.6,
  'number_of_ratings': 49947},
 {'course_id': 21,
  'course_name': 'Spring & Hibernate for Beginners (includes Spring Boot)',
  'suitable_for': 'Advanced',
  'enrollment': 177053,
  'stars': 4.6,
  'number_of_ratings': 45329},
 {'course_id': 7,
  'course_name': 'The Complete 2020 Web Development Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 270656,
  'stars': 4.7,
  'number_of_ratings': 88098},
 {'course_id': 9,
  'course_name': 'The Complete JavaScript Course 2020: Build Real Projects!',
  'suitable_for': 'Intermediate',
  'enrollment': 347979,
  'stars': 4.6,
  'number_of_ratings': 83521},
 {'course_id': 16,
  'course_name': 'The Complete Node.js Developer Course (3rd Edition)',
  'suitable_for': 'Advanced',
  'enrollment': 202922,
  'stars': 4.7,
  'number_of_ratings': 50885},
 {'course_id': 13,
  'course_name': 'The Complete Web Developer Course 2.0',
  'suitable_for': 'Intermediate',
  'enrollment': 273598,
  'stars': 4.5,
  'number_of_ratings': 63175},
 {'course_id': 11,
  'course_name': 'The Data Science Course 2020: Complete Data Science Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 325047,
  'stars': 4.5,
  'number_of_ratings': 76907},
 {'course_id': 20,
  'course_name': 'The Ultimate MySQL Bootcamp: Go from SQL Beginner to Expert',
  'suitable_for': 'Beginner',
  'enrollment': 203366,
  'stars': 4.6,
  'number_of_ratings': 45382},
 {'course_id': 2,
  'course_name': 'The Web Developer Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 596726,
  'stars': 4.6,
  'number_of_ratings': 182997},
 {'course_id': 19,
  'course_name': 'Unreal Engine C++ Developer: Learn C++ and Make Video Games',
  'suitable_for': 'Advanced',
  'enrollment': 229005,
  'stars': 4.5,
  'number_of_ratings': 45860},
 {'course_id': 17,
  'course_name': 'iOS 13 & Swift 5 - The Complete iOS App Development Bootcamp',
  'suitable_for': 'Advanced',
  'enrollment': 179598,
  'stars': 4.8,
  'number_of_ratings': 49972
}]

courses_df = spark.createDataFrame(courses)
courses_df.show()
courses_df.printSchema()
courses_df.count()

+---------+--------------------+----------+-----------------+-----+------------+
|course_id|         course_name|enrollment|number_of_ratings|stars|suitable_for|
+---------+--------------------+----------+-----------------+-----+------------+
|        1|2020 Complete Pyt...|   1100093|           318066|  4.6|    Beginner|
|        4|Angular - The Com...|    422557|           129984|  4.6|Intermediate|
|       12|Automate the Bori...|    692617|            70508|  4.6|    Advanced|
|       10|Complete C# Unity...|    364934|            78989|  4.6|    Advanced|
|        5|Java Programming ...|    502572|           123798|  4.6|    Advanced|
|       15|Learn Python Prog...|    240790|            58677|  4.5|    Advanced|
|        3|Machine Learning ...|    692812|           132228|  4.5|Intermediate|
|       14|Modern React with...|    203214|            60835|  4.7|Intermediate|
|        8|Python for Data S...|    387789|            87403|  4.6|Intermediate|
|        6|React - The Compl

21

In [8]:
courses_df.sort('suitable_for', 'enrollment').show()
courses_df.sort(courses_df['suitable_for'], courses_df['enrollment']).show()
courses_df.sort(['suitable_for', 'enrollment']).show()

+---------+--------------------+----------+-----------------+-----+------------+
|course_id|         course_name|enrollment|number_of_ratings|stars|suitable_for|
+---------+--------------------+----------+-----------------+-----+------------+
|       18|Selenium WebDrive...|    148562|            49947|  4.6|    Advanced|
|       21|Spring & Hibernat...|    177053|            45329|  4.6|    Advanced|
|       17|iOS 13 & Swift 5 ...|    179598|            49972|  4.8|    Advanced|
|       16|The Complete Node...|    202922|            50885|  4.7|    Advanced|
|       19|Unreal Engine C++...|    229005|            45860|  4.5|    Advanced|
|       15|Learn Python Prog...|    240790|            58677|  4.5|    Advanced|
|       10|Complete C# Unity...|    364934|            78989|  4.6|    Advanced|
|        5|Java Programming ...|    502572|           123798|  4.6|    Advanced|
|       12|Automate the Bori...|    692617|            70508|  4.6|    Advanced|
|       20|The Ultimate MySQ

In [12]:
courses_df.sort(courses_df['suitable_for'], courses_df['number_of_ratings'].desc()).show()

from pyspark.sql.functions import desc
courses_df.sort('suitable_for', desc('number_of_ratings')).show()

courses_df.sort('suitable_for', 'number_of_ratings', ascending=[1, 0]).show()
courses_df.sort(['suitable_for', 'number_of_ratings'], ascending=[1, 0]).show()

+---------+--------------------+----------+-----------------+-----+------------+
|course_id|         course_name|enrollment|number_of_ratings|stars|suitable_for|
+---------+--------------------+----------+-----------------+-----+------------+
|        5|Java Programming ...|    502572|           123798|  4.6|    Advanced|
|       10|Complete C# Unity...|    364934|            78989|  4.6|    Advanced|
|       12|Automate the Bori...|    692617|            70508|  4.6|    Advanced|
|       15|Learn Python Prog...|    240790|            58677|  4.5|    Advanced|
|       16|The Complete Node...|    202922|            50885|  4.7|    Advanced|
|       17|iOS 13 & Swift 5 ...|    179598|            49972|  4.8|    Advanced|
|       18|Selenium WebDrive...|    148562|            49947|  4.6|    Advanced|
|       19|Unreal Engine C++...|    229005|            45860|  4.5|    Advanced|
|       21|Spring & Hibernat...|    177053|            45329|  4.6|    Advanced|
|        1|2020 Complete Pyt

In [25]:
from pyspark.sql.functions import col, when

course_level = when(col('suitable_for')=='Beginner', 0).otherwise(when(col('suitable_for')=='Intermediate', 1).otherwise(2))

courses_df. \
    sort(course_level, col('number_of_ratings').desc()). \
    show()

+---------+--------------------+----------+-----------------+-----+------------+
|course_id|         course_name|enrollment|number_of_ratings|stars|suitable_for|
+---------+--------------------+----------+-----------------+-----+------------+
|        1|2020 Complete Pyt...|   1100093|           318066|  4.6|    Beginner|
|        2|The Web Developer...|    596726|           182997|  4.6|    Beginner|
|        7|The Complete 2020...|    270656|            88098|  4.7|    Beginner|
|       11|The Data Science ...|    325047|            76907|  4.5|    Beginner|
|       20|The Ultimate MySQ...|    203366|            45382|  4.6|    Beginner|
|        3|Machine Learning ...|    692812|           132228|  4.5|Intermediate|
|        4|Angular - The Com...|    422557|           129984|  4.6|Intermediate|
|        6|React - The Compl...|    304670|            90964|  4.6|Intermediate|
|        8|Python for Data S...|    387789|            87403|  4.6|Intermediate|
|        9|The Complete Java

In [27]:
courses_df. \
    withColumn('course_level', when(col('suitable_for')=='Beginner', 0).
                               when(col('suitable_for')=='Intermediate', 1).
                               otherwise(2)). \
    sort('course_level', col('number_of_ratings').desc()). \
    show()

+---------+--------------------+----------+-----------------+-----+------------+------------+
|course_id|         course_name|enrollment|number_of_ratings|stars|suitable_for|course_level|
+---------+--------------------+----------+-----------------+-----+------------+------------+
|        1|2020 Complete Pyt...|   1100093|           318066|  4.6|    Beginner|           0|
|        2|The Web Developer...|    596726|           182997|  4.6|    Beginner|           0|
|        7|The Complete 2020...|    270656|            88098|  4.7|    Beginner|           0|
|       11|The Data Science ...|    325047|            76907|  4.5|    Beginner|           0|
|       20|The Ultimate MySQ...|    203366|            45382|  4.6|    Beginner|           0|
|        3|Machine Learning ...|    692812|           132228|  4.5|Intermediate|           1|
|        4|Angular - The Com...|    422557|           129984|  4.6|Intermediate|           1|
|        6|React - The Compl...|    304670|            90964

In [32]:
from pyspark.sql.functions import expr

column_level = expr("""
case when suitable_for='Beginner' then 0
     when suitable_for='Imtermediate' then 1
     else 2
end
""")
courses_df.sort(column_level, 'number_of_ratings', ascending=[1, 0]).show()

+---------+--------------------+----------+-----------------+-----+------------+
|course_id|         course_name|enrollment|number_of_ratings|stars|suitable_for|
+---------+--------------------+----------+-----------------+-----+------------+
|        1|2020 Complete Pyt...|   1100093|           318066|  4.6|    Beginner|
|        2|The Web Developer...|    596726|           182997|  4.6|    Beginner|
|        7|The Complete 2020...|    270656|            88098|  4.7|    Beginner|
|       11|The Data Science ...|    325047|            76907|  4.5|    Beginner|
|       20|The Ultimate MySQ...|    203366|            45382|  4.6|    Beginner|
|        3|Machine Learning ...|    692812|           132228|  4.5|Intermediate|
|        4|Angular - The Com...|    422557|           129984|  4.6|Intermediate|
|        5|Java Programming ...|    502572|           123798|  4.6|    Advanced|
|        6|React - The Compl...|    304670|            90964|  4.6|Intermediate|
|        8|Python for Data S