In [1]:
# importing pyspark
import pyspark


In [2]:
# importing pandas and numpy
import pandas as pd
import numpy as np


In [3]:
# importing the spark session
from pyspark.sql import SparkSession


In [4]:
# creating the seasion
spark = SparkSession.builder.appName("Udemy Courses").getOrCreate()

22/10/20 12:22:35 WARN Utils: Your hostname, HP-G62 resolves to a loopback address: 127.0.1.1; using 192.168.18.113 instead (on interface enp3s0)
22/10/20 12:22:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/20 12:22:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/20 12:22:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
import pyspark.sql.functions as F 
from pyspark.sql import Window
import pyspark.sql.types as T

In [6]:
# Read JSON file into dataframe
udemy_course_df = spark.read.json("udemy_courses.json")

                                                                                

In [7]:
#  printing schema of dataframe
udemy_course_df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- content_duration: string (nullable = true)
 |-- course_id: string (nullable = true)
 |-- course_title: string (nullable = true)
 |-- is_paid: string (nullable = true)
 |-- level: string (nullable = true)
 |-- num_lectures: string (nullable = true)
 |-- num_reviews: string (nullable = true)
 |-- num_subscribers: string (nullable = true)
 |-- price: string (nullable = true)
 |-- published_timestamp: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- url: string (nullable = true)



In [8]:
# showing the dataframe
udemy_course_df.show()

+---------------+------------------+---------+--------------------+-------+------------------+------------+-----------+---------------+-----+--------------------+----------------+--------------------+
|_corrupt_record|  content_duration|course_id|        course_title|is_paid|             level|num_lectures|num_reviews|num_subscribers|price| published_timestamp|         subject|                 url|
+---------------+------------------+---------+--------------------+-------+------------------+------------+-----------+---------------+-----+--------------------+----------------+--------------------+
|           null|               1.5|  1070968|Ultimate Investme...|   True|        All Levels|          51|         23|           2147|  200|2017-01-18T20:58:58Z|Business Finance|https://www.udemy...|
|           null|              39.0|  1113822|Complete GST Cour...|   True|        All Levels|         274|        923|           2792|   75|2017-03-09T16:34:20Z|Business Finance|https://www.udemy

In [9]:
course_df=udemy_course_df.select(["course_id","course_title","subject","level","content_duration",
                                  "num_lectures","num_subscribers","num_reviews","is_paid","price"])
course_df.show()


+---------+--------------------+----------------+------------------+------------------+------------+---------------+-----------+-------+-----+
|course_id|        course_title|         subject|             level|  content_duration|num_lectures|num_subscribers|num_reviews|is_paid|price|
+---------+--------------------+----------------+------------------+------------------+------------+---------------+-----------+-------+-----+
|  1070968|Ultimate Investme...|Business Finance|        All Levels|               1.5|          51|           2147|         23|   True|  200|
|  1113822|Complete GST Cour...|Business Finance|        All Levels|              39.0|         274|           2792|        923|   True|   75|
|  1006314|Financial Modelin...|Business Finance|Intermediate Level|               2.5|          51|           2174|         74|   True|   45|
|  1210588|Beginner to Pro -...|Business Finance|        All Levels|               3.0|          36|           2451|         11|   True|   95|

In [10]:
course_df.count()

3668

In [11]:
type(course_df)

pyspark.sql.dataframe.DataFrame

In [12]:
#  making windows spec for partition by subject
windowSpec=Window.partitionBy("subject").orderBy(F.asc("num_subscribers"))

# 1. What are the best free courses by subject?
win_course_df=course_df.withColumn("Course List",F.collect_list(F.col("course_title")).over(windowSpec))\
    .filter(F.col("is_paid")=="False")\
    .filter(F.col("num_subscribers")>50000)\
    .filter(F.col("num_reviews")>3000)

    
win_course_df.show()

[Stage 8:>                                                          (0 + 1) / 1]

+---------+--------------------+----------------+--------------+----------------+------------+---------------+-----------+-------+-----+--------------------+
|course_id|        course_title|         subject|         level|content_duration|num_lectures|num_subscribers|num_reviews|is_paid|price|         Course List|
+---------+--------------------+----------------+--------------+----------------+------------+---------------+-----------+-------+-----+--------------------+
|    48841|Accounting in 60 ...|Business Finance|Beginner Level|             1.5|          16|          56659|       4397|  False|    0|[Essentials of mo...|
|   173548|Build Your First ...| Web Development|Beginner Level|             3.0|          30|         120291|       5924|  False|    0|[The Ultimate Wor...|
|    41295|Learn HTML5 Progr...| Web Development|    All Levels|            10.5|          45|         268923|       8629|  False|    0|[The Ultimate Wor...|
|   314462|Quickstart AngularJS| Web Development|Beg

                                                                                

In [13]:
# 2. What are the most popular courses?
win_popular_course_df=course_df.withColumn("Popular Course",F.max(F.col("num_subscribers")).over(windowSpec))\
    .filter(F.col("num_subscribers")>50000)\
    .filter(F.col("num_reviews")>3000)

win_popular_course_df.show()

+---------+--------------------+-------------------+--------------+----------------+------------+---------------+-----------+-------+-----+--------------+
|course_id|        course_title|            subject|         level|content_duration|num_lectures|num_subscribers|num_reviews|is_paid|price|Popular Course|
+---------+--------------------+-------------------+--------------+----------------+------------+---------------+-----------+-------+-----+--------------+
|    48841|Accounting in 60 ...|   Business Finance|Beginner Level|             1.5|          16|          56659|       4397|  False|    0|         56659|
|   238934|Pianoforall - Inc...|Musical Instruments|    All Levels|            30.0|         362|          75499|       7676|   True|  200|         75499|
|   764164|The Complete Web ...|    Web Development|    All Levels|            30.5|         304|         114512|      22412|   True|  200|        114512|
|   173548|Build Your First ...|    Web Development|Beginner Level|   

In [14]:
# 3. List the courses that are specialized to “Business Finance” and find the average number of subscribers, 
# reviews, price and lectures on the subject.

win_special_course_df=course_df.withColumn("Course List",F.collect_list(F.col("course_title")).over(windowSpec))\
    .withColumn("average number of subscribers",F.avg(F.col("num_subscribers")).over(windowSpec))\
    .withColumn("average number of reviews",F.avg(F.col("num_reviews")).over(windowSpec))\
    .withColumn("average prices",F.avg(F.col("price")).over(windowSpec))\
    .withColumn("average lectures",F.avg(F.col("num_lectures")).over(windowSpec))\
    .filter(F.col("subject")=="Business Finance")

win_special_course_df.show()

[Stage 14:>                                                         (0 + 1) / 1]

+---------+--------------------+----------------+------------------+------------------+------------+---------------+-----------+-------+-----+--------------------+-----------------------------+-------------------------+--------------+----------------+
|course_id|        course_title|         subject|             level|  content_duration|num_lectures|num_subscribers|num_reviews|is_paid|price|         Course List|average number of subscribers|average number of reviews|average prices|average lectures|
+---------+--------------------+----------------+------------------+------------------+------------+---------------+-----------+-------+-----+--------------------+-----------------------------+-------------------------+--------------+----------------+
|   837322|Essentials of mon...|Business Finance|        All Levels|0.6166666666666667|          20|              0|          0|   True|   20|[Essentials of mo...|                          0.0|                      0.0|        53.375|          

                                                                                

In [16]:
# 5. Which courses offer the best cost benefit?
win_cost_benefit_course_df=course_df.filter(F.col("num_subscribers")>50000)\
    .filter(F.col("num_reviews")>3000)\
    .filter(F.col("price")<200)

win_cost_benefit_course_df.show()

+---------+--------------------+----------------+--------------+----------------+------------+---------------+-----------+-------+-----+
|course_id|        course_title|         subject|         level|content_duration|num_lectures|num_subscribers|num_reviews|is_paid|price|
+---------+--------------------+----------------+--------------+----------------+------------+---------------+-----------+-------+-----+
|    48841|Accounting in 60 ...|Business Finance|Beginner Level|             1.5|          16|          56659|       4397|  False|    0|
|   473160|Web Design for We...| Web Development|    All Levels|             3.0|          20|          98867|       6512|  False|    0|
|   314462|Quickstart AngularJS| Web Development|Beginner Level|             1.5|          17|          64128|       4047|  False|    0|
|   173548|Build Your First ...| Web Development|Beginner Level|             3.0|          30|         120291|       5924|  False|    0|
|    41295|Learn HTML5 Progr...| Web Deve

In [19]:
# 6. Find the courses which have more than 15 lectures.
course15_df=course_df.select(["course_title","num_lectures"]).filter(F.col("num_lectures")>15)
course15_df.show()

+--------------------+------------+
|        course_title|num_lectures|
+--------------------+------------+
|Ultimate Investme...|          51|
|Complete GST Cour...|         274|
|Financial Modelin...|          51|
|Beginner to Pro -...|          36|
|How To Maximize Y...|          26|
|Trading Penny Sto...|          25|
|Investing And Tra...|          26|
|Trading Stock Cha...|          23|
|Options Trading 3...|          38|
|Forex Trading Sec...|          76|
|Trading Options W...|          17|
|Financial Managem...|          19|
|Forex Trading Cou...|          16|
|Python Algo Tradi...|          42|
|Short Selling: Le...|          19|
|Basic Technical A...|          16|
|The Complete Char...|          52|
|7 Deadly Mistakes...|          23|
|Winning Forex Tra...|          25|
|Forex Traders - C...|          39|
+--------------------+------------+
only showing top 20 rows



In [20]:
# 7. Find the courses which have duration greater than 2 hours.
windowSpec=Window.partitionBy("level")

win_level_df=course_df.withColumn("Course List",F.collect_list(F.col("course_title")).over(windowSpec))\
    .filter(F.col("content_duration")>2)
win_level_df.show()

[Stage 20:>                                                         (0 + 1) / 1]                                                                                

+---------+--------------------+----------------+----------+----------------+------------+---------------+-----------+-------+-----+--------------------+
|course_id|        course_title|         subject|     level|content_duration|num_lectures|num_subscribers|num_reviews|is_paid|price|         Course List|
+---------+--------------------+----------------+----------+----------------+------------+---------------+-----------+-------+-----+--------------------+
|  1113822|Complete GST Cour...|Business Finance|All Levels|            39.0|         274|           2792|        923|   True|   75|[Ultimate Investm...|
|  1210588|Beginner to Pro -...|Business Finance|All Levels|             3.0|          36|           2451|         11|   True|   95|[Ultimate Investm...|
|   192870|Trading Penny Sto...|Business Finance|All Levels|             3.0|          25|           9221|        138|   True|  150|[Ultimate Investm...|
|   592338|Forex Trading Sec...|Business Finance|All Levels|             5.0