In [None]:
# common imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# pandas imports
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

# display setup
pd.options.display.max_columns = None  # None removes the cap on how many columns are shown
sns.set(style="whitegrid")  # whitegrid seaborn style for the plots

## 1. Getting the Data

In [None]:
# load udemy_courses.csv (path relative to the notebook's working directory) into a DataFrame
df = pd.read_csv("udemy_courses.csv")

In [None]:
# display the first 5 rows for a quick look
df.head()

In [None]:
# DataFrame shape (rows, columns)
# understand the amount of data we are working with
df.shape

In [None]:
# column names, non-null counts and dtypes
df.info()

In [None]:
# number of missing (NaN) values in each column
df.isna().sum()

In [None]:
# summary statistics of the numerical attributes
df.describe()

> As shown above, there are no missing values which is excellent!
>
> *It is vital to understand the features we are working with.*
> ### Features in the DataFrame:
>> 1. course_id: Course identification number
>> 2. course_title: Title of course
>> 3. url: Course URL
>> 4. is_paid: True if the course costs money, False if the course is free
>> 5. price: Price of course
>> 6. num_subscribers: Number of subscribers for the course
>> 7. num_reviews: Number of reviews for the course
>> 8. num_lectures: Number of lectures in the course
>> 9. level: Difficulty level of the course
>> 10. content_duration: Duration of all course materials
>> 11. published_timestamp: Course publication date
>> 12. subject: Subject of course

In [None]:
# one histogram per numerical attribute; the True/False is_paid flag is left out
numeric_features = df.drop(columns=["is_paid"])
numeric_features.hist(bins=30, figsize=(20, 15))
plt.show()

> Initial observations from the histograms:
>> 1. Most course durations are between 0-5 hours.
>> 2. There are usually around 1-50 lectures per course.
>> 3. Courses tend to have few reviews. There are probably a handful of courses
>> with a large amount of reviews since the X axis goes up to 25000 while over 3000
>> instances are represented in the first bin.
>> 4. The majority of courses are in the same range of subscribers. The instances farther up
>> the scale were probably more successful or perhaps courses on a trending topic.
>> 5. Assuming the prices are in USD, the range is between 0-250 dollars.
>> The plot shows that the most common price is roughly \$25.
>>
>> It would probably be a good idea to look further into some of these values.
>> For instance, if the content duration or number of lectures is listed as 0.

> ### Objective:
> #### Predicting the number of subscribers for a course.
>> ##### Chosen Feature:
>> *num_subscribers column*
>>> The column represents how many people have subscribed for each individual course.
>>> ##### Motive:
>>> Predicting the number of people interested in a course. The more subscribers, the
>>> more popular the course.

> #### Splitting the Data:
>> Before further analysis, let's split the data into a training set and a testing set.
>> This helps avoid the bias that could come from exploring the data as a whole, including the rows that are later used for testing.

In [None]:
# hold out 20% of the rows for testing with sklearn's train_test_split;
# a fixed random_state makes the shuffle, and therefore the split, identical on every run
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# report how many rows ended up in each split
print("Number of instances in training set: ", len(train_set))
print("Number of instances in testing set: ", len(test_set))

## 2. Understanding and Visualizing the Data
> ##### *The motivation for this section is to gain more insights*

In [None]:
# work on a deep copy of the training set so exploration does not modify train_set
df2 = train_set.copy()

In [None]:
# first two rows of the exploration copy
df2.head(2)

> ## Exploring Attribute Combinations

In [None]:
# correlation matrix of the numeric and boolean columns only;
# text columns are excluded explicitly because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 (older versions dropped them silently)
corr_matrix = df2.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# correlation of every attribute with num_subscribers, strongest positive first
subscriber_corr = corr_matrix["num_subscribers"]
subscriber_corr.sort_values(ascending=False)

In [None]:
# scatter matrix (pairwise scatter plots) of num_subscribers and the
# attributes that correlate with it
attributes = [
    "num_subscribers",
    "num_reviews",
    "num_lectures",
    "content_duration",
    "course_id",
]

scatter_matrix(df2.loc[:, attributes], figsize=(12, 8))
plt.tight_layout()
plt.show()

In [None]:
# num_reviews against num_subscribers;
# a low alpha makes the dense regions of the distribution easier to see
reviews_ax = df2.plot.scatter(
    x="num_reviews",
    y="num_subscribers",
    alpha=0.1,
    color='b',
    figsize=(10, 5),
)
reviews_ax.set_title("Reviews and Subscribers Correlation", size=20)
reviews_ax.set_xlabel("num_reviews", size=15)
reviews_ax.set_ylabel("num_subscribers", size=15)
plt.tight_layout()
plt.show()

> #### Correlations with num_subscribers Attribute- Overview:
> The strongest positive correlations (0.1 or more) are:
> * num_reviews
> * num_lectures
> * content_duration
>
> The strongest negative correlations (-0.1 or less) are:
> * course_id
> * is_paid

> ### Examining Course ID Feature

In [None]:
# compare the number of distinct course IDs with the number of rows
unique_id_count = df2["course_id"].nunique()
row_count = df2.shape[0]
print("Number of unique course IDs:", unique_id_count)
print("Length of DataFrame:", row_count)

> Since almost every row has its own unique course ID, the correlation is probably
> coincidental.

In [None]:
# rows whose course_id already appeared earlier in the frame
is_repeated_id = df2.duplicated(subset="course_id")
df2[is_repeated_id]

In [None]:
# remove duplicated listings: keep the first row for each course_id, which is
# the same key the duplicate check uses, so every remaining course_id is unique
# (rebinding df2 instead of using inplace=True keeps the cell safe to re-run)
df2 = df2.drop_duplicates(subset="course_id")

In [None]:
# shape after removing duplicates, to see how many rows were dropped
df2.shape

> ### Overview:
> * The course ID is unique for each course.
> * This column should be removed when training a model in order to generalize better.

> ### Assessing Price Features

In [None]:
# peek at the raw is_paid values before encoding
df2["is_paid"].head(10)

In [None]:
# replace the is_paid values with the integer codes produced by an OrdinalEncoder
ordinal_encoder = OrdinalEncoder(dtype=int)
is_paid_codes = ordinal_encoder.fit_transform(df2[["is_paid"]])
df2["is_paid"] = is_paid_codes

In [None]:
# evaluate changes: the column should now hold integer codes
df2["is_paid"].head(10)

In [None]:
# categories_ lists the original values in code order (position 0 -> code 0, position 1 -> code 1)
# expected here: 0 is False, 1 is True
ordinal_encoder.categories_

In [None]:
# count number of instances for each outcome
df2["is_paid"].value_counts()

In [None]:
# group the rows by price; the groupby object is reused in later cells
price_values = df2.groupby("price")

In [None]:
# rows priced at 0; the row count should match the number of free courses
# (is_paid == 0) from the value counts
price_values_0 = price_values.get_group(0)
price_values_0.shape

In [None]:
# bar chart of how many courses fall under each is_paid value
paid_fig, paid_ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=df2["is_paid"], ax=paid_ax)
paid_ax.set_title("Free and Paid Courses", size=20)
paid_ax.set_xlabel("is_paid", size=15)
paid_ax.set_ylabel("count", size=15)
plt.tight_layout()
plt.show()

In [None]:
# number of courses at each price, ordered by price
df2["price"].value_counts().sort_index()

In [None]:
# the ten most frequent prices, ordered by how many courses use them (most common first)
prices_top10 = df2["price"].value_counts().sort_values(ascending=False).head(10)

In [None]:
# share of the data (in %, rounded to 2 decimals) held by each of the top-10 prices
prices_percent_in_data = (prices_top10 / len(df2) * 100).round(2).tolist()

# total subscribers for each of those prices, in the same order as prices_top10
subscribers_by_price = price_values["num_subscribers"].sum()
num_subscribed = subscribers_by_price.loc[prices_top10.index].tolist()

In [None]:
# create a DataFrame with the results, one row per price
prices_top10_dict = {"price": prices_top10.index, "number_of_instances": prices_top10.values,
                     "% of data": prices_percent_in_data, "num_subscribers": num_subscribed}
# size the 1-based rank index from the data instead of hard-coding ten rows,
# so the cell still works when there are fewer than ten distinct prices
prices_top10_df = pd.DataFrame(prices_top10_dict, index=range(1, len(prices_top10) + 1))
prices_top10_df

In [None]:
# total subscribers for each of the ten most common prices
top10_fig, top10_ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=prices_top10_df["price"],
    y=prices_top10_df["num_subscribers"],
    ax=top10_ax,
)
top10_ax.set_xlabel("price", size=15)
top10_ax.set_ylabel("num_subscribers\n(millions)", size=15)
top10_ax.set_title("Top 10 Common Prices by Subscribers", size=20)
plt.tight_layout()
plt.show()

In [None]:
# content duration against the encoded is_paid value;
# a low alpha lets dense regions stand out
duration_fig, duration_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    x=df2["content_duration"],
    y=df2["is_paid"],
    alpha=0.1,
    ax=duration_ax,
)
duration_ax.set_title("Content Duration by Type of Course Payment", size=20)
duration_ax.set_xlabel("content_duration", size=15)
duration_ax.set_ylabel("is_paid", size=15)
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): this draws the same figure as the barplot under
# "Researching Level and Subject Feature"; consider removing one of the two.
# mean subscribers per course for each subject, split by level
# (sns.barplot aggregates with the mean by default; the black bars represent the error)
plt.figure(figsize=(10,5))
sns.barplot(x=df2["subject"], y=df2["num_subscribers"], hue=df2["level"])
plt.title("Subject by Number of Subscribers and Level", size=20)
plt.xlabel("subject", size=15)
plt.ylabel("mean num_subscribers", size=15)
plt.tight_layout()
plt.show()


> ### Observations:
> * Close to the estimate from the initial observations (roughly \$25), \$20 is the most common price for a course.
> * The number of listings with the price \$0 matches the number of instances that were
> labeled "False" in the is_paid column.
> * The prices listed tend to increase by 5 dollars until they reach the maximum price,
> which is \$200.
> * Among the 10 most common prices in the data, the free courses have the most subscribers.
> * Content duration is longer for paid courses.

> ### Researching Level and Subject Feature

In [None]:
# count number of instances for each level (reused for the pie chart)
level_values = df2["level"].value_counts()
level_values

In [None]:
# count number of instances for each subject (reused for the pie chart)
subject_values = df2["subject"].value_counts()
subject_values

In [None]:
# side-by-side pie charts: share of each course level (left) and each subject (right)
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
pie_specs = [
    (level_values, "Course Levels"),
    (subject_values, "Course Subjects"),
]
for pie_axis, (counts, heading) in zip(ax, pie_specs):
    pie_axis.pie(counts, startangle=180, labels=counts.index, autopct="%1.1f%%")
    pie_axis.set_title(heading, size=20)
plt.tight_layout()
plt.show()

In [None]:
# price (x) against course level (y); a low alpha highlights the common prices
level_price_fig, level_price_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    x=df2["price"],
    y=df2["level"],
    alpha=0.1,
    ax=level_price_ax,
)
level_price_ax.set_title("Price by Course Level", size=20)
level_price_ax.set_xlabel("price", size=15)
level_price_ax.set_ylabel("level", size=15)
plt.tight_layout()
plt.show()

In [None]:
# plot the mean number of subscribers per course for each subject and level
# (sns.barplot aggregates with the mean by default)
# the black bars represent the error
plt.figure(figsize=(10,5))
sns.barplot(x=df2["subject"], y=df2["num_subscribers"], hue=df2["level"])
plt.title("Subject by Number of Subscribers and Level", size=20)
plt.xlabel("subject", size=15)
# the bars are per-course means, not totals in millions, so label them as such
plt.ylabel("mean num_subscribers", size=15)
plt.tight_layout()
plt.show()

> ### Observations:
> * All Levels is the most common level, representing over 50%.
> * Web Development is the most common subject, and Business Finance is a close second,
> with a difference of approximately 1%.
> * Price variations according to the level of the course also show that Expert is
> the least common level in the data. It is also the only level that does not
> provide free courses. Courses at the other levels are spread more densely
> across the price range.
> * Web Development courses have significantly more subscribers on average than the other subjects.
> Since Business Finance is only slightly behind in number of courses, it is likely that people are more
> interested in studying Web Development courses.

> ### Analyzing Additional Columns

In [None]:
# examine current shape
df2.shape

In [None]:
# number of distinct URLs; compare with the row count to check that every course has its own URL
df2["url"].nunique()

In [None]:
# number of distinct titles; a value below the row count means some courses share a title
df2["course_title"].nunique()

In [None]:
# find rows whose course_title appears more than once
# keep=False flags every occurrence of a repeated title as True (not only the later ones)
title_df = df2[df2.duplicated("course_title", keep=False)].copy()
# show duplicated titles
title_df["course_title"].unique()

In [None]:
# group the duplicated rows by course title
title = title_df.groupby("course_title")

In [None]:
# examining one of the duplicated courses
# the courses have the same name and different values for some features
# (get_group raises KeyError if this title is not among the duplicated titles)
title.get_group("Acoustic Blues Guitar Lessons")