In [None]:
# common imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# pandas imports
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split

# display configuration
pd.options.display.max_columns = None  # None lifts the limit, so every column is shown
sns.set(style="whitegrid")  # white-grid seaborn style for the plots

## 1. Getting the Data

In [None]:
# load the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="udemy_courses.csv")

In [None]:
# peek at the first five rows to get a feel for the columns and their values
df.head(n=5)

In [None]:
# DataFrame shape as a (rows, columns) tuple,
# to get a sense of how much data we are working with
df.shape

In [None]:
# print a concise summary of the DataFrame
df.info()

In [None]:
# count the missing values in each column
df.isnull().sum()

In [None]:
# descriptive statistics for the numerical attributes
df.describe()

> As shown by the null-value check above, there are no missing values, which is excellent!
>
> *It is vital to understand the features we are working with.*
> ### Features in the DataFrame:
>> 1. course_id: Course identification number
>> 2. course_title: Title of course
>> 3. url: Course URL
>> 4. is_paid: True if the course costs money, False if the course is free
>> 5. price: Price of course
>> 6. num_subscribers: Number of subscribers for the course
>> 7. num_reviews: Number of reviews for the course
>> 8. num_lectures: Number of lectures in the course
>> 9. level: Difficulty level of the course
>> 10. content_duration: Duration of all course materials (in hours)
>> 11. published_timestamp: Course publication date
>> 12. subject: Subject of course

In [None]:
# A histogram for each numerical attribute, to get a first feel for the distributions.
# course_id is an arbitrary identification number and is_paid is a True/False flag,
# so neither has a meaningful distribution; both are left out of the grid.
df.drop(columns=["course_id", "is_paid"]).hist(bins=30, figsize=(20, 15))
plt.show()

> Initial observations from the histograms:
>> 1. Most course durations are between 0-5 hours.
>> 2. There are usually around 1-50 lectures per course.
>> 3. Courses tend to have few reviews. There are probably a handful of courses
>> with a large number of reviews, since the X axis goes up to 25000 while over 3000
>> instances are represented in the first bin.
>> 4. The majority of courses are in the same range of subscribers. The instances farther up
>> the scale were probably more successful or perhaps courses on a trending topic.
>> 5. Assuming the prices are in USD, they range from 0 to 250 dollars.
>> The plot shows that the most common price is roughly 25 dollars.
>>
>> It would probably be a good idea to look further into some of these values,
>> for instance courses whose content duration or number of lectures is listed as 0.

> ### Objective:
> #### Predicting the number of subscribers for a course.
>> ##### Chosen Feature:
>> *num_subscribers column*
>>> The column represents how many people have subscribed to each individual course.
>>> ##### Motive:
>>> Predicting the number of people interested in a course. The more subscribers, the
>>> more popular the course.

> #### Splitting the Data:
>> Before further analysis, let's split the data into a training set and a testing set.
>> Exploring only the training set guards against data-snooping bias: patterns noticed in the
>> full dataset would leak information about the test set into our modelling decisions.

In [None]:
# hold out 20% of the rows as a test set using sklearn's train_test_split;
# fixing random_state makes the shuffle, and therefore the split, identical on every run
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# report how many rows ended up in each subset
print("Number of instances in training set: ", train_set.shape[0])
print("Number of instances in testing set: ", test_set.shape[0])

## 2. Understanding and Visualizing the Data
> ##### *The motivation for this section is to gain more insights*

In [None]:
# work on an independent (deep) copy of the training set
df2 = train_set.copy(deep=True)

In [None]:
# quick look at the first two rows of the copy
df2.head(n=2)