In [49]:
# Import libraries

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Acquire the data

In [50]:
# Load the curriculum access log (raw text file without a header row).
#
# The separator regex splits on a whitespace character only when it is
#   (a) followed by an even number of double quotes up to the end of the line
#       (i.e. the whitespace is not inside a quoted field), and
#   (b) not followed by a `]` that has no `[` before it
#       (i.e. the whitespace is not inside a bracketed field).
# A field that is literally "-" (including the quotes) is read as missing.
log_reader_options = {
    'sep': r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    'engine': 'python',
    'header': None,
    'index_col': False,
    'na_values': '"-"',
}

df_log = pd.read_csv('anonymized-curriculum-access.txt', **log_reader_options)

df_log.head()

Unnamed: 0,0,1,2,3,4,5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [51]:
# Summarize the raw access log: column dtypes and non-null counts
df_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719459 entries, 0 to 719458
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       719459 non-null  object 
 1   1       719459 non-null  object 
 2   2       719458 non-null  object 
 3   3       719459 non-null  int64  
 4   4       674619 non-null  float64
 5   5       719459 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 32.9+ MB


In [52]:
# Load the cohort info
# (read with pandas defaults, so the first line of cohorts.csv supplies the
# column names)

df_cohort = pd.read_csv('cohorts.csv')
df_cohort.head()

Unnamed: 0,cohort_id,name,start_date,end_date,program_id
0,1,Arches,2014-02-04,2014-04-22,1
1,2,Badlands,2014-06-04,2014-08-22,1
2,3,Carlsbad,2014-09-04,2014-11-05,1
3,4,Denali,2014-10-20,2015-01-18,1
4,5,Everglades,2014-11-18,2015-02-24,1


In [53]:
# Summarize the cohort table: column dtypes and non-null counts
df_cohort.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cohort_id   46 non-null     int64 
 1   name        46 non-null     object
 2   start_date  46 non-null     object
 3   end_date    46 non-null     object
 4   program_id  46 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ KB


### Prepare the datasets for merge

In [54]:
# Give the six positional columns of df_log descriptive names
# (order matches the fields as they appear in the log file)
log_column_names = ['date', 'time', 'page_accessed', 'user_id', 'cohort_id', 'ip']

df_log.columns = log_column_names
df_log.head()

Unnamed: 0,date,time,page_accessed,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [55]:
# How many unique cohort_id values appear in df_log?
# (nunique() skips NaN by default, so log rows without a cohort are not counted)
df_log.cohort_id.nunique()  # 40 unique cohort ids

40

In [56]:
# How many unique cohorts in df_cohort? Compare with the 40 seen in df_log
df_cohort.cohort_id.nunique() # 46 unique cohort_ids, i.e. 6 more than in the log

46

In [57]:
# Which cohort_ids exist in df_cohort but never show up in df_log?

# value_counts() drops NaN, so each index holds only the real ids
cohort_ids = df_cohort.cohort_id.value_counts().index
log_cohort_ids = df_log.cohort_id.value_counts().index

# Ids known to the cohort table minus ids observed in the access log
set(cohort_ids).difference(log_cohort_ids)

{3, 10, 20, 30, 60, 132}

In [58]:
# How many cohorts belong to each program_id in df_cohort?
# (the number of rows in the result is the number of distinct program ids)
df_cohort.program_id.value_counts()

2    26
1    14
3     4
4     2
Name: program_id, dtype: int64

**Takeaways**:
- df_cohort is pretty clean and doesn't need much cleaning. 
- Six cohort ids in df_cohort never appear in the access log, so I will left join from the access log when merging them. 
- There are 4 program ids. I will categorize them into two groups: 
    - data science: 3
    - web development: 1, 2, 4

### Merge two datasets

In [59]:
# Show the (rows, columns) shape of both dataframes before merging,
# as a baseline for checking the merged row count
df_log.shape, df_cohort.shape

((719459, 6), (46, 5))

In [60]:
# Left join the access log with the cohort table on cohort_id so every log
# row is kept, including rows whose cohort_id is missing.
# validate='many_to_one' makes pandas raise a MergeError if a cohort_id is
# ever duplicated in df_cohort, which would otherwise silently duplicate
# access-log rows.
df = df_log.merge(df_cohort, how='left', on='cohort_id', validate='many_to_one')

# A many-to-one left join must leave the number of log rows unchanged
assert len(df) == len(df_log), 'left join changed the number of access-log rows'

# Print the shape
df.shape

(719459, 10)

In [61]:
# Take a peek at the new dataframe.
# A fixed random_state keeps the displayed rows the same on every
# Restart & Run All; the value itself is arbitrary.
df.sample(5, random_state=42)

Unnamed: 0,date,time,page_accessed,user_id,cohort_id,ip,name,start_date,end_date,program_id
291489,2019-07-02,10:54:27,search/search_index.json,393,31.0,97.105.19.58,Andromeda,2019-03-18,2019-07-30,2.0
375904,2019-10-18,15:17:34,java-iii,64,28.0,97.105.19.58,Staff,2014-02-04,2014-02-04,2.0
15668,2018-02-27,12:07:50,jquery,30,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2.0
100766,2018-08-08,14:13:05,javascript-i/functions,235,25.0,97.105.19.58,Wrangell,2018-07-23,2018-11-29,2.0
459138,2020-02-18,09:55:24,javascript-i/bom-and-dom/bom,570,53.0,97.105.19.58,Fortuna,2020-01-13,2020-06-23,2.0


In [62]:
# Quick check: there should be 40 unique cohort ids in the new dataframe,
# the same number as in df_log before the merge
df.cohort_id.nunique() # matched

40

### Handle date and time columns
- Combine the access date and time into a single timestamp and set it as the index
- Convert the start and end dates to datetime dtypes

In [63]:
# Combine the date and time strings ("<date> <time>") and parse them
# into a single datetime Series
access_time = pd.to_datetime(df['date'].str.cat(df['time'], sep=' '))

# Use the parsed timestamp as the index, order the rows chronologically,
# then discard the two source columns that are now redundant
df = (
    df.assign(timestamp=access_time)
      .set_index('timestamp')
      .sort_index()
      .drop(columns=['date', 'time'])
)

# Take a peek at the df
df.head()

Unnamed: 0_level_0,page_accessed,user_id,cohort_id,ip,name,start_date,end_date,program_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-26 09:55:03,/,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2.0


In [64]:
# Print the dtypes of the columns
# (start_date and end_date are still plain object columns at this point)
df.dtypes

page_accessed     object
user_id            int64
cohort_id        float64
ip                object
name              object
start_date        object
end_date          object
program_id       float64
dtype: object

In [65]:
# Convert the cohort start and end dates to datetime dtype, one column at a time

for date_col in ('start_date', 'end_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Print the dtypes to confirm the conversion
df.dtypes

page_accessed            object
user_id                   int64
cohort_id               float64
ip                       object
name                     object
start_date       datetime64[ns]
end_date         datetime64[ns]
program_id              float64
dtype: object

### Break down the page information to lessons

In [66]:
# Create a variable to hold page_accessed
# NOTE(review): this binds the column Series taken from df, not an explicit
# copy -- whether later in-place edits on df_pages reach df depends on the
# pandas version / Copy-on-Write setting; confirm if that matters downstream.
df_pages = df.page_accessed
df_pages.head()

timestamp
2018-01-26 09:55:03                                      /
2018-01-26 09:56:02                                java-ii
2018-01-26 09:56:05    java-ii/object-oriented-programming
2018-01-26 09:56:06     slides/object_oriented_programming
2018-01-26 09:56:24              javascript-i/conditionals
Name: page_accessed, dtype: object

In [67]:
# Replace the bare root path '/' with the label 'homepage'.
# Only values that are exactly '/' are replaced; slashes inside longer paths
# are left alone.
# The result is rebound instead of using inplace=True: df_pages was taken
# straight from df.page_accessed, so an in-place edit is a chained
# modification of df (pandas warns about it, and it is not propagated under
# Copy-on-Write).
df_pages = df_pages.replace('/', 'homepage')

In [68]:
# Split the url by the first '/' and expand to two columns:
#   column 0 -> text before the first '/'
#   column 1 -> everything after it (missing when the path has no '/')
# n=1 limits the split to a single cut, so deeper path levels stay together
# in column 1. Note that df_pages changes from a Series to a DataFrame here.
df_pages = df_pages.str.split('/', n=1, expand=True)

# Take a peek at the new df
df_pages.head()

Unnamed: 0_level_0,0,1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-26 09:55:03,homepage,
2018-01-26 09:56:02,java-ii,
2018-01-26 09:56:05,java-ii,object-oriented-programming
2018-01-26 09:56:06,slides,object_oriented_programming
2018-01-26 09:56:24,javascript-i,conditionals


In [69]:
# Change the column names: first path segment -> 'lesson',
# remainder of the path -> 'lesson_detail'
df_pages.columns = ['lesson', 'lesson_detail']

In [70]:
# Attach the lesson columns side by side with the original dataframe, then
# keep only `lesson` by dropping the raw page path and the lesson detail
df = (
    pd.concat([df, df_pages], axis=1)
      .drop(columns=['page_accessed', 'lesson_detail'])
)

# Print the shape
df.shape

(719459, 8)

In [71]:
# Take a peek at the new dataframe (page_accessed is gone, lesson is added)
df.head()

Unnamed: 0_level_0,user_id,cohort_id,ip,name,start_date,end_date,program_id,lesson
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-26 09:55:03,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0,homepage
2018-01-26 09:56:02,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0,java-ii
2018-01-26 09:56:05,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0,java-ii
2018-01-26 09:56:06,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0,slides
2018-01-26 09:56:24,2,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2.0,javascript-i


In [72]:
# Quick summary: dtypes and non-null counts, used to locate missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 719459 entries, 2018-01-26 09:55:03 to 2020-11-02 16:48:47
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   user_id     719459 non-null  int64         
 1   cohort_id   674619 non-null  float64       
 2   ip          719459 non-null  object        
 3   name        674619 non-null  object        
 4   start_date  674619 non-null  datetime64[ns]
 5   end_date    674619 non-null  datetime64[ns]
 6   program_id  674619 non-null  float64       
 7   lesson      719458 non-null  object        
dtypes: datetime64[ns](2), float64(2), int64(1), object(3)
memory usage: 49.4+ MB


**Takeaways**
- Null values are found in the columns below:
    - cohort_id
    - name (the cohort name)
    - start_date
    - end_date
    - program_id
    - lesson
- Fill the non-datetime missing values with 0 or string 'zero'.
- Fill the missing values in start_date with the earliest datetime.
- Fill the missing values in end_date with the latest datetime.
- Fill the single missing value in lesson with 'homepage'.

### Handle missing values

#### Handling missing values in cohort id

In [73]:
# Count the rows whose cohort_id is missing
df.cohort_id.isna().sum()

44840

In [74]:
# Fill the missing values in cohort_id with 0 (placeholder for "no cohort").
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df.cohort_id is chained assignment, which pandas
# deprecates and which leaves df unchanged under Copy-on-Write.
df['cohort_id'] = df['cohort_id'].fillna(0)

# Print the number of missing values in cohort_id (expected: 0)
df['cohort_id'].isna().sum()

0

#### Handling missing values in cohort names

In [75]:
# Count the rows whose cohort name is missing
df.name.isna().sum()

44840

In [76]:
# Fill the missing values in cohort names with the placeholder string 'zero'.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df.name is chained assignment, which pandas
# deprecates and which leaves df unchanged under Copy-on-Write.
df['name'] = df['name'].fillna('zero')

# Print the number of missing values in cohort names (expected: 0)
df['name'].isna().sum()

0

#### Handling missing values in program id

In [77]:
# Count the rows whose program_id is missing
df.program_id.isna().sum()

44840

In [78]:
# Fill the missing values in program_id with 0 (placeholder for "no program").
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df.program_id is chained assignment, which pandas
# deprecates and which leaves df unchanged under Copy-on-Write.
df['program_id'] = df['program_id'].fillna(0)

# Print the number of missing values in program_id (expected: 0)
df['program_id'].isna().sum()

0

#### Handling missing values in lesson

In [79]:
# Count the rows whose lesson is missing
df.lesson.isna().sum()

1

In [80]:
# Fill the missing values in lesson with 'homepage'.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df.lesson is chained assignment, which pandas
# deprecates and which leaves df unchanged under Copy-on-Write.
df['lesson'] = df['lesson'].fillna('homepage')

# Print the number of missing values in lesson (expected: 0)
df['lesson'].isna().sum()

0

#### Handling missing values in start and end date

In [81]:
# Print the date range of the dataset: earliest and latest access timestamp
# in the index
df.index.min(), df.index.max()

(Timestamp('2018-01-26 09:55:03'), Timestamp('2020-11-02 16:48:47'))

In [82]:
# Count the rows whose cohort start_date is missing
df.start_date.isna().sum()

44840

In [83]:
# Fill the missing values in start_date with the earliest access timestamp.
# The value is read from the index instead of being hard-coded, so it stays
# correct if the log file is refreshed.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df.start_date is chained assignment, which pandas
# deprecates and which leaves df unchanged under Copy-on-Write.
df['start_date'] = df['start_date'].fillna(df.index.min())

# Print the number of missing values in start_date (expected: 0)
df['start_date'].isna().sum()

0

In [84]:
# Count the rows whose cohort end_date is missing
df.end_date.isna().sum()

44840

In [85]:
# Fill the missing values in end_date with the most recent access timestamp.
# The value is read from the index instead of being hard-coded, so it stays
# correct if the log file is refreshed.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df.end_date is chained assignment, which pandas
# deprecates and which leaves df unchanged under Copy-on-Write.
df['end_date'] = df['end_date'].fillna(df.index.max())

# Print the number of missing values in end_date (expected: 0)
df['end_date'].isna().sum()

0

In [86]:
# Print the quick summary of the dataset to confirm that every column is
# now fully non-null
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 719459 entries, 2018-01-26 09:55:03 to 2020-11-02 16:48:47
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   user_id     719459 non-null  int64         
 1   cohort_id   719459 non-null  float64       
 2   ip          719459 non-null  object        
 3   name        719459 non-null  object        
 4   start_date  719459 non-null  datetime64[ns]
 5   end_date    719459 non-null  datetime64[ns]
 6   program_id  719459 non-null  float64       
 7   lesson      719459 non-null  object        
dtypes: datetime64[ns](2), float64(2), int64(1), object(3)
memory usage: 49.4+ MB


In [87]:
# Take a peek at the dataset.
# A fixed random_state keeps the displayed rows the same on every
# Restart & Run All; the value itself is arbitrary.
df.sample(5, random_state=42)

Unnamed: 0_level_0,user_id,cohort_id,ip,name,start_date,end_date,program_id,lesson
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-06-10 13:39:24,502,28.0,99.76.234.122,Staff,2014-02-04,2014-02-04,2.0,appendix
2019-07-12 03:56:41,421,32.0,172.127.182.225,Betelgeuse,2019-05-28,2019-10-08,2.0,javascript-i
2019-12-02 11:03:07,512,51.0,97.105.19.58,Deimos,2019-09-16,2020-02-27,2.0,java-ii
2018-05-30 13:30:03,124,23.0,97.105.19.61,Ulysses,2018-03-05,2018-07-19,2.0,slides
2018-02-21 22:10:50,23,22.0,108.65.244.91,Teddy,2018-01-08,2018-05-17,2.0,javascript-ii


**Takeaways**:
- All the missing values have been filled. 
- Now the dataset is ready for exploration and modeling. 
- I will build helper functions in wrangle.py to acquire and prepare the raw data.