In [1]:
# Import libraries

import warnings
# Silence every warning for the rest of the session so library notices do not clutter cell output.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings; consider narrowing it.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Acquire the data

In [2]:
# Load the curriculum access log

# The file has no header row, so the six fields are named at load time.
# This keeps every later inspection of df_log (e.g. df_log.info()) labelled
# and reproducible on a top-to-bottom run; the explicit rename further down
# then simply re-applies the same names.
LOG_COLUMNS = ['date', 'time', 'page_accessed', 'user_id', 'cohort_id', 'ip']

df_log = pd.read_csv(
    'anonymized-curriculum-access.txt',
    engine='python',  # a regex separator requires the python parsing engine
    header=None,
    names=LOG_COLUMNS,
    index_col=False,
    # Split on a whitespace character unless it sits inside double quotes
    # or inside square brackets.
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    na_values='"-"',  # a quoted dash is read as a missing value
)

df_log.head()

Unnamed: 0,0,1,2,3,4,5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [33]:
# Summarize df_log: column dtypes, non-null counts and memory usage
df_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719459 entries, 0 to 719458
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           719459 non-null  object 
 1   time           719459 non-null  object 
 2   page_accessed  719458 non-null  object 
 3   user_id        719459 non-null  int64  
 4   cohort_id      674619 non-null  float64
 5   ip             719459 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 32.9+ MB


In [7]:
# Read the cohort reference table from CSV and preview its first rows

df_cohort = pd.read_csv(filepath_or_buffer='cohorts.csv')
df_cohort.head(n=5)

Unnamed: 0,cohort_id,name,start_date,end_date,program_id
0,1,Arches,2014-02-04,2014-04-22,1
1,2,Badlands,2014-06-04,2014-08-22,1
2,3,Carlsbad,2014-09-04,2014-11-05,1
3,4,Denali,2014-10-20,2015-01-18,1
4,5,Everglades,2014-11-18,2015-02-24,1


In [8]:
# Summarize df_cohort: column dtypes, non-null counts and memory usage
df_cohort.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cohort_id   46 non-null     int64 
 1   name        46 non-null     object
 2   start_date  46 non-null     object
 3   end_date    46 non-null     object
 4   program_id  46 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ KB


### Prepare the datasets for merge

In [3]:
# Give the six positional columns of df_log descriptive names

df_log.columns = [
    'date',
    'time',
    'page_accessed',
    'user_id',
    'cohort_id',
    'ip',
]
df_log.head(n=5)

Unnamed: 0,date,time,page_accessed,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [5]:
# Count the distinct cohort ids present in the access log (missing values are not counted)
df_log['cohort_id'].nunique()  # 40 distinct cohort ids

40

In [10]:
# Count the distinct cohort ids in the cohort table, to compare with the 40 seen in df_log
df_cohort['cohort_id'].nunique()  # 46 distinct cohort ids

46

In [28]:
# Cohort ids that exist in df_cohort but never occur in df_log

# value_counts() skips NaN, so each index holds only the real ids
log_cohort_ids = df_log['cohort_id'].value_counts().index
cohort_ids = df_cohort['cohort_id'].value_counts().index
set(cohort_ids).difference(log_cohort_ids)

{3, 10, 20, 30, 60, 132}

In [9]:
# Number of cohorts belonging to each program_id
df_cohort['program_id'].value_counts()

2    26
1    14
3     4
4     2
Name: program_id, dtype: int64

**Takeaways**:
- df_cohort is pretty clean and doesn't need a lot of cleaning.
- Do I need to convert the start and end dates to timestamps?

### Merge two datasets

In [11]:
# Show (rows, columns) of both frames before they are merged
(df_log.shape, df_cohort.shape)

((719459, 6), (46, 5))

In [13]:
# Left join the cohort details onto every log row, keyed on cohort_id.
# validate='many_to_one' makes pandas raise a MergeError if a cohort_id is
# ever duplicated in df_cohort, which would otherwise silently multiply log rows.
# Log rows without a matching cohort_id keep NaN in the cohort columns.
df = df_log.merge(df_cohort, how='left', on='cohort_id', validate='many_to_one')

# Print the shape: the row count should equal that of df_log
df.shape

(719459, 10)

In [15]:
# Take a peek at the new dataframe.
# A fixed random_state makes the same five rows appear on every run, so the
# displayed output stays reproducible.
df.sample(5, random_state=42)  # cohort columns should line up with cohort_id

Unnamed: 0,date,time,page_accessed,user_id,cohort_id,ip,name,start_date,end_date,program_id
192321,2019-02-06,11:09:35,javascript-i/introduction/working-with-data-ty...,344,29.0,97.105.19.58,Zion,2019-01-22,2019-06-04,2.0
667531,2020-09-11,13:31:55,toc,736,,23.116.170.48,,,,
666111,2020-09-10,14:53:43,mysql/relationships/sub-queries,670,58.0,65.43.116.84,Hyperion,2020-05-26,2020-11-10,2.0
531080,2020-05-03,17:13:47,javascript-i/mapbox-api,464,31.0,104.182.14.234,Andromeda,2019-03-18,2019-07-30,2.0
491928,2020-03-24,11:35:01,java-i/methods,570,53.0,172.124.67.93,Fortuna,2020-01-13,2020-06-23,2.0


In [29]:
# Sanity check: the merged frame should still contain 40 distinct cohort ids
df['cohort_id'].nunique()  # matches df_log

40

### Handle date and time columns
- Combine the access date and time into a single timestamp and set it as the index
- Convert the start and end dates to datetime dtype

In [30]:
# Replace the separate date/time string columns with one sorted datetime index
df = (
    df
    # join "date time" into one string per row and parse it as a datetime
    .assign(timestamp=lambda frame: pd.to_datetime(frame['date'].str.cat(frame['time'], sep=' ')))
    # the raw string columns are no longer needed
    .drop(columns=['date', 'time'])
    # index by access time, in chronological order
    .set_index('timestamp')
    .sort_index()
)

# Preview the re-indexed frame
df.head()

Unnamed: 0_level_0,page_accessed,user_id,cohort_id,ip,name,start_date,end_date,program_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-26 09:55:03,/,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2.0


In [31]:
# Show the dtype of every column; start_date and end_date are still object dtype here, not datetimes
df.dtypes

page_accessed     object
user_id            int64
cohort_id        float64
ip                object
name              object
start_date        object
end_date          object
program_id       float64
dtype: object

In [32]:
# Parse the cohort start and end dates into datetime columns

for date_col in ('start_date', 'end_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Show the resulting dtypes
df.dtypes

page_accessed            object
user_id                   int64
cohort_id               float64
ip                       object
name                     object
start_date       datetime64[ns]
end_date         datetime64[ns]
program_id              float64
dtype: object

### Break down the page information to lessons

In [None]:
# Creat