In [1]:
import unicodecsv as csv

# Data Analysis Process

### Step 1: Get data

In [2]:
# LONG WAY #
# enrollment_data = []
# file = open('enrollments.csv', 'rb')
# data = csv.DictReader(file)
# for i in data:
#     enrollment_data.append(i)
# file.close()

# SHORTER WAY #
# with open('enrollments.csv', 'rb') as file:
#     data = csv.DictReader(file)
#     enrollment_data = list(data)
    
# BEST WAY #
def read_from_csv(filename):
    with open(filename, 'rb') as file:
        data = csv.DictReader(file)
        return list(data)

enrollment_data = read_from_csv('enrollments.csv')
engagement_data = read_from_csv('engagements.csv')
submission_data = read_from_csv('submissions.csv')

In [3]:
enrollment_data[0]

{'account_key': '700',
 'status': 'canceled',
 'join_date': '2017-11-10',
 'cancel_date': '2017-11-16',
 'days_to_cancel': '6',
 'is_enrolled': 'FALSE',
 'is_canceled': 'TRUE'}

In [4]:
engagement_data[0]

{'acct': '0',
 'utc_date': '1/9/2018',
 'num_courses_visited': '1',
 'total_minutes_visited': '11.6793745',
 'lessons_completed': '0',
 'projects_completed': '0'}

In [5]:
submission_data[0]

{'creation_date': '1/14/2018',
 'completion_date': '1/16/2018',
 'assigned_rating': 'UNGRADED',
 'account_key': '256',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

Categorical Data - Any value that if you ran mathematical operation on it, you'd get nonsensical data

Ordinal Data - Any value that if you ran mathematical operation on it, you'd get sensical data

### Step 2: Clean Data

In [6]:
from datetime import datetime as dt

def convert_to_date(date_string):
    if date_string == '':
        return None
    return dt.strptime(date_string, '%Y-%m-%d')

def convert_to_int(int_string):
    if int_string == '':
        return None
    return int(int_string)

def convert_to_float(float_string):
    if float_string == '':
        return None
    return float(float_string)

def convert_to_bool(bool_string):
    if bool_string == '':
        return None
    elif bool_string == 'TRUE':
        return True
    elif bool_string == 'FALSE':
        return False

In [7]:
for r in enrollment_data:
    r['join_date'] = convert_to_date(r['join_date'])
    r['cancel_date'] = convert_to_date(r['cancel_date'])
    r['days_to_cancel'] = convert_to_int(r['days_to_cancel'])
    r['is_enrolled'] = convert_to_bool(r['is_enrolled'])
    r['is_canceled'] = convert_to_bool(r['is_canceled'])

In [8]:
enrollment_data[0]

{'account_key': '700',
 'status': 'canceled',
 'join_date': datetime.datetime(2017, 11, 10, 0, 0),
 'cancel_date': datetime.datetime(2017, 11, 16, 0, 0),
 'days_to_cancel': 6,
 'is_enrolled': False,
 'is_canceled': True}

In [13]:
for r in engagement_data:
    r['utc_date'] = convert_to_date(r['utc_date'])
    r['num_courses_visited'] = convert_to_int(r['num_courses_visited'])
    r['total_minutes_visited'] = convert_to_float(r['total_minutes_visited'])
    r['lessons_completed'] = convert_to_int(r['lessons_completed'])
    r['projects_completed'] = convert_to_int(r['projects_completed'])

ValueError: time data '1/9/2018' does not match format '%Y-%m-%d'

In [14]:
engagement_data[0]

{'acct': '0',
 'utc_date': '1/9/2018',
 'num_courses_visited': '1',
 'total_minutes_visited': '11.6793745',
 'lessons_completed': '0',
 'projects_completed': '0'}

In [15]:
for r in submission_data:
    r['creation_date'] = convert_to_date(r['creation_date'])
    r['completion_date'] = convert_to_date(r['completion_date'])

ValueError: time data '1/14/2018' does not match format '%Y-%m-%d'

In [16]:
submission_data[0]

{'creation_date': '1/14/2018',
 'completion_date': '1/16/2018',
 'assigned_rating': 'UNGRADED',
 'account_key': '256',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [17]:
# RUN ONLY ONCE!!!!!!!!!!!!!!!!!!!
for row in engagement_data:
    row['account_key'] = row['acct']
    del row['acct']

In [18]:
engagement_data[0]

{'utc_date': '1/9/2018',
 'num_courses_visited': '1',
 'total_minutes_visited': '11.6793745',
 'lessons_completed': '0',
 'projects_completed': '0',
 'account_key': '0'}

### Step 3: Answer Questions with Data

In [19]:
print(f"Enrollment records: {len(enrollment_data)}")
print(f"Engagement records: {len(engagement_data)}")
print(f"Submission records: {len(submission_data)}")

Enrollment records: 1640
Engagement records: 136240
Submission records: 3642


In [20]:
def get_unique_records(dataset, column_name):
    unique_data = set()
    for i in dataset:
        unique_data.add(i[column_name])
    return unique_data

unique_enrollments = get_unique_records(enrollment_data, 'account_key')
unique_engagements = get_unique_records(engagement_data, 'account_key')
unique_submissions = get_unique_records(submission_data, 'account_key')

In [21]:
print(f"Unique enrollment records: {len(unique_enrollments)}")
print(f"Unique engagement records: {len(unique_engagements)}")
print(f"Unique submission records: {len(unique_submissions)}")

Unique enrollment records: 1302
Unique engagement records: 1237
Unique submission records: 743


In [22]:
# unique_engagements

In [23]:
outliers = 0
for i in enrollment_data:
    if i['account_key'] not in unique_engagements and i['join_date'] != i['cancel_date']:
        outliers+=1
        print(i, end='\n\n')

{'account_key': '1304', 'status': 'canceled', 'join_date': datetime.datetime(2018, 1, 10, 0, 0), 'cancel_date': datetime.datetime(2018, 3, 10, 0, 0), 'days_to_cancel': 59, 'is_enrolled': True, 'is_canceled': True}

{'account_key': '1304', 'status': 'canceled', 'join_date': datetime.datetime(2018, 3, 10, 0, 0), 'cancel_date': datetime.datetime(2018, 6, 17, 0, 0), 'days_to_cancel': 99, 'is_enrolled': True, 'is_canceled': True}

{'account_key': '1101', 'status': 'current', 'join_date': datetime.datetime(2018, 2, 25, 0, 0), 'cancel_date': None, 'days_to_cancel': None, 'is_enrolled': True, 'is_canceled': False}



In [24]:
test_accounts = set()
for i in enrollment_data:
    if i['is_enrolled']:
        test_accounts.add(i['account_key'])
print(f'{len(test_accounts)} possible test accounts found')

6 possible test accounts found


In [25]:
def remove_test_accounts(dataset):
    records_to_remove = []
    for i in dataset:
        if i['account_key'] not in test_accounts:
            records_to_remove.append(i)
    return records_to_remove

In [26]:
# Remove data associated with test accounts across all datasets
true_enrollments = remove_test_accounts(enrollment_data)
true_engagements = remove_test_accounts(engagement_data)
true_submissions = remove_test_accounts(submission_data)

In [27]:
print(f"True enrollment records: {len(true_enrollments)}")
print(f"True engagement records: {len(true_engagements)}")
print(f"True submission records: {len(true_submissions)}")

True enrollment records: 1622
True engagement records: 135656
True submission records: 3634


<h4>Find quality accounts in which:</h4>
<ol>
    <li>The account is currently active i.e. is_canceled is False</li>
    <li>days_to_cancel is greater than 7 i.e. the trial period</li>
    <li>The latest vald record for each particular VALID user</li>
</ol>

In [28]:
true_enrollments[0]

{'account_key': '700',
 'status': 'canceled',
 'join_date': datetime.datetime(2017, 11, 10, 0, 0),
 'cancel_date': datetime.datetime(2017, 11, 16, 0, 0),
 'days_to_cancel': 6,
 'is_enrolled': False,
 'is_canceled': True}

In [29]:
# What we want accounts_in_good_standing to look like
# {
#     '123': datetime()
#     '345': datetime()
#     '456': datetime()
# }

In [30]:
accounts_in_good_standing = dict()

for i in true_enrollments:
    # The account is currently active i.e. is_canceled is False
    if i['days_to_cancel'] is not None and i['cancel_date'] is not None: # 1
        if not i['is_canceled'] or i['days_to_cancel'] > 7: # days_to_cancel is greater than 7 i.e. the trial period
            # The latest vald record for each particular VALID user
            if i['account_key'] not in accounts_in_good_standing or i['join_date'] > accounts_in_good_standing[i['account_key']]:
                accounts_in_good_standing[i['account_key']] = i['join_date']

In [31]:
len(accounts_in_good_standing)

445

In [32]:
accounts_in_good_standing

{'429': datetime.datetime(2018, 3, 10, 0, 0),
 '60': datetime.datetime(2018, 1, 14, 0, 0),
 '322': datetime.datetime(2018, 2, 12, 0, 0),
 '584': datetime.datetime(2018, 1, 14, 0, 0),
 '458': datetime.datetime(2017, 11, 10, 0, 0),
 '1058': datetime.datetime(2018, 1, 14, 0, 0),
 '45': datetime.datetime(2017, 11, 10, 0, 0),
 '315': datetime.datetime(2017, 11, 10, 0, 0),
 '408': datetime.datetime(2018, 4, 1, 0, 0),
 '51': datetime.datetime(2018, 3, 10, 0, 0),
 '323': datetime.datetime(2017, 11, 10, 0, 0),
 '130': datetime.datetime(2018, 4, 1, 0, 0),
 '550': datetime.datetime(2018, 5, 28, 0, 0),
 '44': datetime.datetime(2017, 11, 10, 0, 0),
 '440': datetime.datetime(2017, 11, 11, 0, 0),
 '57': datetime.datetime(2017, 11, 11, 0, 0),
 '1090': datetime.datetime(2017, 11, 11, 0, 0),
 '541': datetime.datetime(2017, 11, 12, 0, 0),
 '756': datetime.datetime(2017, 11, 15, 0, 0),
 '101': datetime.datetime(2017, 11, 12, 0, 0),
 '800': datetime.datetime(2018, 3, 4, 0, 0),
 '702': datetime.datetime(201

In [33]:
def remove_free_trials(dataset):
    free_trials_list = []
    for i in dataset:
        if i['account_key'] in accounts_in_good_standing:
            free_trials_list.append(i)
    return free_trials_list

quality_enrollments = remove_free_trials(true_enrollments)
quality_engagements = remove_free_trials(true_engagements)
quality_submissions = remove_free_trials(true_submissions)

In [34]:
print(f"Quality enrollment records: {len(quality_enrollments)}")
print(f"Quality engagement records: {len(quality_engagements)}")
print(f"Quality submission records: {len(quality_submissions)}")

Quality enrollment records: 719
Quality engagement records: 55675
Quality submission records: 1110


In [35]:
quality_engagements[0]

{'utc_date': '11/10/2017',
 'num_courses_visited': '2',
 'total_minutes_visited': '136.1835995',
 'lessons_completed': '0',
 'projects_completed': '0',
 'account_key': '3'}

In [36]:
# How many students were active on the website within the first week?

def engagement_within_first_week(date_joined, date_of_first_engagement):
    time_difference = date_of_first_engagement - date_joined
    if time_difference.days >= 0 and time_difference.days < 8:
        return True
    
for i in quality_engagements:
    if i['num_courses_visited'] > 0:
        i['has_visited'] = 1
    else:
        i['has_visited'] = 0
        
first_week_engagements = list()

for i in quality_engagements:
    if engagement_within_first_week(accounts_in_good_standing[i['account_key']], i['utc_date']):
        first_week_engagements.append(i)

TypeError: '>' not supported between instances of 'str' and 'int'

In [37]:
print(f"{len(first_week_engagements)} engagements within the first week.")

NameError: name 'first_week_engagements' is not defined

In [38]:
first_week_engagements[0]

NameError: name 'first_week_engagements' is not defined

In [39]:
# How many minutes did each students spend online in the first week?
from collections import defaultdict

def compare(dataset, a_key):
    new_data = defaultdict(list)
    for i in dataset:
        account_key = i[a_key]
        new_data[account_key].append(i)
    return new_data

engagements_grouped_by_account = compare(first_week_engagements, 'account_key')

NameError: name 'first_week_engagements' is not defined

In [40]:
engagements_grouped_by_account['3'][0]

NameError: name 'engagements_grouped_by_account' is not defined

In [41]:
# engagements_grouped_by_account['0']

In [42]:
def sum_records(dataset, column):
    new_data = dict()
    for k, v in dataset.items():
        sum_ = 0
        for record in v:
            sum_ += record[column]
        new_data[k] = sum_
    return new_data

minutes_per_account = sum_records(engagements_grouped_by_account, 'total_minutes_visited')

NameError: name 'engagements_grouped_by_account' is not defined

In [43]:
minutes_per_account

NameError: name 'minutes_per_account' is not defined

## HOMEWORK QUESTIONS

In [44]:
# HOMEWORK:

# Take a look at data and come up with 3 good questions and use some of the methods done in this 
# Jupyter Notebook to successfully answer the questions.

##### 1) Which lesson had the highest pass rate?
##### 2) Which lesson had the lowest pass rate?
##### 3) Which lesson was the most popular and which was the least?

In [45]:
true_submissions[0]

{'creation_date': '1/14/2018',
 'completion_date': '1/16/2018',
 'assigned_rating': 'UNGRADED',
 'account_key': '256',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [46]:
y = 0
lesson_keys = []

for i in true_submissions:
    if true_submissions[y]['lesson_key'] not in lesson_keys:
        lesson_keys.append(true_submissions[y]['lesson_key'])
    y+=1

In [47]:
lesson_keys

['3176718735',
 '3165188753',
 '3168208620',
 '3174288624',
 '4110338963',
 '3184238632',
 '4576183932',
 '4582204201',
 '3562208770',
 '4180859007',
 '746169184']

In [71]:
lessons_dict = {}

for i in lesson_keys:
    if i not in lessons_dict:
        lessons_dict[i] = {'PASSED':0, 'INCOMPLETE': 0}

In [72]:
lessons_dict

{'3176718735': {'PASSED': 0, 'INCOMPLETE': 0},
 '3165188753': {'PASSED': 0, 'INCOMPLETE': 0},
 '3168208620': {'PASSED': 0, 'INCOMPLETE': 0},
 '3174288624': {'PASSED': 0, 'INCOMPLETE': 0},
 '4110338963': {'PASSED': 0, 'INCOMPLETE': 0},
 '3184238632': {'PASSED': 0, 'INCOMPLETE': 0},
 '4576183932': {'PASSED': 0, 'INCOMPLETE': 0},
 '4582204201': {'PASSED': 0, 'INCOMPLETE': 0},
 '3562208770': {'PASSED': 0, 'INCOMPLETE': 0},
 '4180859007': {'PASSED': 0, 'INCOMPLETE': 0},
 '746169184': {'PASSED': 0, 'INCOMPLETE': 0}}

In [79]:
x = 0

for i in true_submissions:
    a = i['lesson_key']
    if i['lesson_key'] in lessons_dict and i['assigned_rating'] == 'INCOMPLETE':
        lessons_dict[a]['INCOMPLETE'] += 1
    elif i['lesson_key'] in lessons_dict and i['assigned_rating'] == 'PASSED':
        lessons_dict[a]['PASSED'] += 1
    else:
        continue
        
#     print(a)

In [80]:
lessons_dict

{'3176718735': {'PASSED': 643, 'INCOMPLETE': 816},
 '3165188753': {'PASSED': 208, 'INCOMPLETE': 396},
 '3168208620': {'PASSED': 357, 'INCOMPLETE': 277},
 '3174288624': {'PASSED': 122, 'INCOMPLETE': 172},
 '4110338963': {'PASSED': 12, 'INCOMPLETE': 18},
 '3184238632': {'PASSED': 52, 'INCOMPLETE': 74},
 '4576183932': {'PASSED': 144, 'INCOMPLETE': 48},
 '4582204201': {'PASSED': 85, 'INCOMPLETE': 99},
 '3562208770': {'PASSED': 0, 'INCOMPLETE': 0},
 '4180859007': {'PASSED': 1, 'INCOMPLETE': 2},
 '746169184': {'PASSED': 3, 'INCOMPLETE': 5}}

In [117]:
total = lessons_dict['4180859007']['PASSED'] / (lessons_dict['4180859007']['PASSED'] + lessons_dict['4180859007']['INCOMPLETE'])
lesson_1 = round((total * 100), 2) 
lesson_1

33.33

In [156]:
# Pass Rate

for i in lessons_dict:
    if lessons_dict[i]['PASSED'] == 0 and lessons_dict[i]['INCOMPLETE'] == 0:
        print(f' Lesson Key: 3562208770 has no passes or incompletions.')
        continue   
    total = (lessons_dict[i]['PASSED']) / ((lessons_dict[i]['PASSED'] + lessons_dict[i]['INCOMPLETE']))
    print(f' Lesson Key: {i} has a pass rate of {round((total * 100), 2)}%')

 Lesson Key: 3176718735 has a pass rate of 44.07%
 Lesson Key: 3165188753 has a pass rate of 34.44%
 Lesson Key: 3168208620 has a pass rate of 56.31%
 Lesson Key: 3174288624 has a pass rate of 41.5%
 Lesson Key: 4110338963 has a pass rate of 40.0%
 Lesson Key: 3184238632 has a pass rate of 41.27%
 Lesson Key: 4576183932 has a pass rate of 75.0%
 Lesson Key: 4582204201 has a pass rate of 46.2%
 Lesson Key: 3562208770 has no passes or incompletions.
 Lesson Key: 4180859007 has a pass rate of 33.33%
 Lesson Key: 746169184 has a pass rate of 37.5%


In [162]:
# Total attemps 

for i in lessons_dict:
    total = lessons_dict[i]['PASSED'] + lessons_dict[i]['PASSED']
    print(f' Lesson Key: {i} had {total} total graded attempts')

 Lesson Key: 3176718735 had 1286 total graded attempts
 Lesson Key: 3165188753 had 416 total graded attempts
 Lesson Key: 3168208620 had 714 total graded attempts
 Lesson Key: 3174288624 had 244 total graded attempts
 Lesson Key: 4110338963 had 24 total graded attempts
 Lesson Key: 3184238632 had 104 total graded attempts
 Lesson Key: 4576183932 had 288 total graded attempts
 Lesson Key: 4582204201 had 170 total graded attempts
 Lesson Key: 3562208770 had 0 total graded attempts
 Lesson Key: 4180859007 had 2 total graded attempts
 Lesson Key: 746169184 had 6 total graded attempts


### Answers

In [160]:
print("""1) Lesson 4576183932 had the highest pass rate
2) Lesson 4180859007 had the lowest pass rate
3) Lesson 3562208770 was the least popular and lesson 3176718735 was the most popular
""")


1) Lesson 4576183932 had the highest pass rate
2) Lesson 4180859007 had the lowest pass rate
3) Lesson 3562208770 was the least popular and lesson 3176718735 was the most popular

