In [339]:
## representing a csv file as a list of rows
import unicodecsv

def readCsv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

enrollments = readCsv('../../intro_to_data_analysis/enrollments.csv')
daily_engagement = readCsv('../../intro_to_data_analysis/daily_engagement.csv')
project_submissions = readCsv('../../intro_to_data_analysis/project_submissions.csv')
print(enrollments[0])
print(daily_engagement[0])
print(project_submissions[0])

OrderedDict([('account_key', '448'), ('status', 'canceled'), ('join_date', '2014-11-10'), ('cancel_date', '2015-01-14'), ('days_to_cancel', '65'), ('is_udacity', 'True'), ('is_canceled', 'True')])
OrderedDict([('acct', '0'), ('utc_date', '2015-01-09'), ('num_courses_visited', '1.0'), ('total_minutes_visited', '11.6793745'), ('lessons_completed', '0.0'), ('projects_completed', '0.0')])
OrderedDict([('creation_date', '2015-01-14'), ('completion_date', '2015-01-16'), ('assigned_rating', 'UNGRADED'), ('account_key', '256'), ('lesson_key', '3176718735'), ('processing_state', 'EVALUATED')])


In [340]:
ACCT_KEY = 'account_key'

In [341]:
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [342]:
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [343]:
from datetime import datetime as dt

# Takes a date as a string, and returns a Python datetime object. 
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

In [344]:
# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [345]:
# Clean up the data types in the engagement table
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [346]:
# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [347]:
len(enrollments)

1640

In [348]:
def getAccountSet(dict):
    tempSet = set()
    for i in dict:
        tempSet.add(i[ACCT_KEY])
    
    return tempSet

In [349]:
for item in daily_engagement:
    item[ACCT_KEY] = item['acct']
    del item['acct']
    
daily_engagement[0]

OrderedDict([('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0),
             ('account_key', '0')])

In [350]:
def countUnique(dictionary, fieldName):
    temp = {}
    for i in dictionary:
        temp[i[fieldName]] = 1
    
    return len(temp)
    
enrollment_num_rows = len(enrollments)             # Replace this with your code
enrollment_num_unique_students = countUnique(enrollments, ACCT_KEY)  # Replace this with your code

engagement_num_rows = len(daily_engagement)             # Replace this with your code
engagement_num_unique_students = countUnique(daily_engagement, ACCT_KEY)  # Replace this with your code

submission_num_rows = len(project_submissions)             # Replace this with your code
submission_num_unique_students = countUnique(project_submissions, ACCT_KEY)  # Replace this with your code

print(enrollment_num_rows)
print(enrollment_num_unique_students)

print(engagement_num_rows)
print(engagement_num_unique_students)

print(submission_num_rows)
print(submission_num_unique_students)

1640
1302
136240
1237
3642
743


In [351]:
students = getAccountSet(enrollments)
daily_students = getAccountSet(daily_engagement)

In [352]:
missingStudents = students.difference(daily_students)

In [353]:
{item for item in daily_engagement if (item[ACCT_KEY] in missingStudents)}

set()

In [354]:
idx = 0
for item in enrollments:
    if ( item[ACCT_KEY] in missingStudents ):
        if (item['days_to_cancel'] or item['days_to_cancel'] == None ):
            print(item)
            if idx > 3:
                break
            idx += 1

OrderedDict([('account_key', '1304'), ('status', 'canceled'), ('join_date', datetime.datetime(2015, 1, 10, 0, 0)), ('cancel_date', datetime.datetime(2015, 3, 10, 0, 0)), ('days_to_cancel', 59), ('is_udacity', True), ('is_canceled', True)])
OrderedDict([('account_key', '1304'), ('status', 'canceled'), ('join_date', datetime.datetime(2015, 3, 10, 0, 0)), ('cancel_date', datetime.datetime(2015, 6, 17, 0, 0)), ('days_to_cancel', 99), ('is_udacity', True), ('is_canceled', True)])
OrderedDict([('account_key', '1101'), ('status', 'current'), ('join_date', datetime.datetime(2015, 2, 25, 0, 0)), ('cancel_date', None), ('days_to_cancel', None), ('is_udacity', True), ('is_canceled', False)])


In [355]:
paid_students = [item for item in enrollments if ( item['is_udacity'] == False and \
                                    (( item['days_to_cancel'] == None ) or ( item['days_to_cancel'] > 7)))]
len(paid_students)

1215

In [356]:
len(getAccountSet(paid_students))

995

In [357]:
# Create a set of the account keys for all Udacity test accounts
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)

6

In [358]:
# Given some data with an account_key field, removes any records corresponding to Udacity test accounts
def remove_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data

In [359]:
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

print(len(non_udacity_enrollments))
print(len(non_udacity_engagement))
print(len(non_udacity_submissions))

1622
135656
3634


In [360]:
#####################################
#                 6                 #
#####################################

## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.

paid_students = {item[ACCT_KEY]:item['join_date'] for item in non_udacity_enrollments if item['cancel_date']==None or (item['cancel_date']-item['join_date']).days > 7}
len(paid_students)

995

In [361]:
paid_students = {}

for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
        account_key = enrollment[ACCT_KEY]
        enrollment_date = enrollment['join_date']
        
        if account_key not in paid_students or \
                enrollment_date > paid_students[account_key]:
            paid_students[account_key] = enrollment_date
                
len(paid_students)

995

In [362]:
# Takes a student's join date and the date of a specific engagement record,
# and returns True if that engagement record happened within one week
# of the student joining.
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7 and time_delta.days >= 0

In [363]:
#####################################
#                 7                 #
#############################dd########

## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.
#paid_accounts_set = getAccountSet(paid_students)
paid_engagement_in_first_week = [item for item in non_udacity_engagement \
                                if item[ACCT_KEY] in paid_students and \
                                within_one_week(paid_students[item[ACCT_KEY]],item['utc_date'])]
len(paid_engagement_in_first_week)

6919

In [369]:
from collections import defaultdict

# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.
def group_data(data, key_name):
    grouped_data = defaultdict(list)
    for data_point in data:
        key = engagement_record[key_name]
        grouped_data[key].append(data_point)
    return grouped_data
    
# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.
engagement_by_account = group_data(paid_engagement_in_first_week, ACCT_KEY)

In [373]:
def sum_grouped_items(grouped_data, field_name):
    # Create a dictionary with the total minutes each student spent in the classroom during the first week.
    # The keys are account keys, and the values are numbers (total minutes)
    summed_data = {}
    for key, data_points in grouped_data.items():
        total = 0
        for data_point in data_points:
            total += data_point[field_name]
        summed_data[key] = total
        
    return summed_data

total_minutes_by_account = sum_grouped_items(engagement_by_account, 'total_minutes_visited')
total_minutes_by_account

{'1305': 305174.785119662}

In [372]:
import numpy as np

def describe_data(data):
    # Summarize the data about minutes spent in the classroom
    lData = list(data.values())
    print('Mean:', np.mean(lData))
    print('Standard deviation:', np.std(lData))
    print('Minimum:', np.min(lData))
    print('Maximum:', np.max(lData))
    
describe_data(total_minutes_by_account)

{'1305': 305174.785119662}
Mean: 305174.78512
Standard deviation: 0.0
Minimum: 305174.78512
Maximum: 305174.78512


In [338]:
import numpy as np

# Summarize the data about minutes spent in the classroom
total_minutes = total_minutes_by_account.values()
listTotalMinutes = list(total_minutes)
print('Mean:', np.mean(listTotalMinutes))
print('Standard deviation:', np.std(listTotalMinutes))
print('Minimum:', np.min(listTotalMinutes))
print('Maximum:', np.max(listTotalMinutes))

Mean: 305174.78512
Standard deviation: 0.0
Minimum: 305174.78512
Maximum: 305174.78512


In [318]:
student_with_max_minutes = None
max_minutes = 0

for student, total_minutes in total_minutes_by_account.items():
    if total_minutes > max_minutes:
        max_minutes = total_minutes
        student_with_max_minutes = student
        
print(max_minutes)
print(student_with_max_minutes)

3564.7332644989997
163


In [315]:
def getStudentMaxX(total_X_by_account):
    student_with_max_X = None
    max_X = 0

    for student, total_X in total_X_by_account.items():
        if total_X > max_X:
            max_X = total_X
            student_with_max_X = student
            
    return student_with_max_X

In [312]:
# Create a dictionary with the total lessons_completed for each student during the first week.
# The keys are account keys, and the values are numbers (lessons_completed)
total_lessons_completed_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_lessons_completed = 0
    for engagement_record in engagement_for_student:
        total_lessons_completed += engagement_record['lessons_completed']
    total_lessons_completed_by_account[account_key] = total_lessons_completed

In [314]:
describe_data(total_lessons_completed_by_account)

Mean: 1.63618090452
Standard deviation: 3.00256129983
Minimum: 0
Maximum: 36


In [316]:
getStudentMaxX(total_lessons_completed_by_account)

'633'

In [319]:
print(engagement_by_account['633'])

[OrderedDict([('utc_date', datetime.datetime(2015, 7, 17, 0, 0)), ('num_courses_visited', 4), ('total_minutes_visited', 489.042620667), ('lessons_completed', 10), ('projects_completed', 0), ('account_key', '633')]), OrderedDict([('utc_date', datetime.datetime(2015, 7, 18, 0, 0)), ('num_courses_visited', 2), ('total_minutes_visited', 264.106566334), ('lessons_completed', 8), ('projects_completed', 0), ('account_key', '633')]), OrderedDict([('utc_date', datetime.datetime(2015, 7, 19, 0, 0)), ('num_courses_visited', 1), ('total_minutes_visited', 173.814019), ('lessons_completed', 3), ('projects_completed', 0), ('account_key', '633')]), OrderedDict([('utc_date', datetime.datetime(2015, 7, 20, 0, 0)), ('num_courses_visited', 2), ('total_minutes_visited', 259.304326334), ('lessons_completed', 6), ('projects_completed', 0), ('account_key', '633')]), OrderedDict([('utc_date', datetime.datetime(2015, 7, 21, 0, 0)), ('num_courses_visited', 1), ('total_minutes_visited', 108.428018), ('lessons_com