In [None]:
%%html  
<style>.cm-s-ipython .CodeMirror-matchingbracket { color: black !important;}</style><!-- Bob J: draw matching brackets in black (no green highlights) -->
<style>table.dataframe th { vertical-align: top; }</style><!-- Move MultiIndex headers to top of block -->
<style>table.dataframe td, table.dataframe th { border-style: solid; border-width: thin; }</style><!-- thin solid border around every DataFrame table cell and header -->

In [None]:
%%javascript  # Prefer to display output instead of scrolling, so it can print
// Always answer "no" when the output area asks whether it should scroll,
// so the whole output is shown (and printed) in full.
IPython.OutputArea.prototype._should_scroll = (lines) => false;

In [None]:
# Standard definitions and options
import pandas as pd
from pandas import DataFrame
import os
import numpy as np
from datetime import datetime
%matplotlib inline
import matplotlib.pyplot as plt
# default size (width, height) applied to every matplotlib figure
plt.rcParams['figure.figsize'] = (15.0, 10.0)
# widen the page to match the window
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# full width display of tables: show up to 50 columns before truncating
pd.options.display.max_columns = 50
# want to display all rows (None removes the row limit)
pd.options.display.max_rows = None

# ignore FutureWarning messages from here on
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
import json
# Read the log in JSON form, as processed by e.g. transfer.php.
# The encoding is given explicitly so decoding does not depend on the
# locale of the machine running the notebook.
with open('log.json', 'r', encoding='utf-8') as f:
    samples_dict = json.load(f)

In [None]:
# samples_dict

In [None]:
# Maps the exercise code found in the log's item1 field to a display label.
# Each label starts with a two-character sequence number.
exerciseName = {
    # Testing
    "TestIntroJava": " 1 Testing 1 Intro Java",
    "TestIntroCpp": " 3 Testing 3 Intro C++",
    "TestIntroPy": " 2 Testing 2 Intro Python",
    "TestSumPrimesJava": " 4 Testing 4 TestSumPrimes Java",
    "TestSumPrimesPy": " 5 Testing 5 TestSumPrimes Python",
    "TestSumPrimesCpp": " 6 Testing 6 TestSumPrimes C++",
    # Performance
    "ProfileStatsJava": " 7 Performance 1 Java",
    "ProfileStatsCpp": " 8 Performance 2 C++",
    # Memory
    "MemIntroCpp": " 9 Memory 1 C++ Memory Checks",
    # Git
    "GitBasics1": "10 Git 1 GitBasics1",
    "GitBasics2": "11 Git 2 GitBasics2",
    "GitBisect": "12 Git 3 GitBisect",
    "GitHeadBanging": "13 Git 4 GitHeadBanging",
}


In [None]:
# Build the frame from all log records with a single constructor call.
# DataFrame.append was removed in pandas 2.0, and appending one row at a
# time copies the whole frame again on every iteration.
df = DataFrame(list(samples_dict))
# Fields missing from a record come out as NaN; blank them so the later
# comparisons against '' behave uniformly.
df = df.replace(np.nan, '', regex=True)

# Add column
def lookUpCanFail(array, item, failValue) :
    """Return ``array[item]``, or ``failValue`` if the lookup raises.

    Parameters:
        array: any subscriptable container (dict, list, ...).
        item: key or index to look up in ``array``.
        failValue: value returned when ``array[item]`` raises an exception.

    Returns:
        The looked-up value, or ``failValue`` on failure.
    """
    try :
        return array[item]
    except Exception:
        # Only ordinary errors are treated as a failed lookup.  A bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit,
        # which makes a running cell impossible to interrupt.
        return failValue

# Translate every item1 code into its display label; codes that are not in
# exerciseName get "".  Series.map walks only the item1 column instead of
# materialising a full row object per record as apply(axis=1) does.
df['Exercise'] = df['item1'].map(
                    lambda item: lookUpCanFail(exerciseName, item, ""))

# drop items before exercise starts
# df = df[df.DATE > "2022-09-02 15:30:00"]

#df.head(4)

In [None]:
# Show the rows whose item1 is "team", then keep only the other rows in df.
display(df.loc[df["item1"] == "team"])
df = df.loc[df["item1"] != "team"]

In [None]:
# Total number of log rows per Exercise label, shown as a table and as a
# horizontal bar chart.
# Keep only rows that carry a non-null, non-empty Exercise label.
t1 = df[df.Exercise.notnull() & (df.Exercise != "")]
group = t1.groupby(["Exercise"]).count()
count = group['user']
display(count)

fig, ax = plt.subplots(figsize=(16, 9))
ax.barh(group.index, count)
ax.invert_yaxis()  # first label at the top
ax.set_title('Number Attempting Each Exercise');

In [None]:
# Number of non-null REMOTE_HOST entries logged for each host value.
display(df.groupby("host")["REMOTE_HOST"].count())

In [None]:
# display(df.groupby(["user", "host"]).count())

In [None]:
# for each user, the number of non-null entries in every other column
display(df.groupby("user").count())

In [None]:
# Inspect every logged row for a single user (the user name is hard-coded).
display(df.loc[df["user"] == "jake"])

In [None]:
# Last 'setup' row per user - this is the start of working on the exercises.
t1 = df.loc[df["item1"] == 'setup']
t1.drop_duplicates(subset=['user'], keep='last').loc[:, ["DATE", "IP", "host", "user"]]

In [None]:
# plot the startup time (when the student got to 'source setup', i.e. after checkout)
t1 = df[df.item1 == 'setup']

# restrict to exercise period 
# t1 = t1[t1.DATE > "2019-09-17 04:00:00"]

times = pd.to_datetime(t1["DATE"])
# y value is simply the running index of each setup row
count = np.arange(0, len(times))
# plt.plot_date is deprecated in Matplotlib; plt.plot accepts datetimes
# directly, and the 'o' format keeps plot_date's marker-only look.
plt.plot(times, count, 'o')
plt.gcf().autofmt_xdate()
plt.title("Setup Time")
plt.show()


In [None]:
# most recent 'Exercise' by user
t1 = df[df.Exercise.notnull()]
# Rows that are not exercises carry an empty Exercise label rather than a
# null, so notnull() alone lets them through.  Drop them here, otherwise a
# user's "latest exercise" can be a non-exercise row with a blank label.
t2 = (
    t1[t1.Exercise != ""]
    .drop_duplicates(['user'], keep='last')
    .sort_values(['user'])
)
t2[['DATE','user','Exercise']]

In [None]:
# Activity near the end: the last row per user in t1, sorted by DATE,
# showing the final 20.
t2 = (
    t1.drop_duplicates(subset=['user'], keep='last')
      .sort_values(by='DATE')
)
t2.loc[:, ['DATE', 'user', 'Exercise']].tail(20)

In [None]:
# count the number of people with each exercise as most recent
# keep rows with a non-null, non-empty item1
t1 = df[df.item1.notnull()]
t1 = t1[t1.item1 != '']
# last such row per user
t2 = t1.drop_duplicates(['user'], keep='last')
# No explicit sort is needed: groupby returns its groups sorted by key.
# (The previous bare t2.sort_values("Exercise") discarded its result.)
t2.groupby("Exercise").count()['user']

In [None]:
# peek at the first 5 rows of t1 as most recently assigned above
t1.head(5)

In [None]:
# plot time users started their last exercise
# keep rows with a non-null, non-empty item1, then the last one per user
t3 = df[df.item1.notnull()]
t3 = t3[t3.item1 != '']
t3 = t3.drop_duplicates(['user'], keep='last')
# sort_values returns a new frame; assign it so the running count below is
# plotted against times in ascending DATE order
t3 = t3.sort_values("DATE")
display(t3.head())

#t3 = t3[t3.DATE < "2019-09-18 17:30:00"]
#t3 = t3[t3.DATE > "2019-09-18 15:30:00"]

times = pd.to_datetime(t3["DATE"])
# y value is the running index of each user's last row
count = np.arange(0, len(times))
# plt.plot_date is deprecated in Matplotlib; plt.plot accepts datetimes
# directly, and the 'o' format keeps plot_date's marker-only look.
plt.plot(times, count, 'o')
plt.gcf().autofmt_xdate()
plt.title("Time Starting Most Recent Exercise")
plt.show()

