In [None]:
# importing all packages
import pandas as pd
import gzip
import re
from urllib.parse import urlparse, parse_qsl, unquote
import os
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import math
import numpy as np
import datetime 
import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import display
import plotly.graph_objects as go 
import plotly.express as px
import asyncio
from plotly.subplots import make_subplots

%matplotlib inline

In [None]:
!pip install google-cloud-bigquery
!pip install pandas-gbq

# Authenticate to GCP in the shell, like this:
# gcloud auth application-default login

from google.cloud import bigquery
from pandas.io import gbq

import os

# Directory that holds the service-account key file.
# "xxx.json" below is presumably a placeholder for the real key filename.
current_dir = "/home/jovyan/datahub-usage-analysis/dashboard"
# os.path.join supplies the path separator; plain string concatenation produced
# ".../dashboardxxx.json", a path outside of current_dir.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(current_dir, "xxx.json")

# Set up BigQuery client
# project_id is reused below when the query is executed.
# NOTE(review): `client` is not referenced again in the visible cells (the query
# is run through gbq.read_gbq) — confirm whether it is still needed.
project_id = 'ucb-datahub-2018'
client = bigquery.Client(project=project_id)

# Define SQL query
# replace the FROM... with the BigQuery table you want to query
# The trailing `*` in the table name selects every table sharing the `stderr_` prefix.
query = """
SELECT *
FROM `ucb-datahub-2018.datahub_fa24.stderr_*`
"""

# Opens the fall 2023 dataset. The gzip handle is closed as soon as the frame is
# loaded instead of staying open for the lifetime of the kernel, and the path
# keeps its own name rather than being overwritten by the DataFrame.
nbgitpuller_fall23_filename = '/home/jovyan/discovery-su24-dataset/nbgitpuller-clicks-fall-2023.jsonl.gz'
with gzip.open(nbgitpuller_fall23_filename) as fall23_file:
    nbgitpuller_fall23 = pd.read_json(fall23_file, lines=True)

# Opens the spring 2024 dataset the same way.
nbgitpuller_filename = '/home/jovyan/discovery-su24-dataset/nbgitpuller-clicks-sp24.jsonl.gz'
with gzip.open(nbgitpuller_filename) as sp24_file:
    nbgitpuller_sp24 = pd.read_json(sp24_file, lines=True)

# Combines both datasets, spring first then fall, with a fresh 0..n-1 index.
nbgitpuller_df = pd.concat([nbgitpuller_sp24, nbgitpuller_fall23], ignore_index=True)

In [None]:
# Execute query and load data into DataFrame
# NOTE(review): this rebinds `nbgitpuller_df`, replacing the frame built from the
# local .jsonl.gz files in the previous cell — confirm which source is intended.
nbgitpuller_df = gbq.read_gbq(query, project_id=project_id)

nbgitpuller_df.head()

In [None]:
# Snapshot the loaded log rows to a CSV in the working directory.
# Default to_csv settings are used, so the index is written as the first column.
nbgitpuller_df.to_csv("nbgitpuller_su24.csv")

In [None]:
# Isolate the request target of every log line: the text that follows 'GET'
# and precedes the '->' redirection marker, with surrounding whitespace removed.
urls_all = nbgitpuller_df['textPayload'].apply(
    lambda payload: payload[payload.find('GET') + 3:payload.find('->')].strip()
)

# Break each target into its URL components (path, query, ...) with urllib.parse.
urls_parsed_all = urls_all.apply(urlparse)

# The action is the final segment of the URL path (e.g. 'git-pull').
nbgitpuller_df['actions'] = urls_parsed_all.apply(
    lambda parsed: os.path.basename(parsed.path)
)

In [None]:
# function to determine the filetypes
def path_extension_puller(row):
    """
    pandas row function; uses apply
    takes the parsed query (list of (key, value) pairs) and returns a tuple
    (path, extension): the 'urlpath' value (or 'subPath' when there is no
    'urlpath') and the first extension of interest found in the text after
    the path's last '.'; either element is 'NaN' when it cannot be determined
    """
    params = dict(row)
    path_key = next((name for name in ('urlpath', 'subPath') if name in params), None)
    if path_key is None:
        return 'NaN', 'NaN'

    path = params[path_key]
    pieces = path.split('.')
    if len(pieces) <= 1:
        # no '.' anywhere in the path, so there is nothing to classify
        return path, 'NaN'

    suffix = pieces[-1]
    # files that the analysis is interested in; order matters because the
    # patterns are searched (not anchored) and the first one that hits wins
    wanted_extensions = ('ipyn[b]?', 'Rmd', 'pdf', 'txt', 'xml', 'ini', 'csv', 'py', 'R', 'md')
    for pattern in wanted_extensions:
        hits = re.findall(pattern, suffix)
        if hits:
            return path, hits[-1]
    return path, 'NaN'

def get_repo(row):
    """
    pandas row function; uses apply
    returns the url-unquoted value of the first query parameter whose name
    contains 'repo', or 'NaN' when the parsed query has no such parameter
    """
    return next((unquote(val) for name, val in row if 'repo' in name), 'NaN')

def repo_parsing(row):
    """
    pandas row function; uses apply
    takes the list of 'user/repo/...' strings captured from a github.com url and
    returns the second '/'-separated segment of the first entry when it has more
    than two segments, otherwise its last segment; returns 'NaN' for an empty row
    """
    if not row:
        return 'NaN'
    segments = row[0].split('/')
    return segments[1] if len(segments) > 2 else segments[-1]

In [None]:
# makes a new dataframe that only contains git-pull and resets index
nbgitpuller_df_pull = nbgitpuller_df[nbgitpuller_df.actions == 'git-pull'].reset_index()

# obtains all the log info: the bracketed prefix of each payload, with the
# brackets removed and split on spaces (raw string avoids invalid-escape warnings)
log_info_pull = nbgitpuller_df_pull.textPayload.apply(lambda x: ''.join(re.findall(r"\[.*\]", x)).replace('[', '').replace(']', '').split(' '))

# retrieves the hubs for each textpayload
hub_source_pull = nbgitpuller_df_pull.resource.apply(lambda x: x['labels']['namespace_name'])

# obtains substring after GET and before the redirection
urls_pull = nbgitpuller_df_pull.textPayload.apply(lambda x: x[x.find('GET')+3:x.find('->')].strip())

# uses urllib.parse to parse the url into path and query
urls_parsed_pull = urls_pull.apply(urlparse)

# uses parsed urls to obtain the action as a quality check
actions_pull = urls_parsed_pull.apply(lambda x: os.path.basename(x.path))

# breaks apart the parsed query into repo/urlpath
urls_queries_pull = urls_parsed_pull.apply(lambda x: parse_qsl(x.query))

# getting the file type from urlpath
path_extension_pull = urls_queries_pull.apply(path_extension_puller)

# gets repo urls from the parsed url
repos_pull = urls_queries_pull.apply(get_repo)

# extracts everything after github.com/ in the repo url; the list is empty when
# the repo is not a github.com url (raw string avoids invalid-escape warnings)
repos_parsed_pull = repos_pull.apply(lambda x: re.findall(r"github\.com/+(.+)", x) if x else 'NaN')

# obtains the user and git content from github.com repo urls
git_user_pull = repos_parsed_pull.apply(lambda x: x[0].split('/')[0] if x else 'NaN')
git_user_repo_pull = repos_parsed_pull.apply(repo_parsing)

# adds it all into a dataframe
nbgitpuller_textPayload_df_pull = pd.DataFrame({'log_info_type': log_info_pull.apply(lambda x: x[0]),
                                           'timestamp_date': log_info_pull.apply(lambda x: x[1]),
                                           'timestamp_time': log_info_pull.apply(lambda x: x[2]),
                                           'action': actions_pull,
                                           'git_query': urls_queries_pull,
                                           'repo': repos_pull,
                                           'git_user_content': repos_parsed_pull,
                                           'git_user': git_user_pull,
                                           'git_content': git_user_repo_pull,
                                           'git_path': path_extension_pull.apply(lambda x: x[0]),
                                           'file_extension': path_extension_pull.apply(lambda x: x[1]),
                                           'hub': hub_source_pull})

In [None]:
# Builds one searchable string per row: the github.com 'user/repo...' capture(s)
# joined together, then '/', then the url path. git_path is already a string, so
# the second ''.join leaves it unchanged. This column feeds the course and
# semester regex matchers below.
nbgitpuller_textPayload_df_pull['git_user_content_path'] = nbgitpuller_textPayload_df_pull.apply(lambda x: ''.join(x['git_user_content']) + '/' + ''.join(x['git_path']), axis = 1)

In [None]:
# Regex -> course label. Patterns are searched (not anchored) in the cleaned,
# lowercased repo/path string, and dict order matters: the first hit wins.
# Kept at module level so the table is built once rather than once per row.
_COURSE_PATTERNS = {
    '(data8|ds8)': 'data8', '(ds100|data100)': 'data100', '(prob140)': 'data140', #data
    '(caldataeng|data101|ds101)': 'data101', '(data6|ds6)': 'data6', '(data102|ds102)': 'data102', #data
    '(data4ac|ds4ac)': 'data4ac', '(data198|ds198)': 'data198',
    '(cs189|compsci189)': 'compsci189', '(cs170|compsci170)': 'compsci170', #compsci
    '(ee16a|eecs16a)': 'eecs16a', '(ee16b|eecs16b)': 'eecs16b', '(eecs127)': 'eecs127', #eecs
    '(ee120|eleng120)': 'eleng120', #electrical engineering
    '(physics111b)': 'physics111b', '(physics88)': 'physics88', # physics
    # (?<!e) keeps 'ps88' from firing inside 'eps88', which used to make the
    # eps88 entry further down unreachable
    '(polsci3|ps3|polisci3)': 'polsci3', '(polsci5|ps5)': 'polsci5', '(polsci88|(?<!e)ps88)': 'polsci88', '(ps109|polsci109)': 'polsci109', # polisci
    # pattern was 'civeng90' and label was 'civgeng190'; both were typos for civeng190
    '(ce190|civeng190)': 'civeng190', '(ce93|civeng93)': 'civeng93', '(ce200b|civeng200b)': 'civeng200b', '(ce110|civeng110)': 'civeng110', #civileng
    '(envecon118|eep118)': 'envecon118', '(eep147|envecon147)': 'envecon147', '(eep153|envecon153)': 'envecon153', #environmental
    'ph[w]?142': 'pbhlth142', 'ph[w]?251': 'pbhlth251', 'ph[w]?290': 'pbhlth290', 'ph[w]?252': 'pbhlth252', 'ph[w]?253': 'pbhlth253', 'pbhlth250c': 'pbhlth250c',
    'ph[w]?196': 'pbhlth196', # public health
    'mcb163l': 'mcellbi163l', 'mcb280': 'mcellbi280', 'mcbc117': 'mcellbic117', 'mcb32': 'mcellbi32', 'mcb288': 'mcellbi288', #molecular cell bio
    '(bio1b|biology1b)': 'biology1b', # biology
    'stat88': 'stat88', 'stat157': 'stat157', 'stat159': 'stat159', 'stat131': 'stat131', 'stat135': 'stat135', 'stat20': 'stat20',
    'stat150': 'stat150', #stat
    'math124': 'math124', #math
    '(demog180)': 'demog180', 'demog[c]?175': 'demog175', #demography
    '(eps130)': 'eps130', '(eps88)': 'eps88', 'eps256': 'eps256', 'eps24': 'eps24',
    '(econ140)': 'econ140', '(econ148)': 'econ148', 'econ141': 'econ141', 'econ172': 'econ172', 'econ151': 'econ151', #econ
    'econ157': 'econ157', 'econ130': 'econ130', 'econ143': 'econ143', 'econ135': 'econ135',
    '(rbridge)': 'datasci_rbridge', '(midsw241)': 'datasci241', '(midsw203)': 'datasci203', #datasci
    '(legal123|legalst123)': 'legalst123', '(legalst190|legal190)': 'legalst190', # legal
    '(es22ac|ethstd22ac)': 'ethstd22ac', '(esc164a|ethstdc164a)': 'ethstdc164a', '(es21ac|ethstd21ac)': 'ethstd21ac',  # ethnic studies
    'cp201b': 'cyplan201b', '(cityplanning88|cp88)': 'cyplan88',
    'ib120': 'integbi120', 'ibc32': 'integbi32', 'ib134l': 'integbi134l',
    'mse104l': 'matsci104l',
    'are212': 'aresec212',
    'educw142': 'educw142',
    '(cogscic131|psych123)': 'cogscic131', 'psych198': 'psych198',
    # was 'anth[ro]?115': a character class allows only ONE of r/o, so it
    # could never match 'anthro115'; the group makes the whole 'ro' optional
    'anth(?:ro)?115': 'anthro115',
    'espmc167': 'espmc167', '(ibespm105)': 'espmc105',
    'ls88': 'ls88',
    'dighum101': 'dighum101', 'dighum160': 'dighum160',
    'plantbi135': 'plantbi135',
    'hist160': 'history160',
    'soc88': 'sociol88', 'sw282': 'socwel282',
    'music30': 'music30', 'artw23ac': 'artw23ac',
}

# hard coded: specific user/repo strings that carry no recognizable course code;
# only consulted when none of the course patterns match
_GIT_CONTENT_USER_PATTERNS = {
    'danielabrahamgit120': 'eleng120',
    'evalencialopezw142': 'educw142',
    'charismasacey[A-Za-z0-9]+cp201': 'cp201a',
}


def course_assigner_regex(row):
    """
    pandas row function; uses apply
    determines which course a github repo/path string belongs to

    row: string (or iterable of strings) describing the repo and path
    returns: the course label of the first matching pattern, trying the course
    table first and the hard-coded user table second, or 'unknown' if none match
    """
    # strips anything thats not a letter or number
    git_string_cleaned = re.sub(r'[^a-zA-Z0-9]', '', ''.join(row)).lower()

    for pattern_table in (_COURSE_PATTERNS, _GIT_CONTENT_USER_PATTERNS):
        for pattern, label in pattern_table.items():
            if re.search(pattern, git_string_cleaned):
                return label
    return 'unknown'
    

In [None]:
# assigns classes/courses to each log by running course_assigner_regex over the
# combined repo/path string; the result is stored in the 'course' column
nbgitpuller_textPayload_df_pull['course'] = nbgitpuller_textPayload_df_pull.git_user_content_path.apply(course_assigner_regex)

In [None]:
def semester_assigner_regex(row):
    """
    pandas row function; uses apply
    returns the semester of the course material if known

    row: string (or iterable of strings) describing the repo and path
    returns: '<season><4-digit year>' (e.g. 'spring2024') for the first
    season/year tag whose year lies between 2018 and the current year,
    otherwise 'unknown' (never None)
    """
    # (pattern, season) pairs, tried in order. The first three are
    # "season then year" (fa23, spring2024), the last three are "year then season"
    # (2023fall). The bracketed parts are character classes, so they accept any
    # run of those letters.
    semester_patterns = [
        (r'fa[ll]*\d{1,4}', 'fall'),
        (r'su[mmer]*\d{1,4}', 'summer'),
        (r'sp[ring]*\d{1,4}', 'spring'),
        # the trailing '*' was missing here, so a bare '2023fa' was never recognized
        (r'\d{1,4}fa[ll]*', 'fall'),
        (r'\d{1,4}su[mmer]*', 'summer'),
        (r'\d{1,4}sp[ring]*', 'spring'),
    ]

    # strips anything thats not a letter or number
    git_string_cleaned = re.sub(r'[^a-zA-Z0-9]', '', ''.join(row)).lower()

    first_year, last_year = 2018, datetime.datetime.now().year

    for pattern, season in semester_patterns:
        # The last occurrence is preferred. An occurrence with an out-of-range
        # year no longer ends the search; the remaining occurrences and
        # patterns are still tried.
        for sem_match in reversed(re.findall(pattern, git_string_cleaned)):
            digits = re.search(r'\d+', sem_match).group()
            season_first = sem_match[-1].isdigit()
            if season_first and len(digits) < 4:
                # short years in the "season then year" form are read as 20YY
                year = int('20' + digits[-2:])
            else:
                year = int(digits)
            if first_year <= year <= last_year:
                return f'{season}{year}'
    return 'unknown'


In [None]:
# assigns a semester to each log by running semester_assigner_regex over the
# combined repo/path string; the result is stored in the 'semester' column
nbgitpuller_textPayload_df_pull['semester'] = nbgitpuller_textPayload_df_pull.git_user_content_path.apply(semester_assigner_regex)

In [None]:
# Combines the date and time fields into a single timestamp, reads it as UTC and
# converts it to US Pacific wall-clock time. Using the America/Los_Angeles zone
# (instead of a fixed 8-hour offset) applies UTC-7 while daylight saving is in
# effect and UTC-8 otherwise. The timezone is then dropped so the column stays
# a plain (naive) datetime column, as before.
timestamp_utc = pd.to_datetime(
    nbgitpuller_textPayload_df_pull.timestamp_date + ' ' + nbgitpuller_textPayload_df_pull.timestamp_time,
    utc=True,
)
nbgitpuller_textPayload_df_pull['timestamp_date_time_pst'] = (
    timestamp_utc.dt.tz_convert('America/Los_Angeles').dt.tz_localize(None)
)

In [None]:
# Labels every row whose git_path contains 'rstudio' with the file_extension
# 'rstudio'; all other rows keep the extension they already have.
# NOTE(review): the previous comment said this applies only to rows whose
# filetype is NaN, but the condition does not look at the current
# file_extension — confirm which behavior is intended.
nbgitpuller_textPayload_df_pull['file_extension'] = nbgitpuller_textPayload_df_pull.apply(lambda x: 'rstudio' if 'rstudio' in x['git_path'] else x['file_extension'], axis = 1)

In [None]:
# determines if the links are github or non-github:
# 'N' when the repo string contains 'github.com', otherwise 'Y'
nbgitpuller_textPayload_df_pull['abnormal'] = nbgitpuller_textPayload_df_pull.repo.apply(lambda x: 'N' if 'github.com' in x else 'Y')

In [None]:
# preview the first rows of the parsed git-pull table
nbgitpuller_textPayload_df_pull.head()

In [None]:
# Partition the parsed git-pull table on its 'abnormal' flag:
# rows flagged 'Y' go to the abnormal frame, rows flagged 'N' to the normal frame.
nbgitpuller_textPayload_df_pull_abnormal = nbgitpuller_textPayload_df_pull.loc[
    nbgitpuller_textPayload_df_pull['abnormal'].eq('Y')
]
nbgitpuller_textPayload_df_pull_normal = nbgitpuller_textPayload_df_pull.loc[
    nbgitpuller_textPayload_df_pull['abnormal'].eq('N')
]

In [None]:
import pickle

In [None]:
# Save the normal (abnormal == 'N') DataFrame to a pickle file in the working
# directory. The file is opened in binary write mode and closed by the `with` block.
# Pickle files should only ever be loaded back from a trusted location.
with open('nbgitpuller_df_serialized.pkl', 'wb') as file:
    pickle.dump(nbgitpuller_textPayload_df_pull_normal, file)

In [None]:
# Save the abnormal (abnormal == 'Y') DataFrame to its own pickle file in the
# working directory. The file is opened in binary write mode and closed by the
# `with` block.
with open('nbgitpuller_df_abnormal_serialized.pkl', 'wb') as file:
    pickle.dump(nbgitpuller_textPayload_df_pull_abnormal, file)