In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import acquire
import prepare
from env import get_db_url
import time
import warnings
warnings.filterwarnings("ignore")
import explore

### Questions to keep in mind (at least 5 should be answered very thoroughly for the MVP; touching on all 8 is optional)

1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?
3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?
4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?
5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?
6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?
7. Which lessons are least accessed?
8. Anything else I should be aware of?

In [None]:
df = acquire.get_log_data()

In [None]:
df.head()

#### initial thoughts
- I want to look into this a little further to make sure we wouldn't be losing something important, but could consider dropping, especially for MVP
- deleted_at column should be dropped
- date, time, start_date, end_date, created_at, updated_at - judging by the names of these columns, these can probably be converted to datetime
- path probably makes sense to stay object, but can we potentially look at groups/feature eng columns here? possibility to keep in mind
- user_id as int - this is fine for now, but make sure it is treated as an identifier and not as a continuous variable
- cohort id- could potentially change the float out with the actual cohort name
- ip - probably good, also potential for feature eng... like if we want to look at different area of users. here is a good site to help https://www.whatismyip.com/ip-address-lookup/
- program_id - fine, but also can change to program name if we'd like

#### Histogram

In [None]:
df.date.head()

In [None]:
df.time.head()

In [None]:
df.start_date.head()

In [None]:
df.end_date.head()

In [None]:
df.created_at.head()

In [None]:
df.updated_at.head()

In [None]:
# Text columns that are parsed into datetimes with one shared format string.
datetime_columns = ['date', 'start_date', 'end_date', 'created_at', 'updated_at']
df[datetime_columns] = df[datetime_columns].apply(
    pd.to_datetime, format='%Y-%m-%d %H:%M:%S.%f'
)

In [None]:
# Parse the clock-time strings into datetimes.
# pd.to_datetime already returns a datetime64 Series, so the former
# element-wise .apply(pd.Timestamp) pass was redundant (and slow on a large
# log) and has been removed; the resulting values are unchanged.
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S')

In [None]:
df.info()

In [None]:
df.info()

In [None]:
df.hist(bins=30, figsize=(20, 15))

In [None]:
df.date.min() , df.date.max()

In [None]:
df.start_date.min() , df.start_date.max()

In [None]:
df.end_date.min() , df.end_date.max()

In [None]:
df.time.head()

In [None]:
df.info()

In [None]:
df['path'].nunique()

In [None]:
# Bar chart of the 30 most frequently requested paths.
df['path'].value_counts().head(30).plot.bar()

In [None]:
df.ip.nunique()

In [None]:
# Bar chart of the 30 IP addresses with the most log records.
df['ip'].value_counts().head(30).plot.bar()

Note: the top IP address stands out, and so does the second highest. Maybe the instructors often share one while working from the same computer? Or it could be scraping. Keep this in mind.

In [None]:
df.name.nunique()

In [None]:
# Bar chart of record counts for the 47 most frequent cohort names.
top_names = df['name'].value_counts().head(47)
top_names.plot.bar(figsize=(12, 5))
plt.show()

In [None]:
df['slack'].value_counts().plot(kind='bar', figsize=(12,5))

In [None]:
# Remove the leading channel marker from the slack names.
# Slicing off the first character unconditionally also truncated values that
# had no marker (e.g. 'staff' -> 'taff') and chopped another character every
# time the cell was re-run. Stripping only a leading '#' is safe to repeat.
# NOTE(review): assumes the prefix being removed is '#' - confirm against the
# raw values shown by the value_counts plot.
df['slack'] = df['slack'].str.lstrip('#')

In [None]:
df.slack.value_counts()

In [None]:
# Restore the 'staff' label wherever it appears truncated as 'taff';
# every other slack value is left unchanged.
df['slack'] = df.slack.replace({'taff': 'staff'})

In [None]:
df['slack'].value_counts()

In [None]:
df['slack'].groupby(df.name).max()

In [None]:
df.cohort_id.nunique()

In [None]:
df['slack'].nunique(), df.name.nunique()

This is the one pairing of `name` and `slack` that does not line up:

Bayes              staff

It should be fine to use just `name`, since the two columns are basically the same, except that `name` has both Bayes and Staff.

summary so far-
- datatypes for date time
- drop slack
- drop delete column
- drop the null values 
- should be about 850k records
- look at dtypes closely and make changes where desired (maybe name instead of number for a couple)
- maybe change name to cohort, lower case
- cohort id corresponds to name. Do you need both? Probably OK to keep both just in case
- keep the couple very high counts ip addresses in mind
- talk with group about what they found and put in function


### Questions to keep in mind (at least 5 should be answered very thoroughly for the MVP; touching on all 8 is optional)

1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?
3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?
4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?
5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?
6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?
7. Which lessons are least accessed?
8. Anything else I should be aware of?

In [None]:
df.info()

## Explore: Question 3 

### Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

### Initial thoughts about question:
- First, we are looking only at active students for this question. That means I am creating a df that has records where the date (date of access) falls between (or on) the start date and end date
- what could we find out about the students? 
    - what is "low access"- look at numbers here and %s. make some bins of average, low, very low, etc.
    - what cohort they are in? do students in certain cohorts tend to access a lot less? what about program?
    - could very low access be due to students dropping out of the program possibly?
    - do students access more, less, same early on? near graduation?
    - do the low access students have ip addresses local to SA, not local? appear to be remote or not? covid 19 impact on access?
    - else?

In [3]:
df = prepare.prepare_logs()

In [4]:
df3 = explore.explore_question3(df)

In [None]:
df3.program_id.value_counts()

In [None]:
# Stray keyboard input ("afgavav") removed: it was an undefined name and
# raised NameError whenever this cell ran.
# NOTE(review): if it was left here on purpose to halt "Run All" at this
# point, replace this comment with an explicit stop instead.

In [None]:
# Rename columns just for preference.
df3 = df3.rename(columns={'program_id': 'program', 'name': 'cohort'})

# Drop nulls: records without sufficient data about student access.
df3 = df3.dropna()

# Boolean column marking accesses made between (or on) the cohort start and
# end dates. The upper bound previously compared against `df.end_date`, a
# column from a different frame than the one being filtered; both bounds now
# come from df3 itself.
# NOTE(review): the lower bound uses `date_time` and the upper bound uses
# `date` - presumably so that any access on the end date still counts;
# confirm both columns exist on this frame.
df3['program_access'] = (df3.date_time >= df3.start_date) & (df3.date <= df3.end_date)

# Keep only active students that are not staff.
df3 = df3[(df3['program_access'] == True) & (df3.staff == False)]

In [None]:
df3.info()

In [None]:
# An earlier cell renames 'program_id' to 'program' on df3, so the column has
# to be addressed by its new name here ('program_id' raises KeyError).
# Cast to object so the program code is handled as a label rather than a number.
df3['program'] = df3['program'].astype('object')

In [None]:
# Combined cohort + program label.
# The columns are addressed by the names they carry after the earlier rename
# cell ('name' -> 'cohort', 'program_id' -> 'program'). astype(str) is applied
# because an object cast leaves numeric program codes as numbers, and adding
# those to the cohort strings raises TypeError.
df3['cohort_program'] = df3['cohort'] + df3['program'].astype(str)

In [None]:
# Number of access records per cohort.
# Check to make sure no staff are included and to see the lowest access.
# df3's 'name' column is renamed to 'cohort' in an earlier cell, so `df3.name`
# no longer resolves to a column; use the new column name.
cohort_counts = df3['cohort'].value_counts()
cohort_counts

Notes: this gives an overview, but to be fair, it doesn't mean much without comparing it to the number of students in each cohort. Looking at that next...

In [None]:
# Number of distinct users per cohort.
# Grouped by 'cohort', the name df3's former 'name' column carries after the
# earlier rename cell, so the result lines up with cohort_counts by label.
user_counts = df3.groupby('cohort')['user_id'].nunique()
user_counts

In [None]:
# Average number of access records per active student, by cohort.
# This is only a cohort-level overview; it does not yet single out the
# individual students with low access.
(cohort_counts / user_counts).round().sort_values()

In [None]:
# Bar chart of average access per active student, by cohort (ascending).
# `hue=` and `data=` are seaborn keywords; Series.plot forwards unknown
# keywords to matplotlib, which rejects them, so they are dropped here.
round(cohort_counts/user_counts).sort_values().plot(kind='bar')

In [None]:
# Drop the deleted_at column. errors='ignore' keeps the cell re-runnable:
# without it a second run (or a frame where the column is already gone)
# raises KeyError.
df = df.drop(columns='deleted_at', errors='ignore')

In [None]:
df= df.dropna()

In [None]:
# Flag records whose access date falls between (or on) the start and end dates.
# The mask used to be wrapped in a throwaway list and never stored, while the
# next cell reads df['program_access']; store it under that column name.
df['program_access'] = (df.date >= df.start_date) & (df.date <= df.end_date)

In [None]:
df['program_access'].value_counts()

In [None]:
# Boolean flag: True where the record's cohort name is exactly 'Staff'.
df['staff'] = df['name'].eq('Staff')

In [None]:
df.staff.value_counts()

In [None]:
# `active` was used here without being defined anywhere in the notebook,
# which raises NameError on a clean run. Build it from the two flags computed
# on df, using the same filter the df3 cell applies: records accessed during
# the program by non-staff users.
# NOTE(review): confirm this matches the definition originally intended.
active = df[(df['program_access'] == True) & (df['staff'] == False)]
active.info()

In [None]:
active.user_id.value_counts().tail(20)

In [None]:
# Records per user among the active rows.
counts = active['user_id'].value_counts()
# Users with fewer than 100 records, then every row belonging to those users.
low_access_user_ids = counts.index[counts < 100]
low = active[active['user_id'].isin(low_access_user_ids)]

In [None]:
low.info()

In [None]:
df[df.created_at != df.updated_at].groupby(df.name).min()

In [None]:
len(df)

In [None]:
# Count of low-access records per cohort name, coloured by program.
plt.figure(figsize=(15, 8))
# Name the x column by keyword: passing the data vector positionally is
# deprecated in seaborn 0.11 and rejected by later releases.
sns.countplot(x='name', hue='program_id', data=low)
plt.show()