In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns

from env import host, user, password

In [2]:
def get_connection(db, username=user, host=host, password=password):
    '''
    Build the SQLAlchemy connection URL for a MySQL database (pymysql driver).

    Parameters
    ----------
    db : str
        Name of the database (schema) to connect to.
    username, host, password : str
        Connection details; they default to the values imported from ``env``.

    Returns
    -------
    str
        URL of the form ``mysql+pymysql://<username>:<password>@<host>/<db>``.

    The username and password are percent-encoded so that reserved
    characters in them (``@``, ``:``, ``/``, ``#``, ...) cannot be mistaken
    for URL delimiters. Credentials made only of letters, digits and
    ``_.-~`` are left unchanged.
    '''
    from urllib.parse import quote

    # safe='' also escapes '/', which quote() leaves untouched by default.
    username = quote(str(username), safe='')
    password = quote(str(password), safe='')
    return f'mysql+pymysql://{username}:{password}@{host}/{db}'

    
def new_log_data():
    '''
    Query the curriculum_logs database and return every row of ``logs``,
    left-joined to its matching ``cohorts`` row, as a DataFrame.
    '''
    connection_url = get_connection('curriculum_logs')
    query = '''  
    SELECT *
    FROM logs
    LEFT OUTER JOIN cohorts
    ON cohorts.id = logs.cohort_id;
    '''
    return pd.read_sql(query, connection_url)

In [3]:
# Pull the joined logs/cohorts table from the database into memory.
df = new_log_data()

In [4]:
# Display the joined frame; the notebook renders a truncated preview
# (first and last rows) rather than every row.
df

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900218,2021-04-21,16:41:51,jquery/personal-site,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900219,2021-04-21,16:42:02,jquery/mapbox-api,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900220,2021-04-21,16:42:09,jquery/ajax/weather-map,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900221,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,28.0,24.160.137.86,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0


In [5]:
# Number of log rows per program_id. value_counts() leaves out missing values
# by default, so rows without a program_id are not represented in these counts.
df.program_id.value_counts()

2.0    713365
3.0    103412
1.0     30548
4.0         5
Name: program_id, dtype: int64

In [12]:
# Attach a readable label to each program_id; ids outside the mapping
# (including missing ones) are labelled 'Other'.
program_labels = {1: 'php', 2: 'java', 3: 'ds', 4: 'fe'}
df['program'] = np.select(
    [df.program_id == program_id for program_id in program_labels],
    list(program_labels.values()),
    default='Other',
)

In [13]:
# One sub-frame per program_id, used by the per-program path summaries below.
program1 = df.loc[df.program_id == 1]
program2 = df.loc[df.program_id == 2]
program3 = df.loc[df.program_id == 3]
program4 = df.loc[df.program_id == 4]

In [61]:
# Three most-requested paths among program_id 1 rows, as a fraction of those rows.
# The ascending sort before nlargest() can only affect how ties are ordered.
(
    program1.path
    .value_counts(normalize=True)
    .sort_values()
    .nlargest(3)
)

/               0.055028
index.html      0.033095
javascript-i    0.024093
Name: path, dtype: float64

In [60]:
# Three most-requested paths among program_id 2 rows, as a fraction of those rows.
# The ascending sort before nlargest() can only affect how ties are ordered.
(
    program2.path
    .value_counts(normalize=True)
    .sort_values()
    .nlargest(3)
)

/               0.050204
javascript-i    0.024471
toc             0.024431
Name: path, dtype: float64

In [62]:
# Three most-requested paths among program_id 3 rows, as a fraction of those rows.
# The ascending sort before nlargest() can only affect how ties are ordered.
(
    program3.path
    .value_counts(normalize=True)
    .sort_values()
    .nlargest(3)
)

/                           0.080823
search/search_index.json    0.021303
classification/overview     0.017261
Name: path, dtype: float64

In [63]:
# Three most-requested paths among program_id 4 rows, as a fraction of those rows.
# The ascending sort before nlargest() can only affect how ties are ordered.
(
    program4.path
    .value_counts(normalize=True)
    .sort_values()
    .nlargest(3)
)

content/html-css                               0.4
/                                              0.2
content/html-css/gitbook/images/favicon.ico    0.2
Name: path, dtype: float64

In [69]:
# Group the program_id 3 rows by the cohort `name` column so the following
# cells can summarise each cohort separately.
d = program3.groupby(['name'])

In [70]:
# Show the first rows of every cohort group (GroupBy.head() defaults to 5 per group).
d.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
326053,2019-08-20,09:39:58,/,466,34.0,97.105.19.58,34.0,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3.0,ds
326054,2019-08-20,09:39:59,/,467,34.0,97.105.19.58,34.0,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3.0,ds
326055,2019-08-20,09:39:59,/,468,34.0,97.105.19.58,34.0,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3.0,ds
326056,2019-08-20,09:40:02,/,469,34.0,97.105.19.58,34.0,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3.0,ds
326057,2019-08-20,09:40:08,/,470,34.0,97.105.19.58,34.0,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3.0,ds
445311,2020-02-03,15:39:34,login,575,55.0,97.105.19.58,55.0,Curie,#curie,2020-02-03,2020-07-07,2020-02-03 19:31:51,2020-02-03 19:31:51,,3.0,ds
445314,2020-02-03,15:39:35,/,576,55.0,97.105.19.58,55.0,Curie,#curie,2020-02-03,2020-07-07,2020-02-03 19:31:51,2020-02-03 19:31:51,,3.0,ds
445315,2020-02-03,15:39:37,/,577,55.0,97.105.19.58,55.0,Curie,#curie,2020-02-03,2020-07-07,2020-02-03 19:31:51,2020-02-03 19:31:51,,3.0,ds
445316,2020-02-03,15:39:37,login,575,55.0,97.105.19.58,55.0,Curie,#curie,2020-02-03,2020-07-07,2020-02-03 19:31:51,2020-02-03 19:31:51,,3.0,ds
445317,2020-02-03,15:39:43,/,578,55.0,97.105.19.58,55.0,Curie,#curie,2020-02-03,2020-07-07,2020-02-03 19:31:51,2020-02-03 19:31:51,,3.0,ds


In [84]:
# For each cohort: the fraction of its requests that went to each path,
# ordered from the largest share to the smallest.
d2 = d.apply(
    lambda cohort_rows: cohort_rows.path
    .value_counts(normalize=True)
    .sort_values(ascending=False)
)

In [85]:
# Display the per-cohort path shares (indexed by cohort name, then path).
d2

name                                                
Bayes     /                                             0.074120
          1-fundamentals/modern-data-scientist.jpg      0.024493
          1-fundamentals/AI-ML-DL-timeline.jpg          0.024418
          1-fundamentals/1.1-intro-to-data-science      0.024116
          search/search_index.json                      0.022157
                                                          ...   
Florence  fundamentals/spreadsheets-overview            0.000117
          environment-setup                             0.000117
          joins                                         0.000117
          distributed-ml/spark-execution-diagram.svg    0.000117
          ml-methodologies-drawing.jpg                  0.000117
Name: path, Length: 1763, dtype: float64

In [91]:
# Keep the first five entries for every cohort; d2 is already ordered by
# descending share within each cohort, so these are its five top paths.
d2.groupby(level='name').head(5)

name                                                                   
Bayes     /                                                                0.074120
          1-fundamentals/modern-data-scientist.jpg                         0.024493
          1-fundamentals/AI-ML-DL-timeline.jpg                             0.024418
          1-fundamentals/1.1-intro-to-data-science                         0.024116
          search/search_index.json                                         0.022157
Curie     /                                                                0.079329
          6-regression/1-overview                                          0.027571
          search/search_index.json                                         0.024929
          1-fundamentals/modern-data-scientist.jpg                         0.021639
          1-fundamentals/AI-ML-DL-timeline.jpg                             0.021547
Darden    /                                                                0.093081
    

In [None]:
sns.set_style('darkgrid')

# The draft of this cell left `x` unassigned (a SyntaxError) and plotted a
# placeholder y = [1, 5, 3].
# NOTE(review): the bars now show the three most-requested paths of the
# program_id 3 subset; confirm this is the chart that was intended.
top_paths = program3.path.value_counts(normalize=True).nlargest(3)
x = top_paths.index.tolist()
y = top_paths.tolist()

# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
ax = sns.barplot(x=x, y=y)
ax.set(xlabel='path', ylabel='share of requests',
       title='Most requested paths, program_id 3')
plt.show()