In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.animation as animation
%matplotlib notebook

# CASE STUDY RUANGGURU

In [102]:
data = pd.read_csv('Educontent Data Associate - Case Study.csv')
data.columns

Index(['Question ID', 'Question Created At', 'Subject', 'Class', 'Email',
       'PackageType', 'Created vs answered hour diff'],
      dtype='object')

In [103]:
data['Question Created At'] = pd.to_datetime(data['Question Created At'])
data = data.sort_values(by='Question Created At', axis=0)
data.insert(loc=2,
            column='Hour',
            value=data['Question Created At'].dt.hour)
data.insert(loc=3,
            column='Day',
            value=data['Question Created At'].dt.day_name())
data

Unnamed: 0,Question ID,Question Created At,Hour,Day,Subject,Class,Email,PackageType,Created vs answered hour diff
366,AAAAAXZHMB1E,2021-07-19 02:29:03,2,Monday,Matematika,9 SMP,xxxxdisdzz@gmail.com,REGULAR,3
2286,AAAAA0P6CMA5,2021-07-19 07:09:36,7,Monday,Kimia,11 SMA IPA,xxxxwulandari117@gmail.com,PREMIUM,1
1407,AAAAAYELT3N4,2021-07-19 07:17:46,7,Monday,Matematika,5 SD,xxxxitriismail9@gmail.com,PREMIUM,6
2411,AAAAAEGAVJ27,2021-07-19 07:24:48,7,Monday,Kimia,IPA,xxxxsabriana@gmail.com,PREMIUM,0
607,AAAAACG47V7N,2021-07-19 07:25:51,7,Monday,Matematika,11 SMA IPA,xxxxiafransisca00@gmail.com,PREMIUM,1
...,...,...,...,...,...,...,...,...,...
820,AAAAAINR6TYD,2021-08-02 08:58:32,8,Monday,Bahasa Inggris,10 SMA IPA,xxxxaulidayunus@gmail.com,REGULAR,6
1179,AAAAAG2N7D3I,2021-08-02 08:58:54,8,Monday,Bahasa Inggris,10 SMA IPA,xxxxaulidayunus@gmail.com,REGULAR,6
2477,AAAAA39LYOLI,2021-08-02 09:02:13,9,Monday,Bahasa Indonesia,9 SMP,xxxxirahmawati21200704@gmail.com,ELITE,0
1192,AAAAALZ97FR5,2021-08-02 09:02:53,9,Monday,IPA Terpadu,6 SD,xxxxatamimimayla@gmail.com,REGULAR,1


In [104]:
print(f'''
Class --> {data['Class'].unique()}
\nSubject --> {data['Subject'].unique()}
\nPackage --> {data['PackageType'].unique()}
\nTotal Student --> {len(data['Email'].unique())}
''')


Class --> ['9 SMP' '11 SMA IPA' '5 SD' 'IPA' '10 SMA IPA' 'IPS' '4 SD' '7 SMP'
 '3 SD' '8 SMP' '11 SMA IPS' '6 SD' '10 SMA IPS' nan '12 SMA IPS'
 '12 SMA IPA']

Subject --> ['Matematika' 'Kimia' 'Bahasa Inggris' 'Biologi' 'Bahasa Indonesia'
 'Ekonomi' 'IPS Terpadu' 'Fisika' 'Sejarah' 'IPA Terpadu' 'Geografi'
 'Sosiologi']

Package --> ['REGULAR' 'PREMIUM' 'ELITE']

Total Student --> 1246



## Hypothetical Insight

In [105]:
# TIME DEPENDENT
# Rerata pengajar lewat deadline per minggu
# Hari apa aja yg pengajarnya lewat deadline, masing2 brp persentase lewat deadlinenya
# Rerata jumlah pertanyaan per pelajar per minggu nya
# Jam sibuk & Jam yang kurang pengajar
# Tingkat kesibukan per hari per bulan per Subject (line chart) + warna merah utk yg lewat deadline

# SUBJECT DEPENDENT
# Subject yang paling banyak ditanyakan
# Subject yang paling banyak dijawab lewat deadline
# Subject VS Package

# CLASS DEPENDENT
# Class VS Package

# STUDENT DEPENDENT
# Murid yang banyak dikacangin

# TEACHER DEPENDENT
# Total respon yang lewat deadline, persentase dari keseluruhan

# FUN FACT!!!
# Murid yang aktif di tiap Subject
# Data jumlah penanya dari tiap jenjang
# Penanya terbanyak
# Package paling aktif dan yang kureng


# TAMBAHAN
# Kita masih bisa liat perbedaan antar Package, tp disini Elite & Premium dianggap sama

In [106]:
null_data = data[data['Class'].isnull()]

## Querying Time!! 

TEACHER DEPENDENT

Index(['Question ID', 'Question Created At', 'Subject', 'Class', 'Email',
       'PackageType', 'Created vs answered hour diff'],
      dtype='object')

In [107]:
# Total respon yang lewat deadline, persentase dari keseluruhan
premium_thr = 24  # 24 hour threshold
regular_thr = 48  # 48 hour threshold

def late_response(package):
    resp = data[data['PackageType'] == package]
    if package == 'PREMIUM' or package == 'ELITE':
        late = resp[resp['Created vs answered hour diff'] >= 24]
        print(f'Failed {package} service --> {len(late)} data')
        return len(late)
    else:
        late = resp[resp['Created vs answered hour diff'] >= 48]
        print(f'Failed {package} service --> {len(late)} data')
        return len(late)
        

# ELITE
el = late_response('ELITE')
# PREMIUM
pr = late_response('PREMIUM')
# REGULAR
re = late_response('REGULAR')

# TOTAL
tot = el + pr + re
print(f'\nTotal service failure --> {tot}')
# Percentage
rate = tot / len(data) * 100
print(f'Rate of service failure --> {round(rate, 2)}%')

Failed ELITE service --> 6 data
Failed PREMIUM service --> 429 data
Failed REGULAR service --> 229 data

Total service failure --> 664
Rate of service failure --> 20.21%


TIME DEPENDENT

Index(['Question ID', 'Question Created At', 'Subject', 'Class', 'Email',
       'PackageType', 'Created vs answered hour diff'],
      dtype='object')

In [108]:
data = data.reset_index(drop=True)
data

Unnamed: 0,Question ID,Question Created At,Hour,Day,Subject,Class,Email,PackageType,Created vs answered hour diff
0,AAAAAXZHMB1E,2021-07-19 02:29:03,2,Monday,Matematika,9 SMP,xxxxdisdzz@gmail.com,REGULAR,3
1,AAAAA0P6CMA5,2021-07-19 07:09:36,7,Monday,Kimia,11 SMA IPA,xxxxwulandari117@gmail.com,PREMIUM,1
2,AAAAAYELT3N4,2021-07-19 07:17:46,7,Monday,Matematika,5 SD,xxxxitriismail9@gmail.com,PREMIUM,6
3,AAAAAEGAVJ27,2021-07-19 07:24:48,7,Monday,Kimia,IPA,xxxxsabriana@gmail.com,PREMIUM,0
4,AAAAACG47V7N,2021-07-19 07:25:51,7,Monday,Matematika,11 SMA IPA,xxxxiafransisca00@gmail.com,PREMIUM,1
...,...,...,...,...,...,...,...,...,...
3280,AAAAAINR6TYD,2021-08-02 08:58:32,8,Monday,Bahasa Inggris,10 SMA IPA,xxxxaulidayunus@gmail.com,REGULAR,6
3281,AAAAAG2N7D3I,2021-08-02 08:58:54,8,Monday,Bahasa Inggris,10 SMA IPA,xxxxaulidayunus@gmail.com,REGULAR,6
3282,AAAAA39LYOLI,2021-08-02 09:02:13,9,Monday,Bahasa Indonesia,9 SMP,xxxxirahmawati21200704@gmail.com,ELITE,0
3283,AAAAALZ97FR5,2021-08-02 09:02:53,9,Monday,IPA Terpadu,6 SD,xxxxatamimimayla@gmail.com,REGULAR,1


In [109]:
# Rerata pengajar lewat deadline per minggu
week_list = []
week = 1
for i in data.index:
    week_list.append(week)
    if data.iloc[i]['Day'] == 'Sunday' and data.iloc[i+1]['Day'] == 'Monday':
        week += 1
data.insert(loc = 1,
            column = 'Week',
            value = week_list)

In [110]:
# Late data per week
def late_data(package, x):
    per_week = data[data['Week'] == x]
    resp = per_week[per_week['PackageType'] == package]
    if package == 'PREMIUM' or package == 'ELITE':
        late = resp[resp['Created vs answered hour diff'] >= 24]
        return len(late)
    else:
        late = resp[resp['Created vs answered hour diff'] >= 48]
        return len(late)


# Week 1
week1 = (late_data('ELITE', 1) + 
         late_data('PREMIUM', 1) + 
         late_data('REGULAR', 1)) / len(data[data['Week'] == 1])
# Week 2
week2 = (late_data('ELITE', 2) + 
         late_data('PREMIUM', 2) + 
         late_data('REGULAR', 2)) / len(data[data['Week'] == 2])
# Week 3
week3 = (late_data('ELITE', 3) + 
         late_data('PREMIUM', 3) + 
         late_data('REGULAR', 3)) / len(data[data['Week'] == 3])

# Average
avg_late_per_week = (week1 + week2 + week3) / 3 * 100
print(f'Fail rate per week --> {round(avg_late_per_week, 2)}%')

Fail rate per week --> 19.36%


In [111]:
data

Unnamed: 0,Question ID,Week,Question Created At,Hour,Day,Subject,Class,Email,PackageType,Created vs answered hour diff
0,AAAAAXZHMB1E,1,2021-07-19 02:29:03,2,Monday,Matematika,9 SMP,xxxxdisdzz@gmail.com,REGULAR,3
1,AAAAA0P6CMA5,1,2021-07-19 07:09:36,7,Monday,Kimia,11 SMA IPA,xxxxwulandari117@gmail.com,PREMIUM,1
2,AAAAAYELT3N4,1,2021-07-19 07:17:46,7,Monday,Matematika,5 SD,xxxxitriismail9@gmail.com,PREMIUM,6
3,AAAAAEGAVJ27,1,2021-07-19 07:24:48,7,Monday,Kimia,IPA,xxxxsabriana@gmail.com,PREMIUM,0
4,AAAAACG47V7N,1,2021-07-19 07:25:51,7,Monday,Matematika,11 SMA IPA,xxxxiafransisca00@gmail.com,PREMIUM,1
...,...,...,...,...,...,...,...,...,...,...
3280,AAAAAINR6TYD,3,2021-08-02 08:58:32,8,Monday,Bahasa Inggris,10 SMA IPA,xxxxaulidayunus@gmail.com,REGULAR,6
3281,AAAAAG2N7D3I,3,2021-08-02 08:58:54,8,Monday,Bahasa Inggris,10 SMA IPA,xxxxaulidayunus@gmail.com,REGULAR,6
3282,AAAAA39LYOLI,3,2021-08-02 09:02:13,9,Monday,Bahasa Indonesia,9 SMP,xxxxirahmawati21200704@gmail.com,ELITE,0
3283,AAAAALZ97FR5,3,2021-08-02 09:02:53,9,Monday,IPA Terpadu,6 SD,xxxxatamimimayla@gmail.com,REGULAR,1


In [112]:
# Hari apa aja yg pengajarnya lewat deadline, masing2 brp persentase lewat deadlinenya
def late_df(package):
    resp = data[data['PackageType'] == package]
    if package == 'PREMIUM' or package == 'ELITE':
        late = resp[resp['Created vs answered hour diff'] >= 24]
        return late
    else:
        late = resp[resp['Created vs answered hour diff'] >= 48]
        return late

    
# ELITE
el_late_df = late_df('ELITE')
# PREMIUM
pr_late_df = late_df('PREMIUM')
# REGULAR
re_late_df = late_df('REGULAR')

# Day with late responses
late_response = el_late_df.append(pr_late_df)
late_response = late_response.append(re_late_df)
late_response = late_response.reset_index(drop=True)
late_response['Day'].unique()

array(['Thursday', 'Friday', 'Saturday', 'Monday', 'Tuesday', 'Wednesday',
       'Sunday'], dtype=object)

In [113]:
for r in data['Day'].unique():
    print(f'{r} late response --> {len(late_response[late_response["Day"] == r])}')
    print(len(late_response[late_response["Day"] == r]) /
          len(data[data['Day'] == r]) * 100, '%')

Monday late response --> 27
4.972375690607735 %
Tuesday late response --> 31
6.695464362850973 %
Wednesday late response --> 66
12.571428571428573 %
Thursday late response --> 169
25.375375375375377 %
Friday late response --> 210
38.60294117647059 %
Saturday late response --> 130
44.827586206896555 %
Sunday late response --> 31
12.204724409448819 %


In [114]:
for r in data['Day'].unique():
    print(f'{r} response --> {len(data[data["Day"] == r])}')

Monday response --> 543
Tuesday response --> 463
Wednesday response --> 525
Thursday response --> 666
Friday response --> 544
Saturday response --> 290
Sunday response --> 254


In [115]:
# Rerata jumlah pertanyaan per pelajar per minggu nya
import math
weekly_qna_student = []
weekly_qna = []
for s in data['Email'].unique():
    for w in range(1, 4):
        which_week = data[data['Week'] == w]
        weekly_qna.append(len(which_week[which_week['Email'] == s]))
    weekly_qna_student.append(math.ceil(np.mean(weekly_qna)))
    weekly_qna = []
print(len(weekly_qna_student))
print(f'Weekly question per student --> {np.mean(weekly_qna_student)}')

1246
Weekly question per student --> 1.377207062600321


In [116]:
# Jam sibuk & Jam yang kurang pengajar
# Jam Sibuk
active_hour = data['Hour'].unique()
active_hour.sort()
for h in active_hour:
    load_per_hour = len(data[data['Hour'] == h])
    print(f'At {h} --> {load_per_hour} questions')

At 0 --> 12 questions
At 1 --> 1 questions
At 2 --> 1 questions
At 3 --> 4 questions
At 4 --> 7 questions
At 5 --> 33 questions
At 6 --> 46 questions
At 7 --> 168 questions
At 8 --> 284 questions
At 9 --> 339 questions
At 10 --> 306 questions
At 11 --> 254 questions
At 12 --> 246 questions
At 13 --> 212 questions
At 14 --> 165 questions
At 15 --> 149 questions
At 16 --> 122 questions
At 17 --> 96 questions
At 18 --> 131 questions
At 19 --> 170 questions
At 20 --> 201 questions
At 21 --> 192 questions
At 22 --> 96 questions
At 23 --> 50 questions


In [117]:
# Jam yang kurang pengajar
chaos_hour = late_response['Hour'].unique()
chaos_hour.sort()
for h in chaos_hour:
    chaos_load = len(late_response[late_response['Hour'] == h])
    tot_load = len(data[data['Hour'] == h])
    print(f'At {h} --> {chaos_load} questions || {round(chaos_load / tot_load * 100, 2)}%')

At 0 --> 5 questions || 41.67%
At 4 --> 2 questions || 28.57%
At 5 --> 3 questions || 9.09%
At 6 --> 4 questions || 8.7%
At 7 --> 31 questions || 18.45%
At 8 --> 57 questions || 20.07%
At 9 --> 68 questions || 20.06%
At 10 --> 57 questions || 18.63%
At 11 --> 55 questions || 21.65%
At 12 --> 54 questions || 21.95%
At 13 --> 50 questions || 23.58%
At 14 --> 31 questions || 18.79%
At 15 --> 35 questions || 23.49%
At 16 --> 24 questions || 19.67%
At 17 --> 26 questions || 27.08%
At 18 --> 24 questions || 18.32%
At 19 --> 30 questions || 17.65%
At 20 --> 45 questions || 22.39%
At 21 --> 34 questions || 17.71%
At 22 --> 20 questions || 20.83%
At 23 --> 9 questions || 18.0%


In [118]:
# Tingkat kesibukan per hari per minggu per Subject (line chart) + warna merah utk yg lewat deadline
data.insert(loc=3,
            column='Date',
            value=data['Question Created At'].dt.date)

In [119]:
load_daily = []
for d in data['Date'].unique():
    daily = data[data['Date'] == d]
    load_daily.append(len(daily))
load_daily

[161, 67, 191, 253, 253, 123, 110, 337, 396, 334, 413, 291, 167, 144, 45]

In [120]:
sub_load = {}
load_list = []
for s in data['Subject'].unique():
    sub_data = data[data['Subject'] == s]
    for d in data['Date'].unique():
        sub_daily = sub_data[sub_data['Date'] == d]
        load_list.append(len(sub_daily))
    sub_load[s] = load_list
    load_list = []
sub_load

{'Matematika': [59,
  34,
  85,
  103,
  92,
  43,
  39,
  146,
  170,
  135,
  158,
  125,
  75,
  45,
  7],
 'Kimia': [20, 11, 18, 28, 37, 23, 13, 37, 48, 59, 60, 38, 19, 19, 8],
 'Bahasa Inggris': [20, 4, 8, 13, 11, 9, 4, 10, 23, 16, 39, 23, 4, 31, 9],
 'Biologi': [14, 8, 33, 30, 32, 7, 21, 51, 48, 33, 38, 22, 17, 15, 4],
 'Bahasa Indonesia': [6, 2, 9, 20, 13, 8, 3, 30, 16, 18, 15, 20, 13, 6, 3],
 'Ekonomi': [3, 0, 1, 4, 5, 1, 2, 8, 14, 3, 5, 6, 0, 1, 2],
 'IPS Terpadu': [3, 0, 1, 1, 4, 0, 1, 1, 1, 0, 3, 0, 1, 0, 0],
 'Fisika': [27, 3, 26, 30, 34, 19, 21, 39, 49, 46, 65, 39, 23, 19, 8],
 'Sejarah': [5, 0, 4, 2, 12, 9, 1, 4, 6, 12, 16, 4, 5, 5, 2],
 'IPA Terpadu': [2, 0, 3, 3, 5, 0, 2, 3, 11, 7, 2, 4, 3, 0, 1],
 'Geografi': [2, 5, 3, 17, 8, 3, 1, 6, 9, 4, 11, 6, 6, 2, 1],
 'Sosiologi': [0, 0, 0, 2, 0, 1, 2, 2, 1, 1, 1, 4, 1, 1, 0]}

SUBJECT DEPENDENT

Index(['Question ID', 'Question Created At', 'Subject', 'Class', 'Email',
       'PackageType', 'Created vs answered hour diff'],
      dtype='object')

In [121]:
# Subject yang paling banyak ditanyakan
sub_tot = {}
for s in data['Subject'].unique():
    sub_df = data[data['Subject'] == s]
    sub_tot[s] = len(sub_df)
dict(sorted(sub_tot.items(),
            key=lambda item: item[1],
            reverse=True))

{'Matematika': 1316,
 'Fisika': 448,
 'Kimia': 438,
 'Biologi': 373,
 'Bahasa Inggris': 224,
 'Bahasa Indonesia': 182,
 'Sejarah': 87,
 'Geografi': 84,
 'Ekonomi': 55,
 'IPA Terpadu': 46,
 'IPS Terpadu': 16,
 'Sosiologi': 16}

In [122]:
# Subject yang paling banyak dijawab lewat deadline
sub_late = {}
for s in data['Subject'].unique():
    sublate_df = late_response[late_response['Subject'] == s]
    sub_late[s] = len(sublate_df)
dict(sorted(sub_late.items(),
            key=lambda item: item[1],
            reverse=True))

{'Matematika': 423,
 'Fisika': 124,
 'Sejarah': 27,
 'Bahasa Inggris': 20,
 'Kimia': 19,
 'Bahasa Indonesia': 19,
 'Geografi': 12,
 'IPS Terpadu': 5,
 'Biologi': 4,
 'IPA Terpadu': 4,
 'Sosiologi': 4,
 'Ekonomi': 3}

In [123]:
# Subject VS Package
sp = {}
for p in data['PackageType'].unique():
    print(p)
    sub_in_pack = data[data['PackageType'] == p]
    for s in data['Subject'].unique():
        sub_p = sub_in_pack[sub_in_pack['Subject'] == s]
        sp[s] = len(sub_p)
    sp = dict(sorted(sp.items(),
                     key=lambda item: item[1],
                     reverse=True))
    print(sp)
    sp = {}

REGULAR
{'Matematika': 708, 'Biologi': 254, 'Fisika': 240, 'Kimia': 228, 'Bahasa Inggris': 168, 'Bahasa Indonesia': 103, 'Sejarah': 50, 'Geografi': 41, 'Ekonomi': 32, 'IPA Terpadu': 26, 'IPS Terpadu': 7, 'Sosiologi': 7}
PREMIUM
{'Matematika': 589, 'Kimia': 205, 'Fisika': 204, 'Biologi': 115, 'Bahasa Indonesia': 76, 'Bahasa Inggris': 54, 'Geografi': 42, 'Sejarah': 37, 'Ekonomi': 23, 'IPA Terpadu': 20, 'IPS Terpadu': 9, 'Sosiologi': 9}
ELITE
{'Matematika': 19, 'Kimia': 5, 'Biologi': 4, 'Fisika': 4, 'Bahasa Indonesia': 3, 'Bahasa Inggris': 2, 'Geografi': 1, 'Ekonomi': 0, 'IPS Terpadu': 0, 'Sejarah': 0, 'IPA Terpadu': 0, 'Sosiologi': 0}


CLASS DEPENDENT

Index(['Question ID', 'Question Created At', 'Subject', 'Class', 'Email',
       'PackageType', 'Created vs answered hour diff'],
      dtype='object')

In [126]:
# Class VS Package
for c in data['Class'].unique():
    class_list = data[data['Class'] == c]['PackageType']
    print(f'{c} --> mostly {class_list.mode()}')

9 SMP --> mostly 0    REGULAR
dtype: object
11 SMA IPA --> mostly 0    REGULAR
dtype: object
5 SD --> mostly 0    REGULAR
dtype: object
IPA --> mostly 0    PREMIUM
dtype: object
10 SMA IPA --> mostly 0    REGULAR
dtype: object
IPS --> mostly 0    REGULAR
dtype: object
4 SD --> mostly 0    REGULAR
dtype: object
7 SMP --> mostly 0    REGULAR
dtype: object
3 SD --> mostly 0    PREMIUM
dtype: object
8 SMP --> mostly 0    REGULAR
dtype: object
11 SMA IPS --> mostly 0    PREMIUM
dtype: object
6 SD --> mostly 0    PREMIUM
dtype: object
10 SMA IPS --> mostly 0    PREMIUM
dtype: object
nan --> mostly Series([], dtype: object)
12 SMA IPS --> mostly 0    PREMIUM
dtype: object
12 SMA IPA --> mostly 0    PREMIUM
dtype: object


In [127]:
# 12 SMA IPA --> mostly     PREMIUM
# 12 SMA IPS --> mostly     PREMIUM
# 11 SMA IPA --> mostly     REGULAR
# 11 SMA IPS --> mostly     PREMIUM
# 10 SMA IPA --> mostly     REGULAR
# 10 SMA IPS --> mostly     PREMIUM
# 9 SMP --> mostly     REGULAR
# 8 SMP --> mostly     REGULAR
# 7 SMP --> mostly     REGULAR
# 6 SD --> mostly     PREMIUM
# 5 SD --> mostly     REGULAR
# 4 SD --> mostly     REGULAR
# 3 SD --> mostly     PREMIUM
# IPA --> mostly     PREMIUM
# IPS --> mostly     REGULAR

STUDENT DEPENDENT

Index(['Question ID', 'Question Created At', 'Subject', 'Class', 'Email',
       'PackageType', 'Created vs answered hour diff'],
      dtype='object')

In [128]:
# Murid yang banyak dikacangin
unresp_student = {}
for s in late_response['Email'].unique():
    fail_resp = late_response[late_response['Email'] == s]
    unresp_student[s] = len(fail_resp)
unresp_student = dict(sorted(unresp_student.items(),
                             key=lambda item: item[1],
                             reverse=True))
unresp_student

{'xxxxnnypratiwi@civil.untan.ac.id': 9,
 'xxxxah_nurlidistia7@gmail.com': 9,
 'xxxxanamom@gmail.com': 6,
 'xxxxik601@gmail.com': 6,
 'xxxxlfahlevi810@gmail.com': 5,
 'xxxxicadeanna123@gmail.com': 5,
 'xxxxkla2009@gmail.com': 5,
 'xxxxllns.oce@gmail.com': 5,
 'xxxxabillahhu@gmail.com': 4,
 'xxxxsetyanegari@gmail.com': 4,
 'xxxxeltheodorusw@gmail.com': 4,
 'xxxxihamidah494@gmail.com': 4,
 'xxxxnella5@gmail.com': 4,
 'xxxx.wisnu.sw@gmail.com': 4,
 'xxxxtianolubis@gmail.com': 4,
 'xxxxiskandar05@gmail.com': 3,
 'xxxxnisaadilla28@gmail.com': 3,
 'xxxxelprasetya@gmail.com': 3,
 'xxxxanatasya08@gmail.com': 3,
 'xxxxa5723@gmail.com': 3,
 'xxxxlicaruangguru@gmail.com': 3,
 'xxxxan4776@gmail.com': 3,
 'xxxxaribuannatanael27@gmail.com': 3,
 'xxxxsh.calya131@gmail.com': 3,
 'xxxxikafathan16@gmail.com': 3,
 'xxxxnorra7114@gmail.com': 3,
 'xxxxyuashazi@gmail.com': 3,
 'xxxxkmila66@gmail.com': 3,
 'xxxxaaath15@gmail.com': 3,
 'xxxxanandadiva@gmail.com': 3,
 'xxxxagraceila11@gmail.com': 3,
 'xxxxa.d05

FUN FACT!!

Index(['Question ID', 'Question Created At', 'Subject', 'Class', 'Email',
       'PackageType', 'Created vs answered hour diff'],
      dtype='object')

In [129]:
# Murid yang aktif di tiap Subject
stud_sub = {}
for s in data['Subject'].unique():
    spec_sub = data[data['Subject'] == s]
    stud_sub[s] = spec_sub['Email'].mode()[0]
stud_sub

{'Matematika': 'xxxxaulida@gmail.com',
 'Kimia': 'xxxxlcaesar12@gmail.com',
 'Bahasa Inggris': 'xxxxntinacahaya232@gmail.com',
 'Biologi': 'xxxxahengie@gmail.com',
 'Bahasa Indonesia': 'xxxxaschy@gmail.com',
 'Ekonomi': 'xxxx0509@gmail.com',
 'IPS Terpadu': 'xxxxaassyifa@gmail.com',
 'Fisika': 'xxxxiatazkiyah266@gmail.com',
 'Sejarah': 'xxxxraputri.7d@gmail.com',
 'IPA Terpadu': 'xxxxanamom@gmail.com',
 'Geografi': 'xxxxa5723@gmail.com',
 'Sosiologi': 'xxxxadestya13@gmail.com'}

In [130]:
# Data jumlah penanya dari tiap jenjang
class_stud = {}
for c in data['Class'].unique():
    class_list = data[data['Class'] == c]
    class_stud[c] = len(class_list)
class_stud = dict(sorted(class_stud.items(),
                         key=lambda item: item[1],
                         reverse=True))
class_stud

{'IPA': 945,
 '11 SMA IPA': 613,
 '10 SMA IPA': 524,
 '9 SMP': 467,
 '8 SMP': 178,
 'IPS': 131,
 '7 SMP': 81,
 '11 SMA IPS': 80,
 '4 SD': 78,
 '6 SD': 66,
 '5 SD': 53,
 '10 SMA IPS': 47,
 '12 SMA IPS': 7,
 '3 SD': 1,
 '12 SMA IPA': 1,
 nan: 0}

In [131]:
# Penanya terbanyak
data['Email'].mode()

0    xxxxntinacahaya232@gmail.com
dtype: object

In [132]:
# Package paling aktif dan yang kureng
pack_act = {}
for p in data['PackageType'].unique():
    spec_pack = data[data['PackageType'] == p]
    pack_act[p] = len(spec_pack)
pack_act = dict(sorted(pack_act.items(),
                       key=lambda item: item[1],
                       reverse=True))
pack_act

{'REGULAR': 1864, 'PREMIUM': 1383, 'ELITE': 38}