# basics

In [101]:
import pandas as pd

In [102]:
# Load the dataset from a path relative to the notebook's working directory,
# then preview the first five rows.
df = pd.read_csv('data/basics.csv')
df.head()

Unnamed: 0,gender,age,batch,avg_internet_usage,usage_type
0,Female,21 to 25,2017/ Final year,4 to 6 hours per day,Online learning
1,Male,21 to 25,2017/ Final year,4 to 6 hours per day,Information gathering for academic purposes
2,Male,21 to 25,2017/ Final year,6 to 8 hours per day,Online learning
3,Female,21 to 25,2017/ Final year,4 to 6 hours per day,Online learning
4,Female,21 to 25,2017/ Final year,4 to 6 hours per day,Lecture videos


In [103]:
# (rows, columns) of the loaded frame.
df.shape

(194, 5)

In [104]:
# Column dtypes and non-null counts, used to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   gender              194 non-null    object
 1   age                 194 non-null    object
 2   batch               194 non-null    object
 3   avg_internet_usage  194 non-null    object
 4   usage_type          194 non-null    object
dtypes: object(5)
memory usage: 7.7+ KB


No null values: every column has 194 non-null entries, matching the 194 rows.

In [105]:
# Row count per gender category.
df['gender'].value_counts()

gender
Male      117
Female     77
Name: count, dtype: int64

There is a difference in the absolute numbers of male (117) and female (77) participants, so percentages are more meaningful than raw counts for comparison.

In [106]:
# Row count per age bracket.
df['age'].value_counts()

age
18 to 21    129
21 to 25     65
Name: count, dtype: int64

In [107]:
# Row count per batch.
df['batch'].value_counts()

batch
2020/ First year    139
2017/ Final year     55
Name: count, dtype: int64

There is a marked difference between the number of participants from each batch (139 first-year vs. 55 final-year), so percentages are more meaningful than absolute numbers.

In [108]:
# Row count per average-daily-usage bracket.
df['avg_internet_usage'].value_counts()

avg_internet_usage
4 to 6 hours per day         61
2 to 4 hours per day         58
6 to 8 hours per day         30
0 to 2 hours per day         28
more than 8 hours per day    17
Name: count, dtype: int64

In [109]:
# Short labels used in place of the verbose survey answers so that the
# summary tables below get compact column headers.
gender_dict = {
    'Male': 'M',
    'Female': 'F',
}
batch_dict = {
    '2020/ First year': 'First',
    '2017/ Final year': 'Final',
}

# Apply each mapping to its column; values without a mapping are left as-is.
for column, mapping in (('gender', gender_dict), ('batch', batch_dict)):
    df[column] = df[column].replace(mapping)

df.head()

Unnamed: 0,gender,age,batch,avg_internet_usage,usage_type
0,F,21 to 25,Final,4 to 6 hours per day,Online learning
1,M,21 to 25,Final,4 to 6 hours per day,Information gathering for academic purposes
2,M,21 to 25,Final,6 to 8 hours per day,Online learning
3,F,21 to 25,Final,4 to 6 hours per day,Online learning
4,F,21 to 25,Final,4 to 6 hours per day,Lecture videos


## R1: Average daily internet usage (combined and batch responses)

In [110]:
# Count responses per usage bracket for every (gender, batch) pair, then pivot
# batch and gender into a two-level column header. Combinations that never
# occur come out of the pivot as NaN, so they are filled with 0 and the counts
# are cast back to integers.
r1_table = (
    df.groupby('avg_internet_usage')[['gender', 'batch']]
    .value_counts()
    .unstack(level=[-1, -2])
    .fillna(0)
    .astype(int)
)
r1_table

batch,First,Final,First,Final
gender,M,M,F,F
avg_internet_usage,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0 to 2 hours per day,21,4,2,1
2 to 4 hours per day,29,0,22,7
4 to 6 hours per day,26,8,16,11
6 to 8 hours per day,9,7,5,9
more than 8 hours per day,9,4,0,4


In [111]:
# Inspect the column MultiIndex: each column key is a (batch, gender) tuple.
r1_table.columns

MultiIndex([('First', 'M'),
            ('Final', 'M'),
            ('First', 'F'),
            ('Final', 'F')],
           names=['batch', 'gender'])

In [112]:
# Per-gender counts across both batches.
for sex in ('M', 'F'):
    r1_table[('Combined', sex)] = r1_table[('First', sex)] + r1_table[('Final', sex)]

# Overall count ('T') within each group: male + female.
for group in ('Combined', 'First', 'Final'):
    r1_table[(group, 'T')] = r1_table[(group, 'M')] + r1_table[(group, 'F')]

r1_table

batch,First,Final,First,Final,Combined,Combined,Combined,First,Final
gender,M,M,F,F,M,F,T,T,T
avg_internet_usage,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0 to 2 hours per day,21,4,2,1,25,3,28,23,5
2 to 4 hours per day,29,0,22,7,29,29,58,51,7
4 to 6 hours per day,26,8,16,11,34,27,61,42,19
6 to 8 hours per day,9,7,5,9,16,14,30,14,16
more than 8 hours per day,9,4,0,4,13,4,17,9,8


In [113]:
# Add a percentage column for every count column: each row's share of its own
# column total, rounded to 2 decimals and stored under (batch, '<gender> (%)').
# Only raw count columns are used as inputs, so re-running this cell overwrites
# the existing percentage columns instead of also deriving 'M (%) (%)'-style
# columns from them.
count_cols = [col for col in r1_table.columns if not col[1].endswith('(%)')]
for col in count_cols:
    r1_table[(col[0], f'{col[1]} (%)')] = (r1_table[col] / r1_table[col].sum() * 100).round(2)
r1_table

batch,First,Final,First,Final,Combined,Combined,Combined,First,Final,First,Final,First,Final,Combined,Combined,Combined,First,Final
gender,M,M,F,F,M,F,T,T,T,M (%),M (%),F (%),F (%),M (%),F (%),T (%),T (%),T (%)
avg_internet_usage,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0 to 2 hours per day,21,4,2,1,25,3,28,23,5,22.34,17.39,4.44,3.12,21.37,3.9,14.43,16.55,9.09
2 to 4 hours per day,29,0,22,7,29,29,58,51,7,30.85,0.0,48.89,21.88,24.79,37.66,29.9,36.69,12.73
4 to 6 hours per day,26,8,16,11,34,27,61,42,19,27.66,34.78,35.56,34.38,29.06,35.06,31.44,30.22,34.55
6 to 8 hours per day,9,7,5,9,16,14,30,14,16,9.57,30.43,11.11,28.12,13.68,18.18,15.46,10.07,29.09
more than 8 hours per day,9,4,0,4,13,4,17,9,8,9.57,17.39,0.0,12.5,11.11,5.19,8.76,6.47,14.55


In [114]:
# Append a 'Total' row holding the column-wise sum of the usage-bracket rows.
# Any existing 'Total' row is left out of the sum, so re-running this cell
# recomputes the same totals instead of adding the previous total into the new one.
# Note: the percentage columns are summed too, so their totals are sums of
# already-rounded values and can land slightly off 100 (e.g. 99.99 or 100.01).
r1_table.loc['Total'] = r1_table.drop(index='Total', errors='ignore').sum()
r1_table

batch,First,Final,First,Final,Combined,Combined,Combined,First,Final,First,Final,First,Final,Combined,Combined,Combined,First,Final
gender,M,M,F,F,M,F,T,T,T,M (%),M (%),F (%),F (%),M (%),F (%),T (%),T (%),T (%)
avg_internet_usage,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0 to 2 hours per day,21.0,4.0,2.0,1.0,25.0,3.0,28.0,23.0,5.0,22.34,17.39,4.44,3.12,21.37,3.9,14.43,16.55,9.09
2 to 4 hours per day,29.0,0.0,22.0,7.0,29.0,29.0,58.0,51.0,7.0,30.85,0.0,48.89,21.88,24.79,37.66,29.9,36.69,12.73
4 to 6 hours per day,26.0,8.0,16.0,11.0,34.0,27.0,61.0,42.0,19.0,27.66,34.78,35.56,34.38,29.06,35.06,31.44,30.22,34.55
6 to 8 hours per day,9.0,7.0,5.0,9.0,16.0,14.0,30.0,14.0,16.0,9.57,30.43,11.11,28.12,13.68,18.18,15.46,10.07,29.09
more than 8 hours per day,9.0,4.0,0.0,4.0,13.0,4.0,17.0,9.0,8.0,9.57,17.39,0.0,12.5,11.11,5.19,8.76,6.47,14.55
Total,94.0,23.0,45.0,32.0,117.0,77.0,194.0,139.0,55.0,99.99,99.99,100.0,100.0,100.01,99.99,99.99,100.0,100.01


In [115]:
# Reorder the columns for presentation: batch level descending
# (First, Final, Combined), gender level ascending within each batch
# (F, F (%), M, M (%), T, T (%)), so each count sits next to its percentage.
r1_table = r1_table.sort_index(
    axis=1,
    level=[0, 1],
    ascending=[False, True],
)
r1_table

batch,First,First,First,First,First,First,Final,Final,Final,Final,Final,Final,Combined,Combined,Combined,Combined,Combined,Combined
gender,F,F (%),M,M (%),T,T (%),F,F (%),M,M (%),T,T (%),F,F (%),M,M (%),T,T (%)
avg_internet_usage,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0 to 2 hours per day,2.0,4.44,21.0,22.34,23.0,16.55,1.0,3.12,4.0,17.39,5.0,9.09,3.0,3.9,25.0,21.37,28.0,14.43
2 to 4 hours per day,22.0,48.89,29.0,30.85,51.0,36.69,7.0,21.88,0.0,0.0,7.0,12.73,29.0,37.66,29.0,24.79,58.0,29.9
4 to 6 hours per day,16.0,35.56,26.0,27.66,42.0,30.22,11.0,34.38,8.0,34.78,19.0,34.55,27.0,35.06,34.0,29.06,61.0,31.44
6 to 8 hours per day,5.0,11.11,9.0,9.57,14.0,10.07,9.0,28.12,7.0,30.43,16.0,29.09,14.0,18.18,16.0,13.68,30.0,15.46
more than 8 hours per day,0.0,0.0,9.0,9.57,9.0,6.47,4.0,12.5,4.0,17.39,8.0,14.55,4.0,5.19,13.0,11.11,17.0,8.76
Total,45.0,100.0,94.0,99.99,139.0,100.0,32.0,100.0,23.0,99.99,55.0,100.01,77.0,99.99,117.0,100.01,194.0,99.99


The percentages for the 'Total' row in the original report were quite confusing.