In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
responses_df = pd.read_csv('raw_data/responses.csv', parse_dates=['date'])
master_schools_df = pd.read_csv('raw_data/master_schools.csv')
assmt_recipient_df = pd.read_csv('raw_data/assignment_recipients.csv')
assmt_activities_df = pd.read_csv('raw_data/baseline_activities.csv')
users_df = pd.read_csv('raw_data/siyavula_users.csv')
user_details_df = pd.read_csv('raw_data/siyavula_user_details.csv', parse_dates=['created_at', 'grade_confirmed_at'])
baseline_questions_df = pd.read_csv('raw_data/baseline_questions.csv')
assignment_df = pd.read_csv('raw_data/assignments.csv', parse_dates=['created_at'])

## Cleaning the schools user data

We need first to create a gauteng province schools dataset.

In [3]:
gauteng_schools_df = master_schools_df[['school_uuid']][
    (master_schools_df.country == 'South Africa') & (master_schools_df.province_state == 'Gauteng')]
gauteng_schools_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2929 entries, 1 to 27786
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   school_uuid  2155 non-null   object
dtypes: object(1)
memory usage: 45.8+ KB


We then need to combine (union) guateng schools dataset and the schools that recieved any assignment that contained the baseline questions to create our final dataset of schools

In [4]:
# View the details of the assmt_recipient_df dataset
assmt_recipient_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2005 entries, 0 to 2004
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   school_uuid  2005 non-null   object
dtypes: object(1)
memory usage: 15.8+ KB


Combine the two sets of school data to get our school dataset.

In [5]:
# union the to datasets 
schools_df = pd.concat([assmt_recipient_df, gauteng_schools_df], ignore_index=True, verify_integrity=True)
# Remove all null values
schools_df.dropna(inplace=True)
# drop any duplicates in the data
schools_df.drop_duplicates(inplace=True)
schools_df.shape

(3491, 1)

In [6]:
schools_details_df = schools_df.merge(master_schools_df, how='left', on='school_uuid')
schools_details_df.dropna(subset=['master_school_id'], inplace=True)
print(schools_details_df.info())
schools_details_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2200 entries, 0 to 3490
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   school_uuid       2200 non-null   object 
 1   master_school_id  2200 non-null   float64
 2   school            2200 non-null   object 
 3   country           2200 non-null   object 
 4   province_state    2200 non-null   object 
 5   district          2131 non-null   object 
 6   city_town         2198 non-null   object 
 7   quintile          1550 non-null   float64
 8   urban_rural       2116 non-null   object 
 9   sector            2145 non-null   object 
 10  gis_longitude     2050 non-null   float64
 11  gis_latitude      2050 non-null   float64
dtypes: float64(4), object(8)
memory usage: 223.4+ KB
None


Unnamed: 0,school_uuid,master_school_id,school,country,province_state,district,city_town,quintile,urban_rural,sector,gis_longitude,gis_latitude
0,002c567f-af10-4a61-bb2d-834cec705e73,20716.0,Eldomaine Secondary School (Johannesburg),South Africa,Gauteng,Johannesburg Central,Johannesburg,4.0,Urban,Public,27.907336,-26.295141
1,002d436b-7b3c-4a5d-a269-0b3f63582b92,387.0,Semphato Junior Secondary School (Pretoria),South Africa,Gauteng,Tshwane North,Pretoria,1.0,Rural,Public,28.088582,-25.456325
2,007f47e1-cb4e-4aef-83fc-16a8b81467ec,20810.0,Lekoa Shandu Secondary School (Vereeniging),South Africa,Gauteng,Sedibeng East,Vereeniging,3.0,Urban,Public,27.868147,-26.692765
5,00c15e79-c9d8-408c-8a21-73a6962f48c1,4186.0,Lucas Motshabanosi Secondary School (Pretoria),South Africa,Gauteng,Tshwane West,Pretoria,1.0,Urban,Public,28.075573,-25.485676
7,00f9268b-b396-498d-a393-c0c959ba71b2,20866.0,Mom Sebone Secondary School (Nigel),South Africa,Gauteng,Gauteng East,Nigel,4.0,Urban,Public,28.397135,-26.376486


## Cleaning Users

First, lets get all the unique users in the responses data

In [7]:
# this are the unique users who have ever responded to the baseline questions
respondents = responses_df[['user_uuid']].drop_duplicates()
respondents.shape

(35936, 1)

Let's get the school details of all users who have responded to any of our baseline questions

In [8]:
# get the unique users from the responses
respondents_details = respondents.merge(
    user_details_df[['user_uuid', 'master_school_id']], how='left', on='user_uuid').drop_duplicates()
respondents_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35936 entries, 0 to 35935
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_uuid         35936 non-null  object 
 1   master_school_id  35303 non-null  float64
dtypes: float64(1), object(1)
memory usage: 842.2+ KB


Now Let us get the grade 8 students (user) from gauteng province, south africa in the year of interest (2022).
Those who had ```grade_confirmed_at``` after *2022-01-01*

In [9]:
# Get all the grade 8 users location details  in data in the year confirmed in 2022
grade_8_users = user_details_df[(user_details_df.grade_confirmed_at >= pd.Timestamp(2022,1,1))
                                & (user_details_df.grade == 8)]
grade_8_users_details = grade_8_users.drop_duplicates().merge(
    schools_details_df[['master_school_id', 'country', 'province_state', 'district']], how='left', on='master_school_id')
grade_8_users_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223323 entries, 0 to 223322
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   user_uuid           223323 non-null  object        
 1   master_school_id    217553 non-null  float64       
 2   grade               223323 non-null  float64       
 3   grade_confirmed_at  223323 non-null  datetime64[ns]
 4   created_at          223323 non-null  datetime64[ns]
 5   country             202830 non-null  object        
 6   province_state      202830 non-null  object        
 7   district            199771 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(4)
memory usage: 15.3+ MB


Now, with that we have all grade 8 users in the year 2022, lets narrow those to gauteng province, South Africa (our interest)

In [10]:
# Get the gauteng grade 8 users by filtering the by Gauteng province
gauteng_grade_8_user_details = grade_8_users_details[
    (grade_8_users_details.country == 'South Africa') & (grade_8_users_details.province_state == 'Gauteng')]
gauteng_grade_8_user_details = gauteng_grade_8_user_details.drop_duplicates()
print(gauteng_grade_8_user_details.info())
gauteng_grade_8_user_details.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 198845 entries, 0 to 223322
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   user_uuid           198845 non-null  object        
 1   master_school_id    198845 non-null  float64       
 2   grade               198845 non-null  float64       
 3   grade_confirmed_at  198845 non-null  datetime64[ns]
 4   created_at          198845 non-null  datetime64[ns]
 5   country             198845 non-null  object        
 6   province_state      198845 non-null  object        
 7   district            196792 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(4)
memory usage: 13.7+ MB
None


Unnamed: 0,user_uuid,master_school_id,grade,grade_confirmed_at,created_at,country,province_state,district
0,00002dda-9569-4382-a46e-42e48b38eb08,24890.0,8.0,2022-01-25,2022-01-25,South Africa,Gauteng,Gauteng West
1,00003e07-54aa-4639-982a-7e3ab01a0302,24921.0,8.0,2022-01-25,2022-01-24,South Africa,Gauteng,Sedibeng West


Now Let us get all gauteng student user data as users

In [11]:
gauteng_grade_8_users = gauteng_grade_8_user_details[['user_uuid', 'master_school_id']]
gauteng_grade_8_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 198845 entries, 0 to 223322
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   user_uuid         198845 non-null  object 
 1   master_school_id  198845 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.6+ MB


In [12]:
gauteng_grade_8_users.user_uuid.nunique()

198845

Finally, lets create a dataset that has all our gauteng users and all users who respondend to the basline questions by union our to datasets.

In [13]:
# Combine the guateng grade 8 users and assessed users to have our users
students_df = pd.concat([gauteng_grade_8_users, respondents_details], ignore_index=True, verify_integrity=True)
students_df.drop_duplicates(inplace=True)
print(users_df.info())
users_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1691554 entries, 0 to 1691553
Data columns (total 1 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   uuid    object
dtypes: object(1)
memory usage: 12.9+ MB
None


Unnamed: 0,uuid
0,afa1a5b9-6dbc-409a-a3d0-33ca50c82f1e
1,0cf12f76-2991-4758-8388-af4271f47cce
2,5194a4e7-0dcd-47b8-b383-9a352e3f6dcc
3,a90ed9b5-1761-4567-96fd-bbdb31b92a0a
4,8a5f7ab9-5ed8-4460-9248-16a626c22c21


## Cleaning the responses

In [14]:
# View the responses details and a snapshot of the data
print(responses_df.info())
responses_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568000 entries, 0 to 567999
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   uuid           568000 non-null  object        
 1   date           568000 non-null  datetime64[ns]
 2   question_id    568000 non-null  int64         
 3   user_uuid      568000 non-null  object        
 4   template_id    568000 non-null  int64         
 5   section_id     568000 non-null  int64         
 6   activity_uuid  568000 non-null  object        
 7   assignment_id  568000 non-null  int64         
 8   difficulty     568000 non-null  int64         
 9   attempted      568000 non-null  bool          
 10  correctness    548619 non-null  object        
 11  durations      568000 non-null  object        
 12  marks          568000 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(5), object(6)
memory usage: 52.5+ MB
None


Unnamed: 0,uuid,date,question_id,user_uuid,template_id,section_id,activity_uuid,assignment_id,difficulty,attempted,correctness,durations,marks
0,a8ef0c68-f433-478c-b736-1896b03c86a4,2022-04-06,400878,6d1a76e8-ccc2-4f59-abf3-fd70a0583c58,1762,204,f33a184f-cd3d-4a88-91bd-3917b83fe9c3,22246,1,False,,[],[]
1,9c202e33-0a12-46aa-b0ac-9d1cc5f723f4,2018-06-04,400879,30b919c9-b9bd-4fd1-8b01-798b9795022d,1766,230,d47421aa-0095-4e31-9ddc-2227c5a680ed,1888,2,True,True,[42],"[[1, 1]]"


In [15]:
# clean the response durations
def clean_durations(row):
    d = row.replace("[", '').replace("]", '').split(',')
    d = [int(i) for i in d if i != '']
    if not d:
        return np.nan
    return min(d)  
responses_df.durations = responses_df.durations.apply(clean_durations)

In [16]:
# clean the response marks
def clean_durations(row):
    d = row.replace("[", '').replace("]", '').split(',')
    d = [float(i) for i in d if i != '']
    if not d:
        return np.nan
    return d[0] 
responses_df.marks = responses_df.marks.apply(clean_durations)

In [17]:
# Replace the value in attempted and correctness with integer
responses_df.attempted = responses_df.attempted.astype('float')
responses_df.correctness = responses_df.correctness.astype('float')
responses_df.head()

Unnamed: 0,uuid,date,question_id,user_uuid,template_id,section_id,activity_uuid,assignment_id,difficulty,attempted,correctness,durations,marks
0,a8ef0c68-f433-478c-b736-1896b03c86a4,2022-04-06,400878,6d1a76e8-ccc2-4f59-abf3-fd70a0583c58,1762,204,f33a184f-cd3d-4a88-91bd-3917b83fe9c3,22246,1,0.0,,,
1,9c202e33-0a12-46aa-b0ac-9d1cc5f723f4,2018-06-04,400879,30b919c9-b9bd-4fd1-8b01-798b9795022d,1766,230,d47421aa-0095-4e31-9ddc-2227c5a680ed,1888,2,1.0,1.0,42.0,1.0
2,a26fd11a-ceaf-41a1-b1da-4e3c7cff70b9,2018-06-04,400879,a409f06c-e205-4a61-bb0e-7c14682c1b48,1766,230,a43ef5f4-cc7b-454b-8b6c-345e4606bedc,1888,2,1.0,1.0,15.0,1.0
3,8027090b-511d-4c27-bb13-08869767d137,2018-06-04,400882,a409f06c-e205-4a61-bb0e-7c14682c1b48,1731,254,a43ef5f4-cc7b-454b-8b6c-345e4606bedc,1888,1,1.0,1.0,56.0,2.0
4,bab0cc8c-34d5-411b-a30e-9aca14059e15,2018-06-05,400883,30b919c9-b9bd-4fd1-8b01-798b9795022d,1727,254,d47421aa-0095-4e31-9ddc-2227c5a680ed,1888,4,1.0,1.0,52.0,2.0


Let us all responses that were made before 2022, and also the baseline assessment in the year 2022. Any other responses that were made in the year 2022 that belong to other assignments will be discarded

In [18]:
pd.Timestamp(2022,1,1).date()

datetime.date(2022, 1, 1)

In [19]:
responses_df = responses_df[
    (responses_df.date < pd.Timestamp(2022,1,1)) | (responses_df.assignment_id == 22620)]
responses_df.shape

(472943, 13)

Now We need to group our responses by ```date```, ```question_id```, ```user_uuid```, ```template_id```, ```section_id```, ```activity_uuid```, ```assignment_id```.
We will aggregate the others as follows:
- ```attempted``` and ```correctness``` with sum
- ```duration``` and ```marks``` with average
- ```difficulty``` with minimum
- ```uuid``` with count

In [20]:
columns_to_groupby = ['date', 'question_id', 'user_uuid', 'template_id', 'section_id', 'activity_uuid', 'assignment_id']
aggregations = {
    'attempted': 'sum',
    'correctness': 'sum',
    'uuid': 'count',
    'marks': 'mean',
    'durations': 'min',
    'difficulty': 'min'}
responses_summ_df = responses_df.groupby(columns_to_groupby).agg(aggregations).reset_index()
# rename the columns
renamed_columns = {
    'attempted': 'n_attempts',
    'correctness': 'n_correct_responses',
    'uuid': 'n_responses',
    'marks': 'avg_marks',
    'durations': 'best_response_durations',
    'difficulty': 'difficulty'
}

responses_summ_df.rename(columns=renamed_columns, inplace=True)

print(responses_df.shape, responses_summ_df.shape)
responses_summ_df.head()

(472943, 13) (335110, 13)


Unnamed: 0,date,question_id,user_uuid,template_id,section_id,activity_uuid,assignment_id,n_attempts,n_correct_responses,n_responses,avg_marks,best_response_durations,difficulty
0,2018-02-12,400883,8f696f09-5c0e-4ac5-add6-a8f486724173,1727,254,205cf487-1810-4d0f-b96f-e5bcd2b798f2,530,1.0,1.0,1,2.0,16.0,4
1,2018-02-12,400885,186bfae4-d8fd-47ba-b894-3200e8280dee,2647,228,cfd4b951-b4c6-48fc-a950-ad668d1398ab,552,1.0,1.0,1,1.0,17.0,2
2,2018-02-12,400885,47e1ae12-77fd-4576-87a3-a7c108cfabd4,2647,228,bc6fe40a-59f8-45c9-8186-70f03113a13f,552,2.0,2.0,2,1.0,34.0,2
3,2018-02-13,400879,4b38b11f-4e79-4241-804d-4e4bdc66e0c5,1766,230,d15fed64-2e99-4301-9be7-340bfd3155a6,578,1.0,1.0,1,1.0,11.0,2
4,2018-02-13,400883,06b926b3-fbce-485a-a5f2-4ec9a8192657,1727,254,6d49e5b9-dd79-4a34-8272-1bde95f288d7,530,1.0,1.0,1,2.0,41.0,4


## EDA

In [21]:
df = responses_summ_df.merge(
    students_df, how='left', on='user_uuid').merge(
    schools_details_df, how='left', on='master_school_id')

df['year'] = df.date.dt.year
print(df.shape)
df.head()

(335110, 26)


Unnamed: 0,date,question_id,user_uuid,template_id,section_id,activity_uuid,assignment_id,n_attempts,n_correct_responses,n_responses,...,country,province_state,district,city_town,quintile,urban_rural,sector,gis_longitude,gis_latitude,year
0,2018-02-12,400883,8f696f09-5c0e-4ac5-add6-a8f486724173,1727,254,205cf487-1810-4d0f-b96f-e5bcd2b798f2,530,1.0,1.0,1,...,South Africa,Gauteng,Johannesburg East,Johannesburg,5.0,Urban,Public,28.062774,-26.173322,2018
1,2018-02-12,400885,186bfae4-d8fd-47ba-b894-3200e8280dee,2647,228,cfd4b951-b4c6-48fc-a950-ad668d1398ab,552,1.0,1.0,1,...,South Africa,Gauteng,Johannesburg East,Johannesburg,,Urban,Independent,28.054526,-26.172531,2018
2,2018-02-12,400885,47e1ae12-77fd-4576-87a3-a7c108cfabd4,2647,228,bc6fe40a-59f8-45c9-8186-70f03113a13f,552,2.0,2.0,2,...,,,,,,,,,,2018
3,2018-02-13,400879,4b38b11f-4e79-4241-804d-4e4bdc66e0c5,1766,230,d15fed64-2e99-4301-9be7-340bfd3155a6,578,1.0,1.0,1,...,South Africa,Gauteng,Ekurhuleni North,Edenvale,,Urban,Independent,28.156765,-26.150073,2018
4,2018-02-13,400883,06b926b3-fbce-485a-a5f2-4ec9a8192657,1727,254,6d49e5b9-dd79-4a34-8272-1bde95f288d7,530,1.0,1.0,1,...,South Africa,Gauteng,Johannesburg East,Johannesburg,5.0,Urban,Public,28.062774,-26.173322,2018


In [30]:
country_dist = df.groupby('country')[['n_responses']].sum().sort_values('n_responses').reset_index()
country_dist

Unnamed: 0,country,n_responses
0,South Africa,463649


In [31]:
province_dist = df.groupby('province_state')[['n_responses']].sum().sort_values('n_responses').reset_index()
province_dist

Unnamed: 0,province_state,n_responses
0,Free State,1
1,Eastern Cape,987
2,KwaZulu-Natal,1264
3,Limpopo,2646
4,Western Cape,30114
5,Gauteng,428637


In [32]:
year_dist = df.groupby('year')[['n_responses']].sum().sort_values('n_responses').reset_index()
year_dist

Unnamed: 0,year,n_responses
0,2018,4497
1,2019,7279
2,2020,33402
3,2021,49480
4,2022,378285


In [33]:
assmt_dist = df.groupby('assignment_id')[['n_responses', 'question_id', 'year']].agg(
    {'n_responses':'sum', 'question_id':'nunique', 'year':'nunique'}).sort_values(
    'n_responses', ascending=False).reset_index()[:20]
assmt_dist

Unnamed: 0,assignment_id,n_responses,question_id,year
0,22620,378285,50,1
1,14140,3627,6,1
2,18608,2418,3,1
3,9541,2313,4,1
4,16942,1872,6,1
5,21144,1756,4,1
6,7681,1603,9,1
7,20856,1520,6,1
8,13490,1477,6,2
9,11984,1392,6,1


In [52]:
gauteng_df = df[df.province_state == 'Gauteng']
gauteng_dist = g_df.groupby('year')[['n_responses']].sum().sort_values('year').reset_index()
gauteng_dist

Unnamed: 0,year,n_responses
0,2018,2572
1,2019,3060
2,2020,16931
3,2021,28412
4,2022,377662


## Exporting the Data Resources for Power BI

In [61]:
provinces_considered = province_dist.province_state[province_dist.n_responses > 500].to_list()
provinces_considered

['Eastern Cape', 'KwaZulu-Natal', 'Limpopo', 'Western Cape', 'Gauteng']

In [62]:
responses_columns = responses_summ_df.columns.to_list()
final_responses = df[df.province_state.isin(provinces_considered) & (df.country == 'South Africa')]
final_responses = final_responses[responses_columns]
print(final_responses.shape)
final_responses.head()

(327993, 13)


Unnamed: 0,date,question_id,user_uuid,template_id,section_id,activity_uuid,assignment_id,n_attempts,n_correct_responses,n_responses,avg_marks,best_response_durations,difficulty
0,2018-02-12,400883,8f696f09-5c0e-4ac5-add6-a8f486724173,1727,254,205cf487-1810-4d0f-b96f-e5bcd2b798f2,530,1.0,1.0,1,2.0,16.0,4
1,2018-02-12,400885,186bfae4-d8fd-47ba-b894-3200e8280dee,2647,228,cfd4b951-b4c6-48fc-a950-ad668d1398ab,552,1.0,1.0,1,1.0,17.0,2
3,2018-02-13,400879,4b38b11f-4e79-4241-804d-4e4bdc66e0c5,1766,230,d15fed64-2e99-4301-9be7-340bfd3155a6,578,1.0,1.0,1,1.0,11.0,2
4,2018-02-13,400883,06b926b3-fbce-485a-a5f2-4ec9a8192657,1727,254,6d49e5b9-dd79-4a34-8272-1bde95f288d7,530,1.0,1.0,1,2.0,41.0,4
5,2018-02-13,400883,3e314321-50c6-4f01-98d3-5d29106ba42b,1727,254,feac96e6-7ee1-495c-902c-bfd30b40ebba,578,1.0,1.0,1,2.0,17.0,4


In [63]:
final_schools = schools_details_df[
    schools_details_df.province_state.isin(provinces_considered) & (schools_details_df.country == 'South Africa')]
print(final_schools.shape, schools_details_df.shape)
final_schools.head()

(2197, 12) (2200, 12)


Unnamed: 0,school_uuid,master_school_id,school,country,province_state,district,city_town,quintile,urban_rural,sector,gis_longitude,gis_latitude
0,002c567f-af10-4a61-bb2d-834cec705e73,20716.0,Eldomaine Secondary School (Johannesburg),South Africa,Gauteng,Johannesburg Central,Johannesburg,4.0,Urban,Public,27.907336,-26.295141
1,002d436b-7b3c-4a5d-a269-0b3f63582b92,387.0,Semphato Junior Secondary School (Pretoria),South Africa,Gauteng,Tshwane North,Pretoria,1.0,Rural,Public,28.088582,-25.456325
2,007f47e1-cb4e-4aef-83fc-16a8b81467ec,20810.0,Lekoa Shandu Secondary School (Vereeniging),South Africa,Gauteng,Sedibeng East,Vereeniging,3.0,Urban,Public,27.868147,-26.692765
5,00c15e79-c9d8-408c-8a21-73a6962f48c1,4186.0,Lucas Motshabanosi Secondary School (Pretoria),South Africa,Gauteng,Tshwane West,Pretoria,1.0,Urban,Public,28.075573,-25.485676
7,00f9268b-b396-498d-a393-c0c959ba71b2,20866.0,Mom Sebone Secondary School (Nigel),South Africa,Gauteng,Gauteng East,Nigel,4.0,Urban,Public,28.397135,-26.376486


In [64]:
student_columns = students_df.columns.to_list()
final_students = students_df.merge(final_schools, how='left', on='master_school_id')
final_students = final_students[
    final_students.province_state.isin(provinces_considered) & (final_students.country == 'South Africa')]
final_students = final_students[student_columns]
print(final_students.shape, students_df.shape)
final_students.head()

(211321, 2) (214053, 2)


Unnamed: 0,user_uuid,master_school_id
0,00002dda-9569-4382-a46e-42e48b38eb08,24890.0
1,00003e07-54aa-4639-982a-7e3ab01a0302,24921.0
2,00006806-d86d-4705-a4f1-1eca8427b4ce,20898.0
3,0000c915-683a-44c8-962f-beaf482a99ce,20922.0
4,0000d994-0085-4281-81d7-a9e6c442b716,20680.0


**Export the resources**

In [65]:
final_responses.to_csv('BI data/Responses.csv', index=False)
final_students.to_csv('BI data/Students.csv', index=False)
final_schools.to_csv('BI data/Schools.csv', index=False)
gauteng_grade_8_user_details.to_csv('BI data/Gauteng Grade Details.csv', index=False)
assmt_activities_df.to_csv('BI data/Activities.csv', index=False)
baseline_questions_df.to_csv('BI data/Questions.csv', index=False)
assignment_df.to_csv('BI data/Assignments.csv', index=False)