# Imports

We may have to run the following commands in Anaconda Prompt - after activating the DiscSim environment - to install some of the packages imported below

pip install matplotlib
pip install pandas
pip install tqdm
pip install scipy

In [1]:
# General modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from os.path import sep
from tqdm import tqdm

# Local modules
import disc_score
import binomial_confidence

# Enable re-load of local modules every time they are called
%load_ext autoreload
%autoreload 2
%aimport numpy 
%aimport pandas

# Load data

ERROR FLAG: I added ee_data.csv to the DiscSim folder and committed it to my branch. However, when I ran read_csv using your home_folder and filename code to locate the csv, it kept raising a "File not found" error. To avoid losing time, I imported the data using the absolute local path to the file instead. TODO: resolve the relative-path issue.

In [23]:
# Read the subordinate dataset ee_data.csv.
# The absolute local path is a stop-gap; the repo-relative attempt is kept below for reference.
#home_folder = 'Documents{0}CEGIS{0}DiscSim'.format(sep)
sub_data = pd.read_csv(r"C:\Users\Cegis\Documents\ee_data.csv")
variables = sub_data.columns
n_variables = sub_data.shape[1]

# List every raw column name, one per line
print('The following variables were found in the subordinate dataset:')
for v in variables:
    print('    {0}'.format(v))

The following variables were found in the subordinate dataset:
    identifier
    student_id
    Subject
    1st question Level
    1st level score
    2nd question Level
    2nd level score
    3rd question Level
    3rd level score
    Cluster ID
    School ID
    district
    Teacher User ID
    class
    baseline
    Student Identifier


In [24]:
# Assign short, consistent column names to the subordinate dataset.
# The assignment is positional: the order must match the raw column order of the CSV.
sub_data.columns = [
    'ID', 'Student_ID', 'Subject',
    'Sub_Q1_Level', 'Sub_Q1_Score',
    'Sub_Q2_Level', 'Sub_Q2_Score',
    'Sub_Q3_Level', 'Sub_Q3_Score',
    'Cluster_ID', 'School_ID', 'District',
    'Sub_ID', 'Stu_Class', 'Stu_Baseline_Level', 'Stu_Identifier',
]
variables = sub_data.columns
print("The new column names are as follows: \n")
for v in variables:
    print('    {0}'.format(v))

The new column names are as follows: 

    ID
    Student_ID
    Subject
    Sub_Q1_Level
    Sub_Q1_Score
    Sub_Q2_Level
    Sub_Q2_Score
    Sub_Q3_Level
    Sub_Q3_Score
    Cluster_ID
    School_ID
    District
    Sub_ID
    Stu_Class
    Stu_Baseline_Level
    Stu_Identifier


In [25]:
# Read the supervisor dataset ee_ees_data.csv (absolute local path, same stop-gap as for the subordinate file)
sup_data = pd.read_csv(r"C:\Users\Cegis\Documents\ee_ees_data.csv")
variables = sup_data.columns
n_variables = sup_data.shape[1]

# List every raw column name, one per line
print('The following variables were found in the supervisor dataset:')
for v in variables:
    print('    {0}'.format(v))

The following variables were found in the supervisor dataset:
    UID
    district
    block id
    cluster id
    Student Identifier
    school id
    subject
    class
    baseline
    1st question Level
    1st level score
    2nd question Level
    2nd level score
    3rd question Level
    3rd level score
    Admin User ID


In [26]:
# Assign short, consistent column names to the supervisor dataset.
# The assignment is positional: the order must match the raw column order of the CSV.
sup_data.columns = [
    'ID', 'District', 'Block_ID', 'Cluster_ID', 'Stu_Identifier',
    'School_ID', 'Subject', 'Stu_Class', 'Stu_Baseline_Level',
    'Sup_Q1_Level', 'Sup_Q1_Score',
    'Sup_Q2_Level', 'Sup_Q2_Score',
    'Sup_Q3_Level', 'Sup_Q3_Score',
    'Sup_ID',
]
variables = sup_data.columns
print("The new column names are as follows: \n")
for v in variables:
    print('    {0}'.format(v))

The new column names are as follows: 

    ID
    District
    Block_ID
    Cluster_ID
    Stu_Identifier
    School_ID
    Subject
    Stu_Class
    Stu_Baseline_Level
    Sup_Q1_Level
    Sup_Q1_Score
    Sup_Q2_Level
    Sup_Q2_Score
    Sup_Q3_Level
    Sup_Q3_Score
    Sup_ID


# Merging the subordinate and supervisor datasets

In [27]:
# Row counts of the two source datasets
print("Number of observations in supervisor dataset: ")
print(sup_data.shape[0])
print("\n")
print("Number of observations in subordinate dataset: ")
print(sub_data.shape[0])


Number of observations in supervisor dataset: 
13577


Number of observations in subordinate dataset: 
166850


In [28]:
# Performing an inner join on ID to merge the supervisor and subordinate datasets.
# Columns present in both inputs receive the _x (supervisor) / _y (subordinate) suffixes.
data = sup_data.merge(sub_data, how='inner', on='ID')
print("The supervisor and subordinate datasets have been merged. \n")
print("Number of observations in the merged dataset: ")
print(data.shape[0])
print("\n")
print("The variables in the merged dataset are as follows: ")
variables = data.columns
for v in variables:
    print('    {0}'.format(v))
    

The supervisor and subordinate datasets have been merged. 

Number of observations in the merged dataset: 
13577


The variables in the merged dataset are as follows: 
    ID
    District_x
    Block_ID
    Cluster_ID_x
    Stu_Identifier_x
    School_ID_x
    Subject_x
    Stu_Class_x
    Stu_Baseline_Level_x
    Sup_Q1_Level
    Sup_Q1_Score
    Sup_Q2_Level
    Sup_Q2_Score
    Sup_Q3_Level
    Sup_Q3_Score
    Sup_ID
    Student_ID
    Subject_y
    Sub_Q1_Level
    Sub_Q1_Score
    Sub_Q2_Level
    Sub_Q2_Score
    Sub_Q3_Level
    Sub_Q3_Score
    Cluster_ID_y
    School_ID_y
    District_y
    Sub_ID
    Stu_Class_y
    Stu_Baseline_Level_y
    Stu_Identifier_y


In [29]:
# Dropping the duplicated subordinate-side columns (suffix _y) from the merged dataset
data = data.drop(columns=[
    'Cluster_ID_y',
    'School_ID_y',
    'District_y',
    'Stu_Class_y',
    'Stu_Baseline_Level_y',
    'Stu_Identifier_y',
    'Subject_y',
])

# Renaming the remaining supervisor-side columns by removing their _x suffix
data = data.rename(columns={
    'District_x': 'District',
    'Cluster_ID_x': 'Cluster_ID',
    'Stu_Identifier_x': 'Stu_Identifier',
    'School_ID_x': 'School_ID',
    'Subject_x': 'Subject',
    'Stu_Class_x': 'Stu_Class',
    'Stu_Baseline_Level_x': 'Stu_Baseline_Level',
})

variables = data.columns
for v in variables:
    print('    {0}'.format(v))

    ID
    District
    Block_ID
    Cluster_ID
    Stu_Identifier
    School_ID
    Subject
    Stu_Class
    Stu_Baseline_Level
    Sup_Q1_Level
    Sup_Q1_Score
    Sup_Q2_Level
    Sup_Q2_Score
    Sup_Q3_Level
    Sup_Q3_Score
    Sup_ID
    Student_ID
    Sub_Q1_Level
    Sub_Q1_Score
    Sub_Q2_Level
    Sub_Q2_Score
    Sub_Q3_Level
    Sub_Q3_Score
    Sub_ID


In [48]:
# Preview the first four rows of the merged dataset
print("The merged dataset looks like this: \n \n")
print(data.head(4))

# Exporting the merged dataset
# NOTE(review): absolute, machine-specific path - adjust before running on another machine
export_path = r'C:\Users\Cegis\Documents\discsim_edu_L0_L1_merged.csv'
# index = False keeps the DataFrame row index out of the CSV
data.to_csv(export_path, index = False)

The merged dataset looks like this: 
 

            ID      District  Block_ID   Cluster_ID  Stu_Identifier  \
0  022724217_1  CHENGALPATTU         5  33030600402        22724217   
1  022724217_3  CHENGALPATTU         5  33030600402        22724217   
2  022860415_1  CHENGALPATTU         7  33030804503        22860415   
3  023043741_2  CHENGALPATTU         9  33031203303        23043741   

     School_ID      Subject  Stu_Class Stu_Baseline_Level Sup_Q1_Level  ...  \
0  33030600201      English          3              Malar        Malar  ...   
1  33030600201        Tamil          3              Malar        Malar  ...   
2  33030806401      English          3              Malar        Malar  ...   
3  33031203501  Mathematics          3              Malar        Malar  ...   

   Student_ID Sub_Q1_Level  Sub_Q1_Score Sub_Q2_Level  Sub_Q2_Score  \
0  2022724217        Malar            10          NaN           NaN   
1  2022724217        Malar            10          NaN           Na

# Details about the data

Passing marks: Mathematics 9, English 8, Tamil 8

Levels: 1. Arumbu, 2. Mottu, 3. Malar

Class 1 students can only be tested for Arumbu level
Class 2 students can only be tested for Arumbu and Mottu levels
Class 3 students can be tested for Arumbu, Mottu, and Malar levels

Refer to TN Education Project Analysis Summary document for more details on levels, grading, assessment, and discrepancy analysis methodology

# Function to determine endline level assigned by the subordinate (L0)

In [61]:
def endline_level_sub(df):
    """Derive the endline level implied by the subordinate's (L0) scores.

    Adds (or resets) the column 'Stu_Endline_Level_Sub' on ``df`` in place and
    returns the same DataFrame. Every row starts as '' and is then labelled
    'Below Arumbu', 'Arumbu', 'Mottu' or 'Malar' according to the student's
    class, baseline level, subject and the scores in Sub_Q1_Score,
    Sub_Q2_Score and Sub_Q3_Score.

    A score counts as a pass when it is >= the subject's pass mark
    (Mathematics 9, English 8, Tamil 8) and as a fail when it is below it.
    A missing (NaN) score is neither, so a row whose rule needs that score
    keeps the '' label. Rows with any other subject, or with a class/baseline
    combination not listed below, also keep ''.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Stu_Class, Stu_Baseline_Level, Subject, Sub_Q1_Score,
        Sub_Q2_Score and Sub_Q3_Score.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Stu_Endline_Level_Sub' set.
    """
    level_col = 'Stu_Endline_Level_Sub'
    df[level_col] = ''

    baseline = df['Stu_Baseline_Level']
    from_arumbu = baseline == 'Arumbu'
    from_mottu = baseline == 'Mottu'
    from_malar = baseline == 'Malar'

    class_1 = df['Stu_Class'] == 1
    class_2 = df['Stu_Class'] == 2
    class_3 = df['Stu_Class'] == 3
    class_2_or_3 = class_2 | class_3

    # Single place where each subject's pass mark is stated
    for subject, pass_mark in (('Mathematics', 9), ('English', 8), ('Tamil', 8)):
        in_subject = df['Subject'] == subject

        # Pass and fail are computed separately (not as negations) so that a
        # NaN score satisfies neither of them.
        q1_pass = df['Sub_Q1_Score'] >= pass_mark
        q1_fail = df['Sub_Q1_Score'] < pass_mark
        q2_pass = df['Sub_Q2_Score'] >= pass_mark
        q2_fail = df['Sub_Q2_Score'] < pass_mark
        q3_pass = df['Sub_Q3_Score'] >= pass_mark
        q3_fail = df['Sub_Q3_Score'] < pass_mark

        # The rules are mutually exclusive, so their order does not matter.
        rules = (
            # Baseline Arumbu (any class) failing the first question
            ('Below Arumbu', from_arumbu & q1_fail),
            # Class 1 is only tested at one level
            ('Arumbu', from_arumbu & class_1 & q1_pass),
            ('Arumbu', from_arumbu & class_2_or_3 & q1_pass & q2_fail),
            ('Arumbu', from_mottu & class_2_or_3 & q1_fail),
            ('Mottu', from_arumbu & class_2 & q1_pass & q2_pass),
            ('Mottu', from_arumbu & class_3 & q1_pass & q2_pass & q3_fail),
            ('Mottu', from_mottu & class_2 & q1_pass),
            ('Mottu', from_mottu & class_3 & q1_pass & q2_fail),
            ('Mottu', from_malar & class_3 & q1_fail),
            ('Malar', from_arumbu & class_3 & q1_pass & q2_pass & q3_pass),
            ('Malar', from_mottu & class_3 & q1_pass & q2_pass),
            ('Malar', from_malar & class_3 & q1_pass),
        )
        for level, matches in rules:
            df.loc[in_subject & matches, level_col] = level

    return df

# Function to determine endline level assigned by the supervisor (L1)

In [60]:
def endline_level_sup(df):
    """Derive the endline level implied by the supervisor's (L1) scores.

    Adds (or resets) the column 'Stu_Endline_Level_Sup' on ``df`` in place and
    returns the same DataFrame. Every row starts as '' and is then labelled
    'Below Arumbu', 'Arumbu', 'Mottu' or 'Malar' according to the student's
    class, baseline level, subject and the scores in Sup_Q1_Score,
    Sup_Q2_Score and Sup_Q3_Score.

    A score counts as a pass when it is >= the subject's pass mark
    (Mathematics 9, English 8, Tamil 8) and as a fail when it is below it.
    A missing (NaN) score is neither, so a row whose rule needs that score
    keeps the '' label. Rows with any other subject, or with a class/baseline
    combination not listed below, also keep ''.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Stu_Class, Stu_Baseline_Level, Subject, Sup_Q1_Score,
        Sup_Q2_Score and Sup_Q3_Score.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Stu_Endline_Level_Sup' set.
    """
    level_col = 'Stu_Endline_Level_Sup'
    df[level_col] = ''

    baseline = df['Stu_Baseline_Level']
    from_arumbu = baseline == 'Arumbu'
    from_mottu = baseline == 'Mottu'
    from_malar = baseline == 'Malar'

    class_1 = df['Stu_Class'] == 1
    class_2 = df['Stu_Class'] == 2
    class_3 = df['Stu_Class'] == 3
    class_2_or_3 = class_2 | class_3

    # Single place where each subject's pass mark is stated
    for subject, pass_mark in (('Mathematics', 9), ('English', 8), ('Tamil', 8)):
        in_subject = df['Subject'] == subject

        # Pass and fail are computed separately (not as negations) so that a
        # NaN score satisfies neither of them.
        q1_pass = df['Sup_Q1_Score'] >= pass_mark
        q1_fail = df['Sup_Q1_Score'] < pass_mark
        q2_pass = df['Sup_Q2_Score'] >= pass_mark
        q2_fail = df['Sup_Q2_Score'] < pass_mark
        q3_pass = df['Sup_Q3_Score'] >= pass_mark
        q3_fail = df['Sup_Q3_Score'] < pass_mark

        # The rules are mutually exclusive, so their order does not matter.
        rules = (
            # Baseline Arumbu (any class) failing the first question
            ('Below Arumbu', from_arumbu & q1_fail),
            # Class 1 is only tested at one level
            ('Arumbu', from_arumbu & class_1 & q1_pass),
            ('Arumbu', from_arumbu & class_2_or_3 & q1_pass & q2_fail),
            ('Arumbu', from_mottu & class_2_or_3 & q1_fail),
            ('Mottu', from_arumbu & class_2 & q1_pass & q2_pass),
            ('Mottu', from_arumbu & class_3 & q1_pass & q2_pass & q3_fail),
            ('Mottu', from_mottu & class_2 & q1_pass),
            ('Mottu', from_mottu & class_3 & q1_pass & q2_fail),
            ('Mottu', from_malar & class_3 & q1_fail),
            ('Malar', from_arumbu & class_3 & q1_pass & q2_pass & q3_pass),
            ('Malar', from_mottu & class_3 & q1_pass & q2_pass),
            ('Malar', from_malar & class_3 & q1_pass),
        )
        for level, matches in rules:
            df.loc[in_subject & matches, level_col] = level

    return df

In [62]:
# Checking the data type of each column in the data
# (the score columns must be numeric for the threshold comparisons in the endline-level functions)
print("The data types of each column of data are as follows: \n")
print(data.dtypes)

The data types of each column of data are as follows: 

ID                        object
District                  object
Block_ID                   int64
Cluster_ID                 int64
Stu_Identifier             int64
School_ID                  int64
Subject                   object
Stu_Class                  int64
Stu_Baseline_Level        object
Sup_Q1_Level              object
Sup_Q1_Score               int64
Sup_Q2_Level              object
Sup_Q2_Score             float64
Sup_Q3_Level              object
Sup_Q3_Score             float64
Sup_ID                     int64
Student_ID                 int64
Sub_Q1_Level              object
Sub_Q1_Score               int64
Sub_Q2_Level              object
Sub_Q2_Score             float64
Sub_Q3_Level              object
Sub_Q3_Score             float64
Sub_ID                     int64
Stu_Endline_Level_Sub     object
Stu_Endline_Level_Sup     object
dtype: object


In [63]:
# Determine the student endline level assigned by the subordinate (L0) first,
# then the one assigned by the supervisor (L1); each call adds one column to data
data = endline_level_sup(endline_level_sub(data))

In [64]:
# Preview the first ten rows of the dataset
print("This is how the dataset looks after determining the student endline assigned by the subordinate (L0) and the supervisor (L1) \n")
print(data.head(10))

This is how the dataset looks after determining the student endline assigned by the subordinate (L0) and the supervisor (L1) 

            ID      District  Block_ID   Cluster_ID  Stu_Identifier  \
0  022724217_1  CHENGALPATTU         5  33030600402        22724217   
1  022724217_3  CHENGALPATTU         5  33030600402        22724217   
2  022860415_1  CHENGALPATTU         7  33030804503        22860415   
3  023043741_2  CHENGALPATTU         9  33031203303        23043741   
4  023076795_1  CHENGALPATTU         9  33031204206        23076795   
5  023149328_3  CHENGALPATTU         7  33030808705        23149328   
6  023240387_1  CHENGALPATTU        10  33031300702        23240387   
7  023240387_3  CHENGALPATTU        10  33031300702        23240387   
8  023278930_3  CHENGALPATTU         7  33030803101        23278930   
9  023280759_1  CHENGALPATTU         8  33031004102        23280759   

     School_ID      Subject  Stu_Class Stu_Baseline_Level Sup_Q1_Level  ...  \
0  330306002

# Percentage of observations with mismatch of endline assessment between the subordinate (L0) and supervisor (L1)

In [67]:
# Function to create a dummy which takes a value of 1 for all observations where there is a mismatch between the endline level assigned by the subordinate (L0) and the supervisor (L1)
def endline_mismatch(df):
    """Flag rows whose subordinate and supervisor endline levels differ.

    Writes the integer column 'endline_mismatch' onto ``df`` in place
    (1 where 'Stu_Endline_Level_Sub' and 'Stu_Endline_Level_Sup' are not
    equal, 0 where they are equal) and returns the same DataFrame.
    """
    levels_differ = df['Stu_Endline_Level_Sub'] != df['Stu_Endline_Level_Sup']
    df['endline_mismatch'] = levels_differ.astype('int64')
    return df

In [92]:
# Flag the mismatching rows, then add a constant column of 1s
# so that summing it per group yields the number of observations
data = endline_mismatch(data).assign(observation_count=1)
print(data.head(10))

            ID      District  Block_ID   Cluster_ID  Stu_Identifier  \
0  022724217_1  CHENGALPATTU         5  33030600402        22724217   
1  022724217_3  CHENGALPATTU         5  33030600402        22724217   
2  022860415_1  CHENGALPATTU         7  33030804503        22860415   
3  023043741_2  CHENGALPATTU         9  33031203303        23043741   
4  023076795_1  CHENGALPATTU         9  33031204206        23076795   
5  023149328_3  CHENGALPATTU         7  33030808705        23149328   
6  023240387_1  CHENGALPATTU        10  33031300702        23240387   
7  023240387_3  CHENGALPATTU        10  33031300702        23240387   
8  023278930_3  CHENGALPATTU         7  33030803101        23278930   
9  023280759_1  CHENGALPATTU         8  33031004102        23280759   

     School_ID      Subject  Stu_Class Stu_Baseline_Level Sup_Q1_Level  ...  \
0  33030600201      English          3              Malar        Malar  ...   
1  33030600201        Tamil          3              Malar   

In [103]:
# Defining a function to generate the endline mismatch table
def endline_mismatch_table(df):
    """Summarise endline mismatches per subject.

    Sums 'endline_mismatch' and 'observation_count' within each 'Subject'
    and returns a new DataFrame with the columns 'Subject', '# Mismatch',
    'Total observations' and '% Mismatch'. Note that '% Mismatch' holds the
    ratio of the two sums (a fraction, not multiplied by 100).
    """
    totals = df.groupby('Subject')[['endline_mismatch', 'observation_count']].sum()
    share = totals['endline_mismatch'] / totals['observation_count']
    table = totals.assign(percentage_mismatch=share).reset_index()
    table.columns = ['Subject', '# Mismatch', 'Total observations', '% Mismatch']
    return table

In [104]:
# Calling the function to generate endline mismatch table by subject
# (the '% Mismatch' column is a fraction between 0 and 1, not a percentage)
endline_mismatch_by_subject = endline_mismatch_table(data)
print(endline_mismatch_by_subject)

       Subject  # Mismatch  Total observations  % Mismatch
0      English        1940                4480    0.433036
1  Mathematics        2186                4208    0.519487
2        Tamil        1142                4889    0.233586
