***
# ICAD Data Management
***

In [1]:
#Import libraries
import os, sys, json
import pandas as pd
import numpy as np
import toml

In [2]:
#Set Pandas options
pd.set_option('display.max_columns', None)

In [3]:
wdir = os.getcwd()
wdir

'/Users/akshayranade/Documents/Personal/icad_data_management'

In [4]:
#Change directory to database operations to load all the methods
#Import all methods from database_operations
from database_operations.create_database_function import create_postgres_db
from database_operations.create_df_from_sql_function import create_df_from_sql
from database_operations.create_table_from_pandas_df_function import create_table_from_df

from data_cleaning.clean_column_names import clean_column_names
from data_cleaning.clean_strings import clean_string_data

In [5]:
# Load config
config = toml.load(wdir + "/config.toml")
print (config)

{'postgres_credentials': {'username': 'postgres', 'password': '', 'host': '127.0.0.1', 'port': '5432'}, 'db_details': {'dbname': 'icad_db', 'school_master': 'school_master', 'all_students': 'all_students', 'compitative_exams': 'compitative_exams', 'cpa_results': 'cpa_results', 'exam_marks': 'exam_marks', 'admissions': 'admissions'}, 'file_locations': {'all_schools': '/Users/akshayranade/Documents/Personal/icad_data_management/data/sales/activity_report.csv'}}


In [6]:
#Create database in postgres called icad_db
create_postgres_db(config['postgres_credentials'], config['db_details']['dbname'])

Database already exists:  icad_db
PostgreSQL connection is closed



***
## Load all school data in a table
***

In [7]:
#Read all schools data from csv
all_schools_df = pd.read_csv(config["file_locations"]["all_schools"])

In [8]:
all_schools_df.head(10)

Unnamed: 0,institute_name,address1,address2,city,district,state,board,contact_no,email_address,category,branch_name
0,ABC Convent and High School,Manewada,Manewada,Nagpur,Nagpur,Maharashtra,State,9096300320.0,,B,EAST CENTRE - NAGPUR
1,Adarsh Sanskar Vidyalaya Hasanbagh,Hasanbagh,Hasanbagh,Nagpur,Nagpur,Maharashtra,State,,,B,EAST CENTRE - NAGPUR
2,Adarsh Sanskar Vidyalaya CBSE Hudkeshwar,Hudkeshwar,Hudkeshwar,Nagpur,Nagpur,Maharashtra,State,7620264605.0,,B,EAST CENTRE - NAGPUR
3,Adarsh Vidya Mandir,CA Road,CA Road,Nagpur,Nagpur,Maharashtra,State,,,B,EAST CENTRE - NAGPUR
4,Adarsh Vidyalaya,Umrer,Umrer,Umred,Nagpur,Maharashtra,State,,,B,EAST CENTRE - NAGPUR
5,Amit High School,Dighori,Dighori,Nagpur,Nagpur,Maharashtra,State,,,C,EAST CENTRE - NAGPUR
6,Ankush Classes,Near dada Saheb thakre High School ; Aashirwaa...,Aashirwaad Nagar,Nagpur,Nagpur,Maharashtra,State,9881015403.0,noid@emailid.com,B,EAST CENTRE - NAGPUR
7,Ashok Kanya Vidyalaya,Umrer,Umrer,Umred,Nagpur,Maharashtra,State,,,C,EAST CENTRE - NAGPUR
8,Ashok Vidyalaya Boys,Umrer,Umrer,Umred,Nagpur,Maharashtra,State,,,C,EAST CENTRE - NAGPUR
9,Baba Nanak Sidhi Hindi High School,Nandanwan,Nandanwan,Nagpur,Nagpur,Maharashtra,State,7122764288.0,babananaksindhihindihs@gmail.com,C,EAST CENTRE - NAGPUR


In [9]:
all_schools_df.dtypes

institute_name    object
 address1         object
 address2         object
 city             object
 district         object
 state            object
 board            object
 contact_no       object
 email_address    object
category          object
branch_name       object
dtype: object

In [10]:
all_schools_df.columns

Index(['institute_name', ' address1', ' address2', ' city', ' district',
       ' state', ' board', ' contact_no', ' email_address', 'category',
       'branch_name'],
      dtype='object')

In [11]:
#Clean column names
all_schools_df = clean_column_names(all_schools_df)

In [12]:
all_schools_df.columns

Index(['institute_name', 'address1', 'address2', 'city', 'district', 'state',
       'board', 'contact_no', 'email_address', 'category', 'branch_name'],
      dtype='object')

In [13]:
#Clean school names, addresses
all_schools_df['institute_name'] = all_schools_df['institute_name'].apply(lambda x: clean_string_data(x))
all_schools_df['address1'] = all_schools_df['address1'].apply(lambda x: clean_string_data(x))
all_schools_df['address2'] = all_schools_df['address2'].apply(lambda x: clean_string_data(x))
all_schools_df['address2'] = all_schools_df['address2'].apply(lambda x: clean_string_data(x))
all_schools_df['city'] = all_schools_df['city'].apply(lambda x: clean_string_data(x))
all_schools_df['state'] = all_schools_df['state'].apply(lambda x: clean_string_data(x))

In [14]:
all_schools_df.head()

Unnamed: 0,institute_name,address1,address2,city,district,state,board,contact_no,email_address,category,branch_name
0,ABC CONVENT AND HIGH SCHOOL,MANEWADA,MANEWADA,NAGPUR,Nagpur,MAHARASHTRA,State,9096300320.0,,B,EAST CENTRE - NAGPUR
1,ADARSH SANSKAR VIDYALAYA HASANBAGH,HASANBAGH,HASANBAGH,NAGPUR,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR
2,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR,HUDKESHWAR,HUDKESHWAR,NAGPUR,Nagpur,MAHARASHTRA,State,7620264605.0,,B,EAST CENTRE - NAGPUR
3,ADARSH VIDYA MANDIR,CA ROAD,CA ROAD,NAGPUR,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR
4,ADARSH VIDYALAYA,UMRER,UMRER,UMRED,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR


In [16]:
#Create a table for all schools using a generic function
create_table_from_df(config['postgres_credentials'], config['db_details']['dbname'], config['db_details']['school_master'], all_schools_df)

Table successfully created: school_master


***
## Read all students data from various sheets, collate it to create a student master table

1. Student master table will have student name, school name and a unique student id
2. Even if same student appears in different sheets, their information will be captured only once in the student_master table
3. School names to be standardised based on the school_master table
***

## 1. LEAP data

In [32]:
# Load the Leap data
leap_data_df = pd.read_excel('/Users/akshayranade/Documents/Personal/icad_data_management/data/sales/1. 8, 9, 10, 11 Moving All Data.xlsx', '7,8,9,10- Leap 2023-24')

In [33]:
leap_data_df.head()

Unnamed: 0,Roll No,Student Name,City,Mobile 1,Mobile 2,School,Class
0,121574,Sidhant Bobade,Amravati Centre,,,"Dnyanmata High School Camp road, near IMA hall...",7th
1,117785,Parth Bonde,Amravati Centre,,,"K.K. Cambridge School, Amravati",7th
2,122267,Ninad Matkar,Amravati Centre,,,"Orchid City International School, Amravati",7th
3,121575,Rachit Ravi Tayade,Amravati Centre,,,"Dnyanmata High School Camp road, near IMA hall...",7th
4,117780,Pranjal P Kantute,Amravati Centre,,,"K.K. Cambridge School, Amravati",7th


In [34]:
#Clean column names
leap_data_df = clean_column_names(leap_data_df)

In [35]:
#Clean student names, school names, class
leap_data_df['student_name'] = leap_data_df['student_name'].apply(lambda x: clean_string_data(x))
leap_data_df['city'] = leap_data_df['city'].apply(lambda x: clean_string_data(x))
leap_data_df['school'] = leap_data_df['school'].apply(lambda x: clean_string_data(x))
leap_data_df['class'] = leap_data_df['class'].apply(lambda x: str(x).replace('th', '')).astype(int)

In [43]:
leap_data_df.head(truncate=False)

TypeError: NDFrame.head() got an unexpected keyword argument 'truncate'

In [30]:
leap_data_df.value_counts('class')

class
10    20317
9      7330
8      4503
7      4034
Name: count, dtype: int64

In [31]:
leap_data_df.shape[0]

36184

In [38]:
school_master = create_df_from_sql(connection_config=config['postgres_credentials'], dbname='icad_db', tbname='school_master')

In [39]:
school_master.head()

Unnamed: 0,institute_name,address1,address2,city,district,state,board,contact_no,email_address,category,branch_name
0,ABC CONVENT AND HIGH SCHOOL,MANEWADA,MANEWADA,NAGPUR,Nagpur,MAHARASHTRA,State,9096300320.0,,B,EAST CENTRE - NAGPUR
1,ADARSH SANSKAR VIDYALAYA HASANBAGH,HASANBAGH,HASANBAGH,NAGPUR,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR
2,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR,HUDKESHWAR,HUDKESHWAR,NAGPUR,Nagpur,MAHARASHTRA,State,7620264605.0,,B,EAST CENTRE - NAGPUR
3,ADARSH VIDYA MANDIR,CA ROAD,CA ROAD,NAGPUR,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR
4,ADARSH VIDYALAYA,UMRER,UMRER,UMRED,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR


In [41]:
school_master['school_name_add1'] = school_master['institute_name'] + ' ' + school_master['address1']
school_master['school_name_add2'] = school_master['institute_name'] + ' ' + school_master['address2']
school_master['school_name_add1_city'] = school_master['institute_name'] + ' ' + school_master['address1'] + ' ' + school_master['city']
school_master['school_name_add2_city'] = school_master['institute_name'] + ' ' + school_master['address2'] + ' ' + school_master['city']
school_master['school_name_full_add'] = school_master['institute_name'] + ' ' + school_master['address1'] + ' ' + school_master['address2'] + ' ' + school_master['city'] 

In [42]:
school_master.head()

Unnamed: 0,institute_name,address1,address2,city,district,state,board,contact_no,email_address,category,branch_name,school_name_add1,school_name_add2,school_name_add1_city,school_name_add2_city,school_name_full_add
0,ABC CONVENT AND HIGH SCHOOL,MANEWADA,MANEWADA,NAGPUR,Nagpur,MAHARASHTRA,State,9096300320.0,,B,EAST CENTRE - NAGPUR,ABC CONVENT AND HIGH SCHOOL MANEWADA,ABC CONVENT AND HIGH SCHOOL MANEWADA,ABC CONVENT AND HIGH SCHOOL MANEWADA NAGPUR,ABC CONVENT AND HIGH SCHOOL MANEWADA NAGPUR,ABC CONVENT AND HIGH SCHOOL MANEWADA MANEWADA ...
1,ADARSH SANSKAR VIDYALAYA HASANBAGH,HASANBAGH,HASANBAGH,NAGPUR,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR,ADARSH SANSKAR VIDYALAYA HASANBAGH HASANBAGH,ADARSH SANSKAR VIDYALAYA HASANBAGH HASANBAGH,ADARSH SANSKAR VIDYALAYA HASANBAGH HASANBAGH N...,ADARSH SANSKAR VIDYALAYA HASANBAGH HASANBAGH N...,ADARSH SANSKAR VIDYALAYA HASANBAGH HASANBAGH H...
2,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR,HUDKESHWAR,HUDKESHWAR,NAGPUR,Nagpur,MAHARASHTRA,State,7620264605.0,,B,EAST CENTRE - NAGPUR,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR HUDKE...,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR HUDKE...,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR HUDKE...,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR HUDKE...,ADARSH SANSKAR VIDYALAYA CBSE HUDKESHWAR HUDKE...
3,ADARSH VIDYA MANDIR,CA ROAD,CA ROAD,NAGPUR,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR,ADARSH VIDYA MANDIR CA ROAD,ADARSH VIDYA MANDIR CA ROAD,ADARSH VIDYA MANDIR CA ROAD NAGPUR,ADARSH VIDYA MANDIR CA ROAD NAGPUR,ADARSH VIDYA MANDIR CA ROAD CA ROAD NAGPUR
4,ADARSH VIDYALAYA,UMRER,UMRER,UMRED,Nagpur,MAHARASHTRA,State,,,B,EAST CENTRE - NAGPUR,ADARSH VIDYALAYA UMRER,ADARSH VIDYALAYA UMRER,ADARSH VIDYALAYA UMRER UMRED,ADARSH VIDYALAYA UMRER UMRED,ADARSH VIDYALAYA UMRER UMRER UMRED


In [None]:
#Match a string with a school name


In [None]:
# A function to standardise the school name with reference to the schoool_master table
def standard_school_name(school_name_str) -> str:
    #Read school_master data
    school_master = create_df_from_sql(connection_config=config['postgres_credentials'], dbname='icad_db', tbname='school_master')
     

In [107]:
#Split the school name and address
leap_data_df['school'] = leap_data_df['school'].str.strip()

In [112]:
leap_data_df.head()

Unnamed: 0,roll_no,student_name,city,mobile_1,mobile_2,school,class
0,121574,Sidhant Bobade,Amravati Centre,,,"Dnyanmata High School Camp road, near IMA hall...",7th
1,117785,Parth Bonde,Amravati Centre,,,"K.K. Cambridge School, Amravati",7th
2,122267,Ninad Matkar,Amravati Centre,,,"Orchid City International School, Amravati",7th
3,121575,Rachit Ravi Tayade,Amravati Centre,,,"Dnyanmata High School Camp road, near IMA hall...",7th
4,117780,Pranjal P Kantute,Amravati Centre,,,"K.K. Cambridge School, Amravati",7th


In [122]:
leap_data_df['school_name'] = leap_data_df['school'].apply(lambda x: str(x).split(',')[0] if len(str(x).split(',')) >= 1 else None)

In [124]:
leap_data_df['school_address'] = leap_data_df['school'].apply(lambda x: [' ' + str(x).split(',')[i] for i in (1, len(str(x).split(',')))] if len(str(x).split(',')) > 1 else None)

IndexError: list index out of range

In [115]:
leap_data_df.head()

Unnamed: 0,roll_no,student_name,city,mobile_1,mobile_2,school,class,school_name
0,121574,Sidhant Bobade,Amravati Centre,,,"Dnyanmata High School Camp road, near IMA hall...",7th,Dnyanmata High School Camp road
1,117785,Parth Bonde,Amravati Centre,,,"K.K. Cambridge School, Amravati",7th,K.K. Cambridge School
2,122267,Ninad Matkar,Amravati Centre,,,"Orchid City International School, Amravati",7th,Orchid City International School
3,121575,Rachit Ravi Tayade,Amravati Centre,,,"Dnyanmata High School Camp road, near IMA hall...",7th,Dnyanmata High School Camp road
4,117780,Pranjal P Kantute,Amravati Centre,,,"K.K. Cambridge School, Amravati",7th,K.K. Cambridge School


In [102]:
leap_data_df[leap_data_df['school_name'] == 'None'].head(40)

Unnamed: 0,roll_no,student_name,city,mobile_1,mobile_2,school,class,school_name,school_split
