In [1]:
import pandas as pd
import numpy as np
import re
import sqlite3

from pathlib import Path
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Text, String, DateTime, Boolean

In [2]:
data_dir = Path("Assignment 1 Data Set/Karachi/")

# Student DB

In [3]:
STUDENT_COLUMN_NAMES = ['SID', 'Name', 'Father', 'DoB', 'Gender', 'Reg Date', 'Reg Status', 'Degree Status', 'Address', 'Qualification']

In [4]:
# SQLAlchemy column types passed as `dtype` when the combined student frame is
# written with to_sql. Keys are the student column names plus the "Degree"
# column that is added to each sheet after loading.
std_col_types = {
    "SID": Text,
    "Name": Text,
    "Father": Text,
    # Stored as text: some source values (e.g. "29/Feb/75") are not valid dates.
    "DoB":  Text,
    "Gender": String(1),
    "Reg Date": DateTime,
    "Reg Status": String(1),
    "Degree Status": String(1),
    "Address": Text,
    "Qualification": Text,
    "Degree": String(2)
    }

In [5]:
student_file = data_dir / "Student.xls"

## BS Level

In [6]:
df_bs_std = pd.read_excel(student_file, sheet_name="BS")
df_bs_std.columns = STUDENT_COLUMN_NAMES

In [7]:
df_bs_std.SID = "KHR_BS_" + df_bs_std.SID.astype(str)

In [8]:
df_bs_std["Degree"] = "BS"

In [9]:
df_bs_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
0,KHR_BS_0,Hussain Ansary,Ubaid Ansary,1974-01-05 00:00:00,M,1994-08-13,A,C,h# 978 Street No.72 Defense Phase 1 KHR,A-Level,BS
1,KHR_BS_1,Shk. Munir Hussaini,Viqar Hamid Hussaini,1974-12-13 00:00:00,M,1994-08-13,A,C,H# 169 s# 0 Mutian wala Thata,A-Level,BS
2,KHR_BS_2,Moeena Momd. Uhad Haque,Momd. Uhad Haque,1976-09-13 00:00:00,F,1994-08-17,A,C,ho. # 484 street # 95 Lalu khait khr,Fsc,BS
3,KHR_BS_3,Zeeshan Mustamsir,Mustamsir Danish,1976-09-15 00:00:00,M,1994-08-20,A,C,House No. 31 street # 91 Dak ghar Sadiqabad,F.Sc.,BS
4,KHR_BS_4,Sheik Ghazi Haque,Anayyat Haque,1976-06-16 00:00:00,M,1994-08-23,A,C,Ho. # 345 S# 86 khr,Fsc,BS


In [10]:
df_bs_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,6601,6601,6601,6601,6601,6601,6601,6601,6601,6601,6601
unique,6601,6396,6398,3622,2,320,2,2,6601,6,1
top,KHR_BS_5083,Daud Kiyyani,Mustamsir Awan,1982-03-04 00:00:00,M,1994-08-13 00:00:00,A,C,Ho. no. 38 S No. 64 Haider road Multan,A-Level,BS
freq,1,4,2,5,4403,23,6600,4800,1,1103,6601
first,,,,,,1994-08-01 00:00:00,,,,,
last,,,,,,2005-01-01 00:00:00,,,,,


## MS Level

In [11]:
df_ms_std = pd.read_excel(student_file, sheet_name="MS")
df_ms_std.columns = STUDENT_COLUMN_NAMES

In [12]:
df_ms_std.SID = "KHR_MS_" + df_ms_std.SID.astype(str)

In [13]:
df_ms_std["Degree"] = "MS"

In [14]:
df_ms_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
0,KHR_MS_0,Rukshanda Ansari,Jan Wajid Ansari,1979-06-16 00:00:00,F,2001-08-16,A,C,Ho. # 910 S# 79 Mandi Sibbi,MSc,MS
1,KHR_MS_1,Bulbul Haqqie,Aga Salman Haqqie,1977-12-13 00:00:00,M,2001-08-13,A,C,H# 169 s# 0 Defense Phase IV KHR,M.Phil,MS
2,KHR_MS_2,Shamim Bakht,Bakht Raza,1979-09-13 00:00:00,F,2001-08-17,A,C,ho. # 484 street # 95 Buliyan wali Usman Kot,MSc,MS
3,KHR_MS_3,Mohammad Yawar Haqqie,Mumtaz Haqqie,1979-09-15 00:00:00,M,2001-08-20,A,C,House No. 31 street # 91 Sultan coloney Karachi,BS,MS
4,KHR_MS_4,Muneer Kishwar Baig,Aun Haroon Baig,1979-06-16 00:00:00,M,2001-08-23,A,C,Ho. # 345 S# 86 Cement factory coloney Chishtian,MSc,MS


In [15]:
df_ms_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600
unique,1600,1583,1575,1102,2,116,1,2,1600,6,1
top,KHR_MS_1008,Khurshid Usmani,Arif Jaffery,1979-10-07 00:00:00,M,2003-08-16 00:00:00,A,C,h no.270 s no.39 Thata,BS,MS
freq,1,2,2,4,1068,16,1600,1200,1,268,1600
first,,,,,,2001-08-01 00:00:00,,,,,
last,,,,,,2004-08-29 00:00:00,,,,,


In [24]:
df_std = pd.concat([df_bs_std, df_ms_std], ignore_index=True)

In [25]:
# df_std.Gender = df_std.Gender.map({'M': False, 'F': True})

In [26]:
df_std["Reg Date"] = pd.to_datetime(df_std["Reg Date"])

In [27]:
df_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
0,KHR_BS_0,Hussain Ansary,Ubaid Ansary,1974-01-05 00:00:00,M,1994-08-13,A,C,h# 978 Street No.72 Defense Phase 1 KHR,A-Level,BS
1,KHR_BS_1,Shk. Munir Hussaini,Viqar Hamid Hussaini,1974-12-13 00:00:00,M,1994-08-13,A,C,H# 169 s# 0 Mutian wala Thata,A-Level,BS
2,KHR_BS_2,Moeena Momd. Uhad Haque,Momd. Uhad Haque,1976-09-13 00:00:00,F,1994-08-17,A,C,ho. # 484 street # 95 Lalu khait khr,Fsc,BS
3,KHR_BS_3,Zeeshan Mustamsir,Mustamsir Danish,1976-09-15 00:00:00,M,1994-08-20,A,C,House No. 31 street # 91 Dak ghar Sadiqabad,F.Sc.,BS
4,KHR_BS_4,Sheik Ghazi Haque,Anayyat Haque,1976-06-16 00:00:00,M,1994-08-23,A,C,Ho. # 345 S# 86 khr,Fsc,BS


In [28]:
df_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,8201,8201,8201,8201,8201,8201,8201,8201,8201,8201,8201
unique,8201,7043,7041,3742,2,320,2,2,7418,12,2
top,KHR_BS_5083,Daud Kiyyani,Umar Pasha,1979-03-10 00:00:00,M,2003-08-16 00:00:00,A,C,house # 467 Street # 30 Taj abad Thata,A-Level,BS
freq,1,5,4,7,5471,38,8200,6000,2,1103,6601
first,,,,,,1994-08-01 00:00:00,,,,,
last,,,,,,2005-01-01 00:00:00,,,,,


In [29]:
dob = pd.to_datetime(df_std.DoB, errors="coerce")

In [30]:
df_std[dob.isnull()]

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
290,KHR_BS_290,Qasim Rehan,Rehan Nabeel,29/Feb/75,M,1994-08-29,A,C,Ho. No. 193 St. No. 14 Chand mari Mirpur Khas,A level,BS
1551,KHR_BS_2151,Affan Baig,Mohammad Sheharyar Baig,29/Feb/79,M,1997-08-16,A,C,house no.568 St. No.52 Zeenat block Karachi,F.Sc.,BS
2662,KHR_BS_1462,Annan Satti,Husain Alam Satti,29/Feb/78,M,1996-08-28,A,C,House No.483 St. No.37 Dehri Thata,F.Sc.,BS
2731,KHR_BS_1531,Hadiqa Abuzar,Abuzar Zeeshan,29/Feb/77,F,1996-08-22,A,C,h no. 292 St. No. 38 Koral Umer Kot,HSSC,BS
4266,KHR_BS_4266,Shafaq Ahmed,Ahmed Ajab,29/Feb/82,F,2001-08-21,A,C,Ho. No.405 St. No.57 Doley shah Umer Kot,A level,BS
4749,KHR_BS_4749,Sumiyya Khanzada,Arif Khanzada,29/Feb/83,F,2001-08-03,A,C,house no. 609 St. No. 98 Shah latif town Karachi,F.Sc.,BS
4886,KHR_BS_4886,Khushbakht Khanzada,Sh. Sarwar Khanzada,29/Feb/83,F,2002-08-16,A,I,h no. 681 St. No. 4 Pwd Coloney KARACHI,HSSC,BS
4887,KHR_BS_4887,Shabir Hussaini,Syyed Muazzum Hussaini,29/Feb/83,M,2002-08-19,A,I,Ho. No. 996 St. No. 67 Sultana abad Khr,A level,BS
5438,KHR_BS_5438,Aga Abdus Sammi Khattak,Abdul Qadeer Ikram Khattak,29/Feb/85,M,2003-08-20,A,I,House No. 694 St. No. 13 Cement factory colone...,F.Sc.,BS
6059,KHR_BS_6059,Arsalan Chohan,Rabbi Daud Chohan,29/Feb/86,M,2004-08-18,A,I,House No.285 St. No.23 Dhamiyal Chishtian,F.Sc.,BS


# Store Student data in DB

In [31]:
engine = create_engine('sqlite:///University.db', echo=True)
sqlite_connection = engine.connect()

2020-11-15 14:38:49,264 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-11-15 14:38:49,266 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 14:38:49,268 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-11-15 14:38:49,268 INFO sqlalchemy.engine.base.Engine ()


In [32]:
std_table = "KHR_Student"
df_std.to_sql(
    std_table, 
    sqlite_connection, 
    if_exists='replace', 
    index=False, 
    dtype=std_col_types)

2020-11-15 14:38:50,417 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("KHR_Student")
2020-11-15 14:38:50,418 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 14:38:50,420 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("KHR_Student")
2020-11-15 14:38:50,421 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 14:38:50,424 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-11-15 14:38:50,426 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 14:38:50,428 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("KHR_Student")
2020-11-15 14:38:50,429 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 14:38:50,433 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2020-11-15 14:38:50,434 INFO sqlalchemy.engine.base.Engine ('KHR_Student',)
2020-11-15 14:38:50,435 INFO sqlalchemy.engine.base.Engine PRAGMA main.forei

In [33]:
sqlite_connection.close()

In [61]:
def get_create_table_string(tablename, connection):
    """Return the ``CREATE TABLE`` statement SQLite stored for ``tablename``.

    Parameters
    ----------
    tablename : str
        Exact name of the table to look up in ``sqlite_master``.
    connection : sqlite3.Connection
        Open connection to the database that holds the table.

    Returns
    -------
    str
        The ``sql`` column of the matching ``sqlite_master`` row.

    Raises
    ------
    IndexError
        If no table with that name exists (same exception type as before,
        now with an explanatory message).
    """
    # Bind the name as a parameter instead of formatting it into the SQL text.
    # A double-quoted token is resolved as an identifier first, so the previous
    # `name = "<tablename>"` compared the column with itself for tables called
    # "name", "type", "sql", ... and broke on names containing a double quote.
    sql = "SELECT sql FROM sqlite_master WHERE name = ? AND type = 'table'"
    row = connection.execute(sql, (tablename,)).fetchone()
    if row is None:
        raise IndexError("no table named {!r} in sqlite_master".format(tablename))
    return row[0]

def add_pk_to_create_table_string(create_table_string, colname):
    """Append ``PRIMARY KEY`` to the definition of ``colname`` in a CREATE TABLE string.

    Expects one column definition per line, each preceded by a newline. The
    first definition whose name is exactly ``colname`` (bare, or wrapped in
    double quotes, backticks or square brackets) is changed.

    Parameters
    ----------
    create_table_string : str
        ``CREATE TABLE`` statement text.
    colname : str
        Name of the column that should become the primary key.

    Returns
    -------
    str
        The statement with ``PRIMARY KEY`` added, or the input unchanged when
        no definition for ``colname`` is found.
    """
    # Escape the name so regex metacharacters in it are matched literally.
    name = re.escape(colname)
    regex = (
        # Newline, indentation, then the exact column name; the lookahead stops
        # "id" from matching "idx" and the line anchor stops it matching "semester_id".
        r'(\n[ \t]*(?:"{0}"|`{0}`|\[{0}\]|{0})(?![\w"])'
        # Rest of the definition; parenthesised arguments such as NUMERIC(10, 2)
        # are consumed whole so their comma is not mistaken for the separator.
        r'(?:[^,\n(]|\([^)]*\))*?)'
        # Separator: a comma, or the line/text end for the last column.
        r'([ \t]*(?:,|\n|$))'
    ).format(name)
    return re.sub(regex, r"\1 PRIMARY KEY\2", create_table_string, count=1)

def add_pk_to_sqlite_table(tablename, index_column, connection):
    """Rebuild ``tablename`` so that ``index_column`` is declared PRIMARY KEY.

    The table is renamed, re-created from its stored ``CREATE TABLE`` text with
    ``PRIMARY KEY`` added to ``index_column``, refilled from the renamed copy,
    and the copy is dropped, all inside one transaction.

    Parameters
    ----------
    tablename : str
        Name of the existing table.
    index_column : str
        Column that should become the primary key.
    connection : sqlite3.Connection
        Open connection to the database.

    Raises
    ------
    ValueError
        If the definition of ``index_column`` could not be located, so no
        ``PRIMARY KEY`` would have been added.
    sqlite3.Error
        Re-raised after rolling back when the rebuild script fails.
    """
    def quote(identifier):
        # Double-quote an identifier, doubling embedded quotes, so names with
        # spaces or special characters stay valid in the generated SQL.
        return '"{}"'.format(identifier.replace('"', '""'))

    original_cts = get_create_table_string(tablename, connection)
    cts = add_pk_to_create_table_string(original_cts, index_column)
    if cts == original_cts:
        # Without this check the table would be rebuilt unchanged, silently.
        raise ValueError(
            "column {!r} not found in the definition of table {!r}".format(
                index_column, tablename))

    template = """
    BEGIN TRANSACTION;
        ALTER TABLE {table} RENAME TO {table_old};

        {cts};

        INSERT INTO {table} SELECT * FROM {table_old};

        DROP TABLE {table_old};

    COMMIT TRANSACTION;
    """

    create_and_drop_sql = template.format(
        table=quote(tablename),
        table_old=quote(tablename + "_old_"),
        cts=cts)
    try:
        connection.executescript(create_and_drop_sql)
    except sqlite3.Error:
        # The script stops at the failing statement with its transaction still
        # open; undo the partial rebuild before reporting the error.
        connection.rollback()
        raise


In [66]:
# Make SID the primary key of the student table. try/finally closes the
# connection even when the rebuild raises, so no handle on University.db
# is left open.
con = sqlite3.connect("University.db")
try:
    add_pk_to_sqlite_table(std_table, "SID", con)
finally:
    con.close()

# Process BS Course Reg Table

In [3]:
bs_file = data_dir / "Reg_BS_KHR.xls"

In [4]:
COURSE_COLUMN_NAMES = ['SID', 'Course', 'Score', 'Semester', 'Discipline', 'Degree']

In [5]:
# SQLAlchemy column types passed as `dtype` when the course-registration frame
# is written with to_sql. Keys have to be the frame's column names.
reg_col_types = {
    "SID": Text,
    # The frame column is "Course" (COURSE_COLUMN_NAMES); the former key
    # "Courses" matched no column.
    "Course": String(8),
    "Score": Integer,
    "Semester":  String(10),
    "Year": DateTime,
    # MS disciplines such as "MS-SPM" are six characters long.
    "Discipline": String(6),
    "Degree": String(2)
    }

In [6]:
df_bs = pd.read_excel(bs_file)
df_bs["Degree"] = "BS"
df_bs.columns = COURSE_COLUMN_NAMES

In [7]:
df_bs.SID = "KHR_BS_" + df_bs.SID.astype(str)

In [8]:
df_bs.head()

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
0,KHR_BS_0,CS-101,72,Fall94,CS,BS
1,KHR_BS_0,CS-102,78,Fall94,CS,BS
2,KHR_BS_0,CS-103,53,Fall94,CS,BS
3,KHR_BS_0,CS-104,58,Fall94,CS,BS
4,KHR_BS_0,SS-201,93,Fall94,CS,BS


In [9]:
df_bs.describe(include="all")

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
count,57600,57600,57600.0,57600,57600,57600
unique,1200,89,,10,4,1
top,KHR_BS_909,CS-103,,Fall96,SE,BS
freq,48,1200,,7200,14400,57600
mean,,,74.478715,,,
std,,,14.439515,,,
min,,,50.0,,,
25%,,,62.0,,,
50%,,,74.0,,,
75%,,,87.0,,,


# Process MS Course Reg Table

In [10]:
ms_file = data_dir / "Reg_MS_KHR.xls"

In [11]:
df_ms = pd.read_excel(ms_file)
df_ms["Degree"] = "MS"
df_ms.columns = COURSE_COLUMN_NAMES

In [12]:
df_ms.SID = "KHR_MS_" + df_ms.SID.astype(str)

In [13]:
df_ms.head()

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
0,KHR_MS_0,CS-801,79,Fall01,MS-CS,MS
1,KHR_MS_0,CS-802,60,Fall01,MS-CS,MS
2,KHR_MS_0,CS-703,76,Fall01,MS-CS,MS
3,KHR_MS_1,CS-801,50,Fall01,MS-CS,MS
4,KHR_MS_1,CS-802,69,Fall01,MS-CS,MS


# Concat BS and MS Course Reg Tables

In [15]:
df_reg = pd.concat([df_bs, df_ms], ignore_index=True)
df_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
0,KHR_BS_0,CS-101,72,Fall94,CS,BS
1,KHR_BS_0,CS-102,78,Fall94,CS,BS
2,KHR_BS_0,CS-103,53,Fall94,CS,BS
3,KHR_BS_0,CS-104,58,Fall94,CS,BS
4,KHR_BS_0,SS-201,93,Fall94,CS,BS


In [16]:
# Split values such as "Fall94" into a season label ("Fall") and a year.
# Raw strings avoid the invalid "\d" escape, and regex=True is explicit because
# str.replace treats the pattern literally by default in pandas 2.x.
# NOTE(review): re-running this cell on an already split Semester column yields
# missing years; run it once per freshly built df_reg.
df_reg['Year'] = df_reg.Semester.str.extract(r'(\d+)', expand=False)
df_reg.Semester = df_reg.Semester.str.replace(r'\d+', '', regex=True)
df_reg["Year"] = pd.to_datetime(df_reg["Year"], format='%y')

In [17]:
new_order_reg = ['SID', 'Course', 'Score', 'Semester', 'Year', 'Discipline', 'Degree']
df_reg = df_reg[new_order_reg]
df_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
0,KHR_BS_0,CS-101,72,Fall,1994-01-01,CS,BS
1,KHR_BS_0,CS-102,78,Fall,1994-01-01,CS,BS
2,KHR_BS_0,CS-103,53,Fall,1994-01-01,CS,BS
3,KHR_BS_0,CS-104,58,Fall,1994-01-01,CS,BS
4,KHR_BS_0,SS-201,93,Fall,1994-01-01,CS,BS


In [18]:
df_reg.describe(include="all")

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
count,74400,74400,74400.0,74400,74400,74400,74400
unique,2800,120,,2,11,8,2
top,KHR_BS_1021,CS-802,,Spring,1997-01-01 00:00:00,SE,BS
freq,48,1600,,37200,14400,14400,57600
first,,,,,1994-01-01 00:00:00,,
last,,,,,2005-01-01 00:00:00,,
mean,,,74.446505,,,,
std,,,14.438716,,,,
min,,,50.0,,,,
25%,,,62.0,,,,


In [19]:
engine = create_engine('sqlite:///University.db', echo=True)

reg_table = "KHR_Course_Reg"

# The context manager closes the connection even if to_sql raises; the name
# `sqlite_connection` stays bound (closed) afterwards, as before.
with engine.connect() as sqlite_connection:
    df_reg.to_sql(
        reg_table,
        sqlite_connection,
        if_exists='replace',
        index=True,
        dtype=reg_col_types)

2020-11-15 12:31:31,437 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-11-15 12:31:31,437 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:31:31,438 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-11-15 12:31:31,439 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:31:31,449 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("KHR_Course_Reg")
2020-11-15 12:31:31,449 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:31:31,451 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("KHR_Course_Reg")
2020-11-15 12:31:31,452 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:31:31,454 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-11-15 12:31:31,455 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:31:31,457 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("KHR_Course_Reg")
2020-11-15 12:31:31,458 INFO

## Process Semester Table

In [149]:
semester_table = "Semesters"

# Lookup table with one row per distinct value of df_reg.Semester; the default
# integer index is labelled semester_id.
# NOTE(review): the saved output lists names such as "Fall94", while the earlier
# split cell strips the digits from Semester - confirm which form is intended.
df_sem = (
    pd.DataFrame({"semester_name": df_reg.Semester.unique()})
    .rename_axis("semester_id")
)
df_sem

Unnamed: 0_level_0,semester_name
semester_id,Unnamed: 1_level_1
0,Fall94
1,Spring95
2,Fall95
3,Spring96
4,Fall96
5,Spring97
6,Fall97
7,Spring98
8,Fall98
9,Spring99


In [150]:
# Write through the engine instead of `sqlite_connection`: that connection is
# closed at the end of the course-registration cell, so reusing it fails when
# the notebook runs top to bottom. Given an engine, pandas manages its own
# connection for the write.
df_sem.to_sql(
    semester_table,
    engine,
    if_exists='replace'
    )

2020-11-14 18:03:12,949 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("Semesters")
2020-11-14 18:03:12,951 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:03:12,953 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("Semesters")
2020-11-14 18:03:12,955 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:03:12,957 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-11-14 18:03:12,959 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:03:12,962 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("Semesters")
2020-11-14 18:03:12,963 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:03:12,966 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2020-11-14 18:03:12,968 INFO sqlalchemy.engine.base.Engine ('Semesters',)
2020-11-14 18:03:12,970 INFO sqlalchemy.engine.base.Engine PRAGMA main.foreign_key_l

In [151]:
# Make semester_id the primary key of the semester table. try/finally closes
# the connection even when the rebuild raises.
con = sqlite3.connect("University.db")
try:
    add_pk_to_sqlite_table(semester_table, "semester_id", con)
finally:
    con.close()

In [162]:
# Map each semester name to its position in df_sem.
semester_names = df_sem["semester_name"].tolist()
sem_map = dict(zip(semester_names, range(len(semester_names))))
sem_map

{'Fall94': 0,
 'Spring95': 1,
 'Fall95': 2,
 'Spring96': 3,
 'Fall96': 4,
 'Spring97': 5,
 'Fall97': 6,
 'Spring98': 7,
 'Fall98': 8,
 'Spring99': 9,
 'Fall01': 10,
 'Spring02': 11,
 'Fall02': 12,
 'Spring03': 13,
 'Fall03': 14,
 'Spring04': 15,
 'Fall04': 16,
 'Spring05': 17}

In [163]:
df_reg.Semester = df_reg.Semester.map(sem_map)
df_reg.head()

Unnamed: 0,SID,Courses,Score,Semester,Discipline,Degree
0,KHR_BS_0,CS-101,72,0,CS,BS
1,KHR_BS_0,CS-102,78,0,CS,BS
2,KHR_BS_0,CS-103,53,0,CS,BS
3,KHR_BS_0,CS-104,58,0,CS,BS
4,KHR_BS_0,SS-201,93,0,CS,BS


In [168]:
sum(df_reg.Semester.isnull())

0

# Process Courses Table

In [173]:
courses_table = "Courses"
# Lookup table with one row per distinct course code. The registration frame's
# column is "Course" (COURSE_COLUMN_NAMES); `df_reg.Courses` does not exist and
# raised AttributeError.
df_courses = pd.DataFrame(df_reg["Course"].unique())
df_courses.columns = ["course_name"]
df_courses.index.names = ['course_id']
df_courses

Unnamed: 0_level_0,course_name
course_id,Unnamed: 1_level_1
0,CS-101
1,CS-102
2,CS-103
3,CS-104
4,SS-201
...,...
115,CS-708
116,CS-968
117,CS-977
118,CS-798


In [174]:
# Write through the engine instead of `sqlite_connection`, which is already
# closed at this point when the notebook runs top to bottom. Given an engine,
# pandas manages its own connection for the write.
df_courses.to_sql(
    courses_table,
    engine,
    if_exists='replace'
    )

2020-11-14 18:14:10,647 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("Courses")
2020-11-14 18:14:10,649 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:14:10,652 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("Courses")
2020-11-14 18:14:10,653 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:14:10,657 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE "Courses" (
	course_id BIGINT, 
	course_name TEXT
)


2020-11-14 18:14:10,658 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:14:10,665 INFO sqlalchemy.engine.base.Engine COMMIT
2020-11-14 18:14:10,667 INFO sqlalchemy.engine.base.Engine CREATE INDEX "ix_Courses_course_id" ON "Courses" (course_id)
2020-11-14 18:14:10,668 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:14:10,672 INFO sqlalchemy.engine.base.Engine COMMIT
2020-11-14 18:14:10,674 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-11-14 18:14:10,677 INFO sqlalchemy.engine.base.Engine INSERT INTO "Courses" (course_id, course_name) VALUE

In [175]:
# Make course_id the primary key of the course table. try/finally closes the
# connection even when the rebuild raises.
con = sqlite3.connect("University.db")
try:
    add_pk_to_sqlite_table(courses_table, "course_id", con)
finally:
    con.close()

In [177]:
# Map each course code to its position in df_courses.
course_names = df_courses["course_name"].tolist()
course_map = dict(zip(course_names, range(len(course_names))))

In [178]:
# Replace course codes by their ids. The column is named "Course"; assigning to
# the non-existent attribute `df_reg.Courses` would not touch the frame's data.
df_reg["Course"] = df_reg["Course"].map(course_map)
df_reg.head()

Unnamed: 0,SID,Courses,Score,Semester,Discipline,Degree
0,KHR_BS_0,0,72,0,CS,BS
1,KHR_BS_0,1,78,0,CS,BS
2,KHR_BS_0,2,53,0,CS,BS
3,KHR_BS_0,3,58,0,CS,BS
4,KHR_BS_0,4,93,0,CS,BS


# Process Disciplines Table

In [183]:
discipline_table = "Disciplines"

# Lookup table with one row per distinct discipline; the default integer index
# is labelled discipline_id.
df_disciplines = (
    pd.DataFrame({"discipline_name": df_reg.Discipline.unique()})
    .rename_axis("discipline_id")
)
df_disciplines

Unnamed: 0_level_0,discipline_name
discipline_id,Unnamed: 1_level_1
0,CS
1,TC
2,SE
3,CE
4,MS-CS
5,MS-TC
6,MS-SPM
7,MS-NW


In [184]:
# Write through the engine instead of `sqlite_connection`, which is already
# closed at this point when the notebook runs top to bottom. Given an engine,
# pandas manages its own connection for the write.
df_disciplines.to_sql(
    discipline_table,
    engine,
    if_exists='replace'
    )

2020-11-14 18:24:58,506 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("Disciplines")
2020-11-14 18:24:58,507 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:24:58,509 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("Disciplines")
2020-11-14 18:24:58,511 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:24:58,514 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE "Disciplines" (
	discipline_id BIGINT, 
	discipline_name TEXT
)


2020-11-14 18:24:58,515 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:24:58,520 INFO sqlalchemy.engine.base.Engine COMMIT
2020-11-14 18:24:58,522 INFO sqlalchemy.engine.base.Engine CREATE INDEX "ix_Disciplines_discipline_id" ON "Disciplines" (discipline_id)
2020-11-14 18:24:58,523 INFO sqlalchemy.engine.base.Engine ()
2020-11-14 18:24:58,528 INFO sqlalchemy.engine.base.Engine COMMIT
2020-11-14 18:24:58,531 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-11-14 18:24:58,532 INFO sqlalchemy.engine.base.Engine INSERT INTO "Dis

In [191]:
# Make discipline_id the primary key of the discipline table. try/finally
# closes the connection even when the rebuild raises.
# NOTE(review): the saved output of this cell is "database is locked";
# presumably another open connection still held University.db - confirm.
con = sqlite3.connect("University.db")
try:
    add_pk_to_sqlite_table(discipline_table, "discipline_id", con)
finally:
    con.close()

OperationalError: database is locked

In [188]:
# Map each discipline name to its position in df_disciplines.
discipline_names = df_disciplines["discipline_name"].tolist()
discipline_map = dict(zip(discipline_names, range(len(discipline_names))))
discipline_map

{'CS': 0,
 'TC': 1,
 'SE': 2,
 'CE': 3,
 'MS-CS': 4,
 'MS-TC': 5,
 'MS-SPM': 6,
 'MS-NW': 7}