In [312]:
import pandas as pd
import numpy as np
import re
import sqlite3
import glob

from pathlib import Path
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Text, String, DateTime, Boolean

In [227]:
# Root folder of the Lahore campus text files, relative to the notebook.
lahore_subdir = "Assignment 1 Data Set/Lahore/"
data_dir = Path(lahore_subdir)

# Student DB

In [3]:
# Canonical column names applied to every student table, in final column order.
STUDENT_COLUMN_NAMES = [
    'SID',
    'Name',
    'Father',
    'DoB',
    'Gender',
    'Reg Date',
    'Reg Status',
    'Degree Status',
    'Address',
    'Qualification',
]

In [122]:
# SQLAlchemy column types handed to DataFrame.to_sql for the student table,
# keyed by DataFrame column name.
std_col_types = dict([
    ("SID", Text),
    ("Name", Text),
    ("Father", Text),
    ("DoB", DateTime),
    ("Gender", Boolean),
    ("Reg Date", DateTime),
    ("Reg Status", String(1)),
    ("Degree Status", String(1)),
    ("Address", Text),
    ("Qualification", Text),
    ("Degree", String(2)),
])

## BS Level

In [175]:
# Collect the BS student files once, as a sorted list. A bare rglob() generator
# is exhausted after a single pass (re-running the loading cell would then find
# no files) and yields paths in filesystem-dependent order.
# The pattern requires a digit right after "Lhr_Student_", so the
# "Lhr_Student_MS_*" files are not matched here.
bs_student_files = sorted(data_dir.rglob('Lhr_Student_[0-9]*.txt'))

In [176]:
# Parse each BS student file into its own DataFrame (first row is the header).
bs_std_dfs = [
    pd.read_csv(path, index_col=None, header=0)
    for path in bs_student_files
]

### Add Transfer Student

In [177]:
# Transfer.txt is read with the same settings as the regular student files and
# added to the list of BS frames.
transfer_df = pd.read_csv(data_dir / 'Transfer.txt', index_col=None, header=0)
bs_std_dfs.append(transfer_df)

In [178]:
# Stack all BS frames into one table with a fresh index, then order rows by SID.
df_bs_std = (
    pd.concat(bs_std_dfs, axis=0, ignore_index=True)
    .sort_values(by=['SID'])
)

In [179]:
# Column names as they appear in the raw files, kept for reference.
original_order = df_bs_std.columns.tolist()
original_order

['SID',
 'St_Name',
 'Father_Name',
 'Gender',
 'Address',
 'Date of Birth',
 'Reg Date',
 'Reg Status',
 'Degree Status',
 'Last Degree']

In [180]:
# Target column sequence, written with the raw file headers so it lines up
# position-by-position with STUDENT_COLUMN_NAMES.
new_order = [
    'SID', 'St_Name', 'Father_Name', 'Date of Birth', 'Gender',
    'Reg Date', 'Reg Status', 'Degree Status', 'Address', 'Last Degree',
]

In [181]:
# Reorder the raw columns, then swap each raw header for its canonical name.
bs_renames = dict(zip(new_order, STUDENT_COLUMN_NAMES))
df_bs_std = df_bs_std[new_order].rename(columns=bs_renames)

In [182]:
# Prefix every student id with "LHR_BS_" after converting it to text.
df_bs_std["SID"] = df_bs_std["SID"].astype(str).radd("LHR_BS_")

In [183]:
# Tag every row of this table with the BS degree level.
df_bs_std = df_bs_std.assign(Degree="BS")

In [184]:
# Preview the first rows of the BS student table.
df_bs_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
2000,LHR_BS_0,Hafiz Viqar-ul-Islam Chohan,Raja Sharif Chohan,11-Apr-74,0,10-Aug-94,A,C,House # 855 St. # 37 Shuakat coloney Bahawalpur,FSc,BS
2001,LHR_BS_1,Muazzam Abdul Hameed Shah,Sh. Haseeb Shah,13-Dec-74,0,13-Aug-94,A,C,H# 169 s# 0 Mutian wala Dera ghazi khan,A-Level,BS
2002,LHR_BS_2,Babar Abdul Mujeeb,Abdul Mujeeb Anwer,13-Sep-76,0,17-Aug-94,A,C,ho. # 484 street # 95 Buliyan wali Gujrawala,Fsc,BS
2003,LHR_BS_3,Muhammad Mudassir Abbasi,Khurram Abbasi,15-Sep-76,0,20-Aug-94,A,C,House No. 31 street # 91 Dak ghar Wazirabad,F.Sc.,BS
2004,LHR_BS_4,Hadiqa Durrani,Mahtab Abdus Sattar Durrani,16-Jun-76,1,23-Aug-94,A,C,Ho. # 345 S# 86 Cement factory coloney Sargodha,Fsc,BS


In [185]:
# Summary statistics for every column; include="all" also covers non-numeric ones.
df_bs_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,4401,4401,4401,4399,4400,4401,4401,4401,4401,4401,4401
unique,4401,4309,4305,2933,3,320,2,2,4401,6,1
top,LHR_BS_436,Kishwar Pasha,Jameel Khan,26-Mar-84,0,25-Aug-98,A,C,house # 987 Street # 25 Mujahid block D.I. Khan,F.Sc.,BS
freq,1,4,2,5,2932,16,4400,3200,1,737,4401


## MS Level

In [186]:
# Collect the MS student files once, as a sorted list. A bare rglob() generator
# is exhausted after a single pass (re-running the loading cell would then find
# no files) and yields paths in filesystem-dependent order.
ms_student_files = sorted(data_dir.rglob('Lhr_Student_MS_[0-9]*.txt'))

In [187]:
# Parse each MS student file into its own DataFrame (first row is the header).
ms_std_dfs = [
    pd.read_csv(path, index_col=None, header=0)
    for path in ms_student_files
]

In [188]:
# Stack all MS frames into one table with a fresh index, then order rows by SID.
df_ms_std = (
    pd.concat(ms_std_dfs, axis=0, ignore_index=True)
    .sort_values(by=['SID'])
)

In [189]:
# Target column sequence, written with the raw file headers so it lines up
# position-by-position with STUDENT_COLUMN_NAMES.
new_order = [
    'SID', 'St_Name', 'Father_Name', 'Date of Birth', 'Gender',
    'Reg Date', 'Reg Status', 'Degree Status', 'Address', 'Last Degree',
]

In [190]:
# Reorder the raw columns, then swap each raw header for its canonical name.
ms_renames = dict(zip(new_order, STUDENT_COLUMN_NAMES))
df_ms_std = df_ms_std[new_order].rename(columns=ms_renames)

In [191]:
# Prefix every student id with "LHR_MS_" after converting it to text.
df_ms_std["SID"] = df_ms_std["SID"].astype(str).radd("LHR_MS_")

In [192]:
# Tag every row of this table with the MS degree level.
df_ms_std = df_ms_std.assign(Degree="MS")

In [193]:
# Preview the first rows of the MS student table.
df_ms_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
0,LHR_MS_0,Maryyum Jaffery,Ghulam Mustafa Jaffery,24-Aug-78,1.0,24-Aug-01,A,C,Ho. no. 302 St. no. 9 Pind dadu Dera ghazi khan,M.Sc,MS
1,LHR_MS_1,Firdous Shah,Sahir Rahid Shah,13-Dec-77,0.0,13-Aug-01,A,C,H# 169 s# 0 Mutian wala Dera ghazi khan,M.Phil,MS
2,LHR_MS_2,Arsalan Abuzar Durrani,Mouhammed Ghani Durrani,13-Sep-79,0.0,17-Aug-01,A,C,ho. # 484 street # 95 Buliyan wali Gujrawala,MSc,MS
3,LHR_MS_3,Jibran Abbasi,Sahibzada Tahir Abbasi,15-Sep-79,0.0,20-Aug-01,A,C,House No. 31 street # 91 Dak ghar Wazirabad,BS,MS
4,LHR_MS_4,Hasina Abdul Jabbar,Abdul Jabbar Suleman,16-Jun-79,1.0,23-Aug-01,A,C,Ho. # 345 S# 86 Cement factory coloney Sargodha,MSc,MS


In [194]:
# Summary statistics for every column; include="all" also covers non-numeric ones.
df_ms_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,800,800,800,800,798.0,800,800,800,800,800,800
unique,800,796,799,605,,116,1,2,800,6,1
top,LHR_MS_645,Jamal Shahroze,Amjad Pasha,11-Dec-80,,24-Aug-01,A,C,H# 327 Street No.14 Purana shehar Melsi,BS,MS
freq,1,2,2,3,,9,800,600,1,135,800
mean,,,,,0.333333,,,,,,
std,,,,,0.4717,,,,,,
min,,,,,0.0,,,,,,
25%,,,,,0.0,,,,,,
50%,,,,,0.0,,,,,,
75%,,,,,1.0,,,,,,


## Concat BS and MS

In [195]:
# One student table for both degree levels, BS rows first, with a fresh index.
df_std = pd.concat((df_bs_std, df_ms_std), axis=0, ignore_index=True)

In [196]:
# Recode the 0/1 gender flag as booleans.
# NOTE(review): any value that is not a key of the mapping becomes NaN under
# Series.map — confirm that is the intended handling of unexpected codes.
gender_flags = {0: False, 1: True}
df_std["Gender"] = df_std["Gender"].map(gender_flags)

In [197]:
# Parse the registration date strings into datetimes (format is inferred).
reg_dates = pd.to_datetime(df_std["Reg Date"])
df_std["Reg Date"] = reg_dates

In [198]:
# Parse the date-of-birth strings into datetimes (format is inferred).
birth_dates = pd.to_datetime(df_std["DoB"])
df_std["DoB"] = birth_dates

In [199]:
# Preview the combined student table after the gender and date conversions.
df_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
0,LHR_BS_0,Hafiz Viqar-ul-Islam Chohan,Raja Sharif Chohan,1974-04-11,False,1994-08-10,A,C,House # 855 St. # 37 Shuakat coloney Bahawalpur,FSc,BS
1,LHR_BS_1,Muazzam Abdul Hameed Shah,Sh. Haseeb Shah,1974-12-13,False,1994-08-13,A,C,H# 169 s# 0 Mutian wala Dera ghazi khan,A-Level,BS
2,LHR_BS_2,Babar Abdul Mujeeb,Abdul Mujeeb Anwer,1976-09-13,False,1994-08-17,A,C,ho. # 484 street # 95 Buliyan wali Gujrawala,Fsc,BS
3,LHR_BS_3,Muhammad Mudassir Abbasi,Khurram Abbasi,1976-09-15,False,1994-08-20,A,C,House No. 31 street # 91 Dak ghar Wazirabad,F.Sc.,BS
4,LHR_BS_4,Hadiqa Durrani,Mahtab Abdus Sattar Durrani,1976-06-16,True,1994-08-23,A,C,Ho. # 345 S# 86 Cement factory coloney Sargodha,Fsc,BS


In [200]:
# Summary statistics for every column of the combined student table.
df_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,5201,5201,5201,5199,5197,5201,5201,5201,5201,5201,5201
unique,5201,4794,4578,3089,2,320,2,2,4402,12,2
top,LHR_BS_436,Kishwar Pasha,Amjad Pasha,1979-02-12 00:00:00,False,2002-08-27 00:00:00,A,C,Ho. # 499 Street No.95 kurang nala Dera ghazi...,F.Sc.,BS
freq,1,4,4,7,3464,24,5200,3800,2,737,4401
first,,,,1974-01-02 00:00:00,,1994-08-01 00:00:00,,,,,
last,,,,1986-12-29 00:00:00,,2005-01-12 00:00:00,,,,,


# Store Student data in DB

In [201]:
# Open a connection to the SQLite file University.db; echo=True makes
# SQLAlchemy log every statement it issues.
db_url = 'sqlite:///University.db'
engine = create_engine(db_url, echo=True)
sqlite_connection = engine.connect()

2020-11-15 11:28:02,523 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-11-15 11:28:02,524 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:28:02,525 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-11-15 11:28:02,525 INFO sqlalchemy.engine.base.Engine ()


In [202]:
# Write the student table, replacing any existing table of the same name and
# leaving the DataFrame index out of the stored columns.
std_table = "LHR_Student"
df_std.to_sql(
    name=std_table,
    con=sqlite_connection,
    if_exists='replace',
    index=False,
    dtype=std_col_types,
)

2020-11-15 11:28:03,143 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("LHR_Student")
2020-11-15 11:28:03,144 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:28:03,146 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("LHR_Student")
2020-11-15 11:28:03,148 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:28:03,150 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-11-15 11:28:03,152 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:28:03,154 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("LHR_Student")
2020-11-15 11:28:03,155 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:28:03,157 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2020-11-15 11:28:03,158 INFO sqlalchemy.engine.base.Engine ('LHR_Student',)
2020-11-15 11:28:03,160 INFO sqlalchemy.engine.base.Engine PRAGMA main.forei

In [203]:
# Release the database connection used for the student-table write.
sqlite_connection.close()

# Process BS Course Reg

In [165]:
# Canonical column names applied to every course-registration table.
COURSE_COLUMN_NAMES = [
    'SID',
    'Course',
    'Score',
    'Semester',
    'Discipline',
    'Degree',
]

In [279]:
# SQLAlchemy column types handed to DataFrame.to_sql for the course-registration
# table, keyed by DataFrame column name.
# The key must be "Course" (singular) to match the DataFrame column; with the
# previous "Courses" spelling the entry matched no column and String(8) was
# never applied.
reg_col_types = {
    "SID": Text,
    "Course": String(8),
    "Score": Integer,
    "Semester":  String(10),
    "Year": DateTime,
    "Discipline": String(3),
    "Degree": String(2)
    }

In [229]:
# Collect the BS registration files once, as a sorted list. A bare rglob()
# generator is exhausted after a single pass (re-running the loading cell would
# then find no files) and yields paths in filesystem-dependent order.
# The pattern requires a digit right after "Lhr_Detail_", so the
# "Lhr_Detail_MS_*" files are not matched here.
bs_reg_files = sorted(data_dir.rglob('Lhr_Detail_[0-9]*.txt'))

In [230]:
# Parse each BS registration file (first row is the header) ...
bs_reg_dfs = [
    pd.read_csv(path, index_col=None, header=0)
    for path in bs_reg_files
]

# ... then stack them into one table with a fresh index, ordered by SID.
df_bs_reg = (
    pd.concat(bs_reg_dfs, axis=0, ignore_index=True)
    .sort_values(by=['SID'])
)

In [231]:
# Show the raw column names of the BS registration table.
df_bs_reg.columns.tolist()

['SID', 'Degree', 'Semester', 'Course', 'Marks', 'Discipline']

In [232]:
# Reorder the raw columns, rename them to the canonical course headers
# (position-by-position), and prefix every student id with "LHR_BS_".
new_order_reg = ['SID', 'Course', 'Marks', 'Semester', 'Discipline', 'Degree']
bs_reg_renames = dict(zip(new_order_reg, COURSE_COLUMN_NAMES))
df_bs_reg = df_bs_reg[new_order_reg].rename(columns=bs_reg_renames)
df_bs_reg["SID"] = df_bs_reg["SID"].astype(str).radd("LHR_BS_")

In [233]:
# Preview the first rows of the BS registration table.
df_bs_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
72002,LHR_BS_0,SS-202,94,Fall95,CS,BS
76804,LHR_BS_0,CS-119,59,Fall96,CS,BS
76805,LHR_BS_0,CS-120,67,Fall96,CS,BS
81600,LHR_BS_0,UR-404,97,Fall97,CS,BS
81601,LHR_BS_0,CS-125,66,Fall97,CS,BS


In [234]:
# Summary statistics for every column of the BS registration table.
df_bs_reg.describe(include='all')

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
count,182400,182400,182400.0,182400,182400,182400
unique,4400,89,,22,4,1
top,LHR_BS_679,CS-106,,Fall03,CS,BS
freq,48,4400,,9600,45600,182400
mean,,,74.466168,,,
std,,,14.408256,,,
min,,,50.0,,,
25%,,,62.0,,,
50%,,,74.0,,,
75%,,,87.0,,,


# Process MS Course Reg

In [268]:
# Collect the MS registration files once, as a sorted list. A bare rglob()
# generator is exhausted after a single pass (re-running the loading cell would
# then find no files) and yields paths in filesystem-dependent order.
ms_reg_files = sorted(data_dir.rglob('Lhr_Detail_MS_[0-9]*.txt'))

In [269]:
# Parse each MS registration file (first row is the header) ...
ms_reg_dfs = [
    pd.read_csv(path, index_col=None, header=0)
    for path in ms_reg_files
]

# ... then stack them into one table with a fresh index, ordered by SID.
df_ms_reg = (
    pd.concat(ms_reg_dfs, axis=0, ignore_index=True)
    .sort_values(by=['SID'])
)

In [270]:
# Reorder the raw columns, rename them to the canonical course headers
# (position-by-position), and prefix every student id with "LHR_MS_".
new_order_reg = ['SID', 'Course', 'Marks', 'Semester', 'Discipline', 'Degree']
ms_reg_renames = dict(zip(new_order_reg, COURSE_COLUMN_NAMES))
df_ms_reg = df_ms_reg[new_order_reg].rename(columns=ms_reg_renames)
df_ms_reg["SID"] = df_ms_reg["SID"].astype(str).radd("LHR_MS_")

In [271]:
# Preview the first rows of the MS registration table.
df_ms_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
0,LHR_MS_0,CS-801,72,Fall20,MS-CS,MS
1802,LHR_MS_0,CS-968,57,Spring20,MS-CS,MS
1801,LHR_MS_0,CS-708,63,Spring20,MS-CS,MS
1800,LHR_MS_0,CS-907,57,Spring20,MS-CS,MS
1202,LHR_MS_0,CS-606,62,Fall20,MS-CS,MS


In [272]:
# Summary statistics for every column of the MS registration table.
df_ms_reg.describe(include="all")

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
count,8400,8400,8400.0,8400,8400,8400
unique,800,31,,2,4,1
top,LHR_MS_87,CS-802,,Spring20,MS-NW,MS
freq,12,800,,4200,2100,8400
mean,,,74.509167,,,
std,,,14.415472,,,
min,,,50.0,,,
25%,,,62.0,,,
50%,,,74.0,,,
75%,,,87.0,,,


# Concat BS and MS Course Reg Tables

In [320]:
# One registration table for both degree levels, BS rows first, with a fresh
# index; the preview shows its first rows.
df_reg = pd.concat((df_bs_reg, df_ms_reg), axis=0, ignore_index=True)
df_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Discipline,Degree
0,LHR_BS_0,SS-202,94,Fall95,CS,BS
1,LHR_BS_0,CS-119,59,Fall96,CS,BS
2,LHR_BS_0,CS-120,67,Fall96,CS,BS
3,LHR_BS_0,UR-404,97,Fall97,CS,BS
4,LHR_BS_0,CS-125,66,Fall97,CS,BS


In [321]:
# Split labels such as "Fall95" into a term name ("Fall") and a year.
# - Raw strings avoid the invalid "\d" escape in a normal string literal.
# - expand=False makes str.extract return a Series rather than a one-column
#   DataFrame.
# - regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
#   string by default, which would leave the digits in place.
df_reg["Year"] = df_reg["Semester"].str.extract(r"(\d+)", expand=False)
df_reg["Semester"] = df_reg["Semester"].str.replace(r"\d+", "", regex=True)
# "%y" reads the two-digit year; the result is a timestamp on 1 January of it.
df_reg["Year"] = pd.to_datetime(df_reg["Year"], format="%y")

In [322]:
# Place the derived Year column right after Semester.
new_order_reg = ['SID', 'Course', 'Score', 'Semester', 'Year', 'Discipline', 'Degree']
df_reg = df_reg.loc[:, new_order_reg]

In [324]:
# Preview the last rows of the combined registration table.
df_reg.tail()

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
190795,LHR_MS_799,CS-802,95,Fall,2020-01-01,MS-NW,MS
190796,LHR_MS_799,CS-703,71,Fall,2020-01-01,MS-NW,MS
190797,LHR_MS_799,CS-801,76,Fall,2020-01-01,MS-NW,MS
190798,LHR_MS_799,CS-404,62,Spring,2020-01-01,MS-NW,MS
190799,LHR_MS_799,CS-307,95,Spring,2020-01-01,MS-NW,MS


In [325]:
# Per-row flag that is True where Semester is missing.
df_reg.Semester.isnull()

0         False
1         False
2         False
3         False
4         False
          ...  
190795    False
190796    False
190797    False
190798    False
190799    False
Name: Semester, Length: 190800, dtype: bool

In [326]:
# Summary statistics for every column of the combined registration table.
df_reg.describe(include='all')

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
count,190800,190800,190800.0,190800,190800,190800,190800
unique,5200,120,,2,13,8,2
top,LHR_BS_1932,CS-104,,Fall,2004-01-01 00:00:00,CS,BS
freq,48,4400,,95400,19200,45600,182400
first,,,,,1994-01-01 00:00:00,,
last,,,,,2020-01-01 00:00:00,,
mean,,,74.468061,,,,
std,,,14.408539,,,,
min,,,50.0,,,,
25%,,,62.0,,,,


In [327]:
# Write the combined registration table to the SQLite file University.db.
# The connection is opened in a `with` block so it is closed even when
# to_sql() raises, instead of leaking until the kernel exits.
engine = create_engine('sqlite:///University.db', echo=True)
reg_table = "LHR_Course_Reg"

with engine.connect() as sqlite_connection:
    df_reg.to_sql(
        reg_table,
        sqlite_connection,
        if_exists='replace',  # drop and recreate the table on every run
        index=False,          # do not store the DataFrame index as a column
        dtype=reg_col_types)

2020-11-15 12:27:10,983 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-11-15 12:27:10,984 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:27:10,986 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-11-15 12:27:10,987 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:27:11,001 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("LHR_Course_Reg")
2020-11-15 12:27:11,002 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:27:11,003 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("LHR_Course_Reg")
2020-11-15 12:27:11,004 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:27:11,006 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-11-15 12:27:11,007 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:27:11,009 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("LHR_Course_Reg")
2020-11-15 12:27:11,011 INFO