In [1]:
import pandas as pd
import numpy as np
import re
import sqlite3
import glob

from pathlib import Path
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Text, String, DateTime, Boolean

In [2]:
data_dir = Path("Assignment 1 Data Set/Peshawar/")

# Student DB

In [3]:
STUDENT_COLUMN_NAMES = ['SID', 'Name', 'Father', 'DoB', 'Gender', 'Reg Date', 'Reg Status', 'Degree Status', 'Address', 'Qualification']

In [4]:
std_col_types = {
    "SID": Text,
    "Name": Text,
    "Father": Text,
    "DoB":  DateTime,
    "Gender": Boolean,
    "Reg Date": DateTime,
    "Reg Status": String(1),
    "Degree Status": String(1),
    "Address": Text,
    "Qualification": Text,
    "Degree": String(2),
    }

## BS Level

In [9]:
bs_student_files = data_dir.rglob('BS_P_*[0-9]_Student.txt')
# list(bs_student_files)

In [10]:
bs_std_dfs = []
for std_file in bs_student_files:
    df = pd.read_csv(std_file, index_col=None, header=0)
    bs_std_dfs.append(df) 

In [18]:
df_bs_std = pd.concat(bs_std_dfs, axis=0, ignore_index=True)
df_bs_std = df_bs_std.sort_values(by=['Reg#'])
df_bs_std["Gender"] = None

In [19]:
list(df_bs_std.columns.values)

['Reg#',
 'Name',
 'Father',
 'Address',
 'Date of Birth',
 'lastDeg',
 'Reg Date',
 'Reg Status',
 'Deg Status',
 'Gender']

In [22]:
new_order = [
    'Reg#',
    'Name',
    'Father',
    'Date of Birth',
    'Gender',
    'Reg Date',
    'Reg Status',
    'Deg Status',
    'Address',
    'lastDeg',
    ]

In [23]:
df_bs_std = df_bs_std[new_order]
df_bs_std.columns = STUDENT_COLUMN_NAMES

In [24]:
df_bs_std.SID = "PEW_BS_" + df_bs_std.SID.astype(str)

In [25]:
df_bs_std["Degree"] = "BS"

In [26]:
df_bs_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
1500,PEW_BS_0,Midhat Azhar Durrani,Azhar Durrani,14-4-74,,26-8-94,A,C,H# 447 St. # 5 Air port coloney PSH,A-Level,BS
1501,PEW_BS_1,Mouhammed Abdul Rafay Kazmi,Iffan Kazmi,13-12-74,,13-8-94,A,C,H# 169 s# 0 University coloney PSH,A-Level,BS
1502,PEW_BS_2,Ajab Khan,Sahir Fazal Khan,13-9-76,,17-8-94,A,C,ho. # 484 street # 95 Buliyan wali Sawat,Fsc,BS
1503,PEW_BS_3,Moeen Minhas,Chodhary Zakir Minhas,15-9-76,,20-8-94,A,C,House No. 31 street # 91 Industrial estate Pe...,F.Sc.,BS
1504,PEW_BS_4,Samina Niazi,Sahibzada Javed Niazi,16-6-76,,23-8-94,A,C,Ho. # 345 S# 86 Afindi road psh,Fsc,BS


In [27]:
df_bs_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,3300,3300,3300,3300,0.0,3300,3300,3300,3300,3300,3300
unique,3300,3252,3246,2230,0.0,319,1,2,3299,6,1
top,PEW_BS_665,Munir Niazi,Abdul Hameed Khan,29-4-82,,17-8-00,A,C,H# 447 St. # 5 Air port coloney PSH,A-Level,BS
freq,1,3,2,4,,12,3300,2400,2,552,3300


## MS Level

In [29]:
ms_student_files = data_dir.rglob('MS_P_*[0-9]_Student.txt')
# list(ms_student_files)

In [30]:
ms_std_dfs = []
for std_file in ms_student_files:
    df = pd.read_csv(std_file, index_col=None, header=0)
    ms_std_dfs.append(df) 

In [31]:
df_ms_std = pd.concat(ms_std_dfs, axis=0, ignore_index=True)
df_ms_std = df_ms_std.sort_values(by=['Reg#'])
df_ms_std["Gender"] = None

In [32]:
df_ms_std = df_ms_std[new_order]
df_ms_std.columns = STUDENT_COLUMN_NAMES

In [33]:
df_ms_std.SID = "PEW_MS_" + df_ms_std.SID.astype(str)

In [34]:
df_ms_std["Degree"] = "MS"

In [35]:
df_ms_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
0,PEW_MS_0,Badar Rana Qutab-ud-Din Sehti,Rana Qutab-ud-Din Sehti,28-9-79,,28-8-01,A,C,Ho. # 998 street # 8 Afindi road psh,MSc,MS
1,PEW_MS_1,AAmer Hussain,Hussain Rasheed,13-12-77,,13-8-01,A,C,H# 169 s# 0 Mutian wala Mardan,M.Phil,MS
2,PEW_MS_2,Momd. Imtiaz Paracha,Karam Paracha,13-9-79,,17-8-01,A,C,ho. # 484 street # 95 Lane 3 Khybar road psh,MSc,MS
3,PEW_MS_3,Ghazi Haque,Zameer Haque,15-9-79,,20-8-01,A,C,House No. 31 street # 91 Industrial estate Pe...,BS,MS
4,PEW_MS_4,Nosheen Butt,Viqar-ul-Islam Athar Butt,16-6-79,,23-8-01,A,C,Ho. # 345 S# 86 Cement factory coloney Gilgit,MSc,MS


In [36]:
df_ms_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,400,400,400,400,0.0,400,400,400,400,400,400
unique,400,400,400,341,0.0,116,1,2,400,6,1
top,PEW_MS_251,Zubair Qureshi,Abdul kareem Haroon Satti,2-4-80,,17-8-01,A,C,ho. # 996 S# 44 Lane 6 Mall road psh,MSc,MS
freq,1,1,1,3,,6,400,300,1,69,400


## Concat BS and MS

In [37]:
df_std = pd.concat([df_bs_std, df_ms_std], ignore_index=True)

In [38]:
df_std.Gender = df_std.Gender.map({0: False, 1: True})

In [39]:
df_std["Reg Date"] = pd.to_datetime(df_std["Reg Date"])

In [40]:
df_std["DoB"] = pd.to_datetime(df_std["DoB"])

In [41]:
df_std.head()

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
0,PEW_BS_0,Midhat Azhar Durrani,Azhar Durrani,1974-04-14,,1994-08-26,A,C,H# 447 St. # 5 Air port coloney PSH,A-Level,BS
1,PEW_BS_1,Mouhammed Abdul Rafay Kazmi,Iffan Kazmi,1974-12-13,,1994-08-13,A,C,H# 169 s# 0 University coloney PSH,A-Level,BS
2,PEW_BS_2,Ajab Khan,Sahir Fazal Khan,1976-09-13,,1994-08-17,A,C,ho. # 484 street # 95 Buliyan wali Sawat,Fsc,BS
3,PEW_BS_3,Moeen Minhas,Chodhary Zakir Minhas,1976-09-15,,1994-08-20,A,C,House No. 31 street # 91 Industrial estate Pe...,F.Sc.,BS
4,PEW_BS_4,Samina Niazi,Sahibzada Javed Niazi,1976-06-16,,1994-08-23,A,C,Ho. # 345 S# 86 Afindi road psh,Fsc,BS


In [42]:
df_std.describe(include="all")

Unnamed: 0,SID,Name,Father,DoB,Gender,Reg Date,Reg Status,Degree Status,Address,Qualification,Degree
count,3700,3700,3700,3700,0.0,3700,3700,3700,3700,3700,3700
unique,3700,3518,3372,2406,0.0,319,1,2,3551,12,2
top,PEW_BS_665,Rafi Khan,Islam Kharasani,1980-08-14 00:00:00,,2001-01-08 00:00:00,A,C,House No.158 s no.27 Khumar road Sawat,A-Level,BS
freq,1,3,3,5,,17,3700,2700,2,552,3300
first,,,,1974-01-18 00:00:00,,1994-01-08 00:00:00,,,,,
last,,,,1986-12-29 00:00:00,,2004-12-08 00:00:00,,,,,


# Store Student data in DB

In [43]:
engine = create_engine('sqlite:///University.db', echo=True)
sqlite_connection = engine.connect()

2020-11-15 11:51:06,919 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-11-15 11:51:06,919 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:51:06,921 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-11-15 11:51:06,922 INFO sqlalchemy.engine.base.Engine ()


In [44]:
std_table = "PEW_Student"
df_std.to_sql(
    std_table, 
    sqlite_connection, 
    if_exists='replace', 
    index=False, 
    dtype=std_col_types)

2020-11-15 11:51:13,983 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("PEW_Student")
2020-11-15 11:51:13,984 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:51:13,988 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("PEW_Student")
2020-11-15 11:51:13,990 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:51:13,994 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE "PEW_Student" (
	"SID" TEXT, 
	"Name" TEXT, 
	"Father" TEXT, 
	"DoB" DATETIME, 
	"Gender" BOOLEAN, 
	"Reg Date" DATETIME, 
	"Reg Status" VARCHAR(1), 
	"Degree Status" VARCHAR(1), 
	"Address" TEXT, 
	"Qualification" TEXT, 
	"Degree" VARCHAR(2), 
	CHECK ("Gender" IN (0, 1))
)


2020-11-15 11:51:13,996 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 11:51:14,002 INFO sqlalchemy.engine.base.Engine COMMIT
2020-11-15 11:51:14,009 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-11-15 11:51:14,059 INFO sqlalchemy.engine.base.Engine INSERT INTO "PEW_Student" ("SID", "Name", "Father", "DoB", "Gender", 

In [45]:
sqlite_connection.close()

# Process BS Course Reg

In [64]:
COURSE_COLUMN_NAMES = ['SID', 'Course', 'Score', 'Semester', 'Year', 'Discipline', 'Degree']

In [84]:
reg_col_types = {
    "SID": Text,
    "Courses": String(8),
    "Score": Integer,
    "Semester":  String(10),
    "Year": DateTime,
    "Discipline": String(3),
    "Degree": String(2)
    }

In [66]:
bs_reg_files = data_dir.rglob('BS_P_*[0-9]_Reg.txt')
# sorted(bs_reg_files)

In [67]:
bs_reg_dfs = []
for std_file in bs_reg_files:
    df = pd.read_csv(std_file, index_col=None, header=0)
    bs_reg_dfs.append(df) 

df_bs_reg = pd.concat(bs_reg_dfs, axis=0, ignore_index=True)
df_bs_reg = df_bs_reg.sort_values(by=['Reg#'])
df_bs_reg["Degree"] = "BS"

In [68]:
list(df_bs_reg.columns.values)

['Reg#', 'Courses', 'Score', 'Program', 'Sem', 'Year', 'Degree']

In [69]:
new_order_reg = ['Reg#', 'Courses', 'Score', 'Sem', 'Year', 'Program', 'Degree']
df_bs_reg = df_bs_reg[new_order_reg]
df_bs_reg.columns = COURSE_COLUMN_NAMES
df_bs_reg.SID = "PEW_BS_" + df_bs_reg.SID.astype(str)

In [70]:
df_bs_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
54002,PEW_BS_0,SS-202,96,Fall,1995,CS,BS
57604,PEW_BS_0,CS-119,90,Fall,1996,CS,BS
57605,PEW_BS_0,CS-120,53,Fall,1996,CS,BS
61200,PEW_BS_0,UR-404,89,Fall,1997,CS,BS
61201,PEW_BS_0,CS-125,67,Fall,1997,CS,BS


In [71]:
df_bs_reg.describe(include='all')

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
count,136800,136800,136800.0,136800,136800.0,136800,136800
unique,3300,89,,2,,4,1
top,PEW_BS_2018,CS-107,,Fall,,CS,BS
freq,48,3300,,68400,,34200,136800
mean,,,74.462368,,1894.921053,,
std,,,14.415317,,446.648106,,
min,,,50.0,,0.0,,
25%,,,62.0,,1997.0,,
50%,,,74.0,,2000.0,,
75%,,,87.0,,2003.0,,


# Process MS Course Reg

In [87]:
ms_reg_files = data_dir.rglob('MS_P_*[0-9]_Reg.txt')
# sorted(ms_reg_files)

In [88]:
ms_reg_dfs = []
for std_file in ms_reg_files:
    df = pd.read_csv(std_file, index_col=None, header=0)
    ms_reg_dfs.append(df) 

df_ms_reg = pd.concat(ms_reg_dfs, axis=0, ignore_index=True)
df_ms_reg = df_ms_reg.sort_values(by=['Reg#'])
df_ms_reg["Degree"] = "MS"

In [89]:
df_ms_reg = df_ms_reg[new_order_reg]
df_ms_reg.columns = COURSE_COLUMN_NAMES
df_ms_reg.SID = "PEW_MS_" + df_ms_reg.SID.astype(str)

In [90]:
df_ms_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
0,PEW_MS_0,CS-801,95,Fall,2001,MS-CS,MS
902,PEW_MS_0,CS-968,67,Spring,2003,MS-CS,MS
602,PEW_MS_0,CS-606,61,Fall,2002,MS-CS,MS
601,PEW_MS_0,CS-505,72,Fall,2002,MS-CS,MS
600,PEW_MS_0,CS-401,71,Fall,2002,MS-CS,MS


In [80]:
df_ms_reg.describe(include="all")

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
count,4200,4200,4200.0,4200,4200.0,4200,4200
unique,400,20,,2,,2,1
top,PEW_MS_251,CS-801,,Spring,,MS-CS,MS
freq,12,400,,2100,,2100,4200
mean,,,74.281667,,2003.214286,,
std,,,14.363015,,1.145223,,
min,,,50.0,,2001.0,,
25%,,,62.0,,2002.0,,
50%,,,74.0,,2003.0,,
75%,,,86.25,,2004.0,,


# Concat BS and MS Course Reg Tables

In [92]:
df_reg = pd.concat([df_bs_reg, df_ms_reg], ignore_index=True)
df_reg["Year"] = pd.to_datetime(df_reg["Year"])
df_reg.head()

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
0,PEW_BS_0,SS-202,96,Fall,1970-01-01 00:00:00.000001995,CS,BS
1,PEW_BS_0,CS-119,90,Fall,1970-01-01 00:00:00.000001996,CS,BS
2,PEW_BS_0,CS-120,53,Fall,1970-01-01 00:00:00.000001996,CS,BS
3,PEW_BS_0,UR-404,89,Fall,1970-01-01 00:00:00.000001997,CS,BS
4,PEW_BS_0,CS-125,67,Fall,1970-01-01 00:00:00.000001997,CS,BS


In [93]:
df_reg.describe(include='all')

Unnamed: 0,SID,Course,Score,Semester,Year,Discipline,Degree
count,141000,141000,141000.0,141000,141000,141000,141000
unique,3700,109,,2,13,6,2
top,PEW_BS_2018,CS-109,,Fall,1970-01-01 00:00:00.000002004,CS,BS
freq,48,3300,,70500,15600,34200,136800
first,,,,,1970-01-01 00:00:00,,
last,,,,,1970-01-01 00:00:00.000002005,,
mean,,,74.456986,,,,
std,,,14.413744,,,,
min,,,50.0,,,,
25%,,,62.0,,,,


In [94]:
engine = create_engine('sqlite:///University.db', echo=True)
sqlite_connection = engine.connect()

reg_table = "PEW_Course_Reg"
df_reg.to_sql(
    reg_table, 
    sqlite_connection, 
    if_exists='replace', 
    index=False, 
    dtype=reg_col_types)

sqlite_connection.close()

2020-11-15 12:01:55,136 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-11-15 12:01:55,138 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:01:55,139 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-11-15 12:01:55,141 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:01:55,151 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("PEW_Course_Reg")
2020-11-15 12:01:55,152 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:01:55,153 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("PEW_Course_Reg")
2020-11-15 12:01:55,154 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:01:55,156 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-11-15 12:01:55,157 INFO sqlalchemy.engine.base.Engine ()
2020-11-15 12:01:55,159 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("PEW_Course_Reg")
2020-11-15 12:01:55,160 INFO