#### Imports

In [49]:
import pandas as pd
import pymongo
import numpy as np
import os
from dotenv import load_dotenv

#### File Reading & Preparation

In [59]:
# read in CSV file
df = pd.read_csv('data/student-mat.csv', delimiter=";")
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [3]:
# look at field names
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [60]:
'''
    Most column names are good.  Making a dictionary to just rename the ones needing it.
'''
col_names = {'Medu':'m_edu', 'Fedu':'f_edu','Pstatus':'p_status','Mjob':'m_job','Fjob':'f_job','Dalc':'d_alc','Walc':'w_alc','G1':'g1','G2':'g2','G3':'g3'}
df = df.rename(columns=col_names)

In [61]:
''' 
    Due to the large number of columns, I made a combined brief listing of the data
    with the field type to prevent having to refer back & forth.
    Ultimately, the field types match the data and no nulls so it's ready to be 
    loaded to a database

    This is handy and I'll store this for future use
'''
data_example = df.head(3).transpose()
frame_info = pd.concat([df.dtypes, df.count()], axis=1)
frame_info.columns = ['Data Type', 'Non-Null Count']
combined = pd.concat([frame_info, data_example], axis=1)
print(combined)

           Data Type  Non-Null Count        0        1        2
school        object             395       GP       GP       GP
sex           object             395        F        F        F
age            int64             395       18       17       15
address       object             395        U        U        U
famsize       object             395      GT3      GT3      LE3
p_status      object             395        A        T        T
m_edu          int64             395        4        1        1
f_edu          int64             395        4        1        1
m_job         object             395  at_home  at_home  at_home
f_job         object             395  teacher    other    other
reason        object             395   course   course    other
guardian      object             395   mother   father   mother
traveltime     int64             395        2        1        1
studytime      int64             395        2        2        2
failures       int64             395    

In [63]:
''''
    Generate and add random GUID strings to the DataFrame
    The reason being that MongoDB requires a unique string index
    such as a GUID instead of integers normally used in many 
    relational databases.  My previous company used GUIDs though in 
    their hosted Oracle databases so this is familiar.
'''
num_rows = len(df)
random_strings = [np.random.bytes(16).hex() for _ in range(num_rows)]
df['id'] = random_strings

In [64]:
# Set the dataframe index to the id column:
df.set_index('id', inplace=True)

In [65]:
df.head()

Unnamed: 0_level_0,school,sex,age,address,famsize,p_status,m_edu,f_edu,m_job,f_job,...,famrel,freetime,goout,d_alc,w_alc,health,absences,g1,g2,g3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
571c02d38507dff5718fff8e6f102eaa,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
3c77f710f9bad43c62a0af84b9ee78ca,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
bd4b81176b3480024b525da94c60f4e7,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
90940ec6b499d7a08db4220800396658,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
863edde0bc0b8b20b40f54580cd5fcc1,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


#### MongoDB Database Connection and Load

In [50]:
# bring in database string with username & password from external file
load_dotenv()
__mongo_url = os.getenv("MONGO_URL")

In [51]:
# connect to database
client = pymongo.MongoClient(__mongo_url)

In [45]:
# Create databases
db = client.db

In [33]:
# run this to clean out the database prior to upload
col_name = 'cards'
db.drop_collection(col_name)

{'nIndexesWas': 1,
 'ns': 'db.cards',
 'ok': 1.0,
 '$clusterTime': {'clusterTime': Timestamp(1697676830, 20),
  'signature': {'hash': b'Q\x8f}m#\x12_\x80@\x1fELl\x7fV\x81\x10\xb8\xe7B',
   'keyId': 7246437339700920321}},
 'operationTime': Timestamp(1697676830, 19)}

In [46]:
# Create a collection on the database
student_math = db.student_math

In [47]:
# Try to insert many which works because it's not a lot of rows
student_math.insert_many([df.to_dict()])

<pymongo.results.InsertManyResult at 0x7f081469e170>