ENEM Study - How is education performing in Brazil?
===========================================

Overview
-------------
This is a report that compares ENEM (Exame Nacional do Ensino Médio - High School National Exam) with other metrics in order to analyse:

* statistical accuracy
* improvement over years
* comparison among different regions of the country

The project consists of different steps of data manipulation and is a good Data Science exercise.

Data Source
-----------------

We will be gathering our data from http://inep.gov.br/microdados, where it is available for download in the form of a zipped CSV file. As you might have already noticed, each file is around 3 GB in size, so we will have to use a database to perform our queries and extract the data of interest. To keep it simple, we will be using PostgreSQL.

In [1]:
import db
import sqlalchemy

# Open the database session through the project's `db` helper module; the
# import cell below uses this session for all of its queries and inserts.
# NOTE(review): create_schema=True presumably creates the tables when they
# do not exist yet -- confirm against db.get_db_session.
session = db.get_db_session(create_schema=True)

In [2]:
import os
import os.path
import csv

csv_path = 'Data/microdados_enem2017/DADOS/MICRODADOS_ENEM_2017.csv'

# Score columns that must all be filled in; a candidate with any of them
# empty did not finish the exam and is discarded.
SCORE_COLUMNS = (
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_MT',
    'NU_NOTA_REDACAO',
)

# (CSV column suffix, Exame attribute that receives the municipality id).
LOCAL_FIELDS = (
    ('PROVA', 'exame_local_id'),
    ('NASCIMENTO', 'local_nasc_id'),
    ('RESIDENCIA', 'residencia_id'),
)

# Number of inserted exams between two commits / progress messages.
BATCH_SIZE = 20000


def _to_int(value):
    """Return ``int(value)``, or None when the CSV field was empty."""
    return int(value) if value else None


def _ensure_local(session, known_ids, local_id, municipio, estado):
    """Insert the db.Local row with id *local_id* unless it already exists.

    *known_ids* is a set of ids already confirmed to be in the database; it
    is updated in place so each municipality is looked up at most once per
    run instead of once per CSV row.
    """
    if local_id in known_ids:
        return
    # First sighting in this run: the row may still exist from an earlier
    # import, so ask the database before inserting.
    if not session.query(db.Local).filter_by(id=local_id).all():
        local = db.Local()
        local.id = local_id
        local.municipio = municipio
        local.estado = estado
        session.add(local)
        # Committed right away (as before) so the Local row is stored before
        # any exam that references it.
        session.commit()
    known_ids.add(local_id)


known_local_ids = set()
counter = 0

# NOTE(review): the file is still opened with the platform default encoding;
# confirm the encoding of the INEP file and pass encoding= explicitly if needed.
with open(csv_path, 'r', newline='') as f:
    # The file is ';'-separated. Letting the csv module split on ';' avoids
    # truncating a row whenever a field happens to contain a comma.
    reader = csv.reader(f, delimiter=';')
    header = next(reader)

    for row in reader:
        if not row:
            # Blank line: nothing to import.
            continue

        # Map column name -> value, turning empty fields into None.
        items = {key: (value or None) for key, value in zip(header, row)}

        # Keep only candidates who finished the exam (all five scores present).
        if not all(items[column] for column in SCORE_COLUMNS):
            continue

        exame = db.Exame()
        exame.candidato_id = int(items['NU_INSCRICAO'])
        exame.ano = _to_int(items['NU_ANO'])
        exame.idade = _to_int(items['NU_IDADE'])
        exame.racial_id = _to_int(items['TP_COR_RACA'])
        exame.sexo = items['TP_SEXO']
        exame.casado_id = _to_int(items['TP_ESTADO_CIVIL'])
        exame.nacional_id = _to_int(items['TP_NACIONALIDADE'])
        exame.nota_cn = float(items['NU_NOTA_CN'])
        exame.nota_ch = float(items['NU_NOTA_CH'])
        exame.nota_lc = float(items['NU_NOTA_LC'])
        exame.nota_mt = float(items['NU_NOTA_MT'])
        exame.nota_red = float(items['NU_NOTA_REDACAO'])

        # Exam, birth and residence municipalities are handled identically.
        # The id is only assigned when the code is present, so an empty
        # residence code no longer crashes on int(None).
        for suffix, attribute in LOCAL_FIELDS:
            code = items['CO_MUNICIPIO_' + suffix]
            if not code:
                continue
            local_id = int(code)
            _ensure_local(
                session,
                known_local_ids,
                local_id,
                items['NO_MUNICIPIO_' + suffix],
                items['SG_UF_' + suffix],
            )
            setattr(exame, attribute, local_id)

        session.add(exame)
        counter += 1

        if counter % BATCH_SIZE == 0:
            session.commit()
            print('Inserted: ' + str(counter))

# Flush the last partial batch and release the session once, after the whole
# file has been processed (not once per row).
session.commit()
session.close()


Inserted: 20000
Inserted: 40000
Inserted: 60000
Inserted: 80000
Inserted: 100000
Inserted: 120000
Inserted: 140000
Inserted: 160000
Inserted: 180000
Inserted: 200000
Inserted: 220000
Inserted: 240000
Inserted: 260000
Inserted: 280000
Inserted: 300000
Inserted: 320000
Inserted: 340000
Inserted: 360000
Inserted: 380000
Inserted: 400000
Inserted: 420000
Inserted: 440000
Inserted: 460000
Inserted: 480000
Inserted: 500000
Inserted: 520000
Inserted: 540000
Inserted: 560000
Inserted: 580000
Inserted: 600000
Inserted: 620000
Inserted: 640000
Inserted: 660000
Inserted: 680000
Inserted: 700000
Inserted: 720000
Inserted: 740000
Inserted: 760000
Inserted: 780000
Inserted: 800000
Inserted: 820000
Inserted: 840000
Inserted: 860000
Inserted: 880000
Inserted: 900000
Inserted: 920000
Inserted: 940000
Inserted: 960000
Inserted: 980000
Inserted: 1000000
Inserted: 1020000
Inserted: 1040000
Inserted: 1060000
Inserted: 1080000
Inserted: 1100000
Inserted: 1120000
Inserted: 1140000
Inserted: 1160000
Inserted: