In [70]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualize
import matplotlib.pyplot as plt
# Show every column when a DataFrame is rendered (None disables the column limit).
pd.options.display.max_columns = None

In [75]:
# Load the math and Portuguese course datasets; row 0 of each CSV holds the column names.
st_math, st_port = (
    pd.read_csv(csv_name, header=0)
    for csv_name in ('student-mat.csv', 'student-por.csv')
)

In [76]:
# Label every row with the course it came from, so rows stay distinguishable
# once the two frames are stacked together.
st_math['type'], st_port['type'] = 'math', 'port'

In [77]:
# Preview the first five rows of the math dataset.
st_math.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,type
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6,math
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6,math
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10,math
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15,math
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10,math


In [78]:
# Preview the first five rows of the Portuguese dataset.
st_port.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,type
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11,port
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11,port
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12,port
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14,port
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13,port


# Tamanho dos datasets

In [29]:
# Row count of each course dataset.
# The Portuguese frame is named `st_port`; the previous `st_por` was never defined
# and raised NameError on a fresh kernel.
print("Length math dataset: {} rows".format(len(st_math)))
print("Length port dataset: {} rows".format(len(st_port)))

Length math dataset: 395 rows
Length port dataset: 649 rows


# Alunos que se repetem

No total, temos 382 alunos que aparecem em ambos os datasets.

In [79]:
# Stack both course frames into one; `frames` is kept because later cells iterate over it.
# Each frame keeps its own row labels (no ignore_index), so labels repeat in `result`.
frames = [st_math, st_port]
result = pd.concat(objs=frames, sort=False)
print("Length result dataset: {} rows".format(result.shape[0]))

Length result dataset: 1044 rows


A coluna `type` distingue se a instância pertence ao dataset de matemática ou ao de português.

In [80]:
# Number of rows contributed by each course (non-null `school` values per `type`).
print(result.groupby(['type'])[['school']].count())

      school
type        
math     395
port     649


In [81]:
# Rows whose identifying attributes repeat an earlier row of the stacked frame;
# duplicated() leaves the first occurrence of each combination unmarked.
# NOTE(review): this also flags repeats inside a single course dataset, so the count
# equals "students present in both" only if no combination repeats within one dataset
# — confirm.
duplicateRowsDF = result[
    result.duplicated(
        subset=[
            'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
            'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'nursery', 'internet',
        ]
    )
]
print("Duplicate Rows based on columns are: {}".format(duplicateRowsDF.shape[0]))

Duplicate Rows based on columns are: 382


# Tarefa 1. Instâncias a serem usadas
A partir disso, temos algumas alternativas:
* utilizar somente as 382 instâncias que aparecem em ambos os datasets
* utilizar somente o dataset de português, com 649 instâncias
* utilizar ambos os datasets, preenchendo os missing values com a média da amostra para não alterar a distribuição dos dados

# Tarefa 2. Discretização de variáveis categóricas

Sugestão de leitura: https://www.datacamp.com/community/tutorials/categorical-data

### G3 (target)

* Podemos utilizar a média da turma para definir uma característica `is_approved`

# Conhecendo as características do dataset

In [82]:
# Row count of each course dataset.
# The Portuguese frame is named `st_port`; the previous `st_por` was never defined
# and raised NameError on a fresh kernel.
print("Length math dataset: {} rows".format(len(st_math)))
print("Length port dataset: {} rows".format(len(st_port)))

Length math dataset: 395 rows
Length port dataset: 649 rows


## 1. Grade G3

In [89]:
# Flag each student as above/under the mean final grade (G3) of their own course.
# The columns are added in place to the frames referenced by `frames`
# (st_math / st_port); `result` was concatenated earlier and does not receive them.
for dataset in frames:
    # Vectorized mean, matching the .mean() calls used for the G3 summary statistics.
    ave = dataset['G3'].mean()
    dataset['ave_line'] = ave  # constant column holding the course mean
    # Strictly greater than the mean is "above"; a grade equal to the mean is "under".
    dataset['average'] = np.where(dataset['G3'] > ave, 'above average', 'under average')

In [117]:
# Mean and sample standard deviation of the final grade (G3) for each course.
media_g3_mat, desvio_g3_mat = st_math['G3'].mean(), st_math['G3'].std()
media_g3_por, desvio_g3_por = st_port['G3'].mean(), st_port['G3'].std()

# Math course
print("MAT")
print("    Média:", media_g3_mat)
print("    Desvio padrão:", desvio_g3_mat)

# Portuguese course
print("\nPOR")
print("    Média:", media_g3_por)
print("    Desvio padrão:", desvio_g3_por)

MAT
    Média: 10.415189873417722
    Desvio padrão: 4.5814426109978434

POR
    Média: 11.906009244992296
    Desvio padrão: 3.230656242804805


### Proporção de alunos acima e abaixo da média

In [125]:
# For each course: print its `type` label(s), then how many students fall
# above / under the course mean (counted via the non-null `school` column).
for dataset in frames:
    print(str(dataset['type'].unique()))
    print(dataset.groupby(['average'], as_index=False)[['school']].count())
    print()

['math']
         average  school
0  above average     209
1  under average     186

['port']
         average  school
0  above average     348
1  under average     301



## 2. Age

In [127]:
# Age summary for every course frame (math and Portuguese): mean, standard deviation,
# and the number of students at each age (counted via the non-null `school` column).
for dataset in frames:
    print(str(dataset['type'].unique()))
    media, desvio = dataset['age'].mean(), dataset['age'].std()
    print("Média da idade:", media)
    print("Desvio padrão da idade:", desvio)
    print()
    print(dataset.groupby(['age'], as_index=False)[['school']].count())
    print("\n")

['math']
Média da idade: 16.696202531645568
Desvio padrão da idade: 1.2760427246056283

   age  school
0   15      82
1   16     104
2   17      98
3   18      82
4   19      24
5   20       3
6   21       1
7   22       1


['port']
Média da idade: 16.7442218798151
Desvio padrão da idade: 1.2181376394800656

   age  school
0   15     112
1   16     177
2   17     179
3   18     140
4   19      32
5   20       6
6   21       2
7   22       1




## 3. Traveltime

In [132]:
# Travel-time summary for every course frame (math and Portuguese):
# mean and standard deviation of the `traveltime` column.
for dataset in frames:
    print(str(dataset['type'].unique()))
    media, desvio = dataset['traveltime'].mean(), dataset['traveltime'].std()
    print("Média:", media)
    print("Desvio padrão:", desvio)
    print("\n")

['math']
Média: 1.4481012658227848
Desvio padrão: 0.6975047549086825


['port']
Média: 1.568567026194145
Desvio padrão: 0.7486600863534161


