# Preprocessing with pandas

In [1]:
import pandas

# Load the iris measurements. The path is relative to the notebook's working
# directory: the data file is in the ../resources folder.
datas=pandas.read_csv("../resources/irisData.csv")

# Summary statistics. The output lists only the four measurement columns;
# the "class of iris" label column does not appear in it.
datas.describe()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [2]:
# Preview the first rows, including the "class of iris" label column.
datas.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class of iris
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Discretization

In [3]:
import numpy
disc = 6  # Disc(retization): number of quantile bins per column, may be between 2 and 9
# disc+1 evenly spaced quantile levels from 0.0 to 1.0, used as bin edges by qcut
r = numpy.arange(disc + 1) / float(disc)

# Each measurement column is cut into quantile bins with pandas.qcut;
# the "class of iris" column is copied as is.
l = [
    pandas.DataFrame(
        datas[col] if col == "class of iris" else pandas.qcut(datas[col], r),
        columns=[col],
    )
    for col in datas.columns.values
]

# Put the one-column frames back side by side, in the original column order.
treated = pandas.concat(l, join='outer', axis=1)

treated.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class of iris
0,"(5, 5.4]","(3.4, 4.4]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
1,"[4.3, 5]","(2.9, 3]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
2,"[4.3, 5]","(3, 3.2]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
3,"[4.3, 5]","(3, 3.2]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
4,"[4.3, 5]","(3.4, 4.4]","[1, 1.5]","[0.1, 0.2]",Iris-setosa


In [4]:
# Write the discretized table to a CSV file (index=False: no row-index column)
# so that pyAgrum can learn from it.
treated.to_csv("irisTreated.csv",index=False)

# NOTE(review): wc_l and head come from a private pyAgrum module (_utils);
# presumably they mimic the shell's `wc -l` and `head` — confirm they are
# available in the installed pyAgrum version.
from pyAgrum.lib._utils.oslike import wc_l,head
print("=================================\n  Size of the generated database\n=================================")
# Line count of the generated file; the output shows 151.
wc_l("irisTreated.csv")
print("\n=================================\n  First lines\n=================================")
# First lines of the file: the header row followed by the quoted interval labels.
head("irisTreated.csv")

  Size of the generated database
151

  First lines
sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class of iris
"(5, 5.4]","(3.4, 4.4]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
"[4.3, 5]","(2.9, 3]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
"[4.3, 5]","(3, 3.2]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
"[4.3, 5]","(3, 3.2]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
"[4.3, 5]","(3.4, 4.4]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
"(5, 5.4]","(3.4, 4.4]","(1.5, 2.633]","(0.2, 0.867]",Iris-setosa
"[4.3, 5]","(3.2, 3.4]","[1, 1.5]","(0.2, 0.867]",Iris-setosa
"[4.3, 5]","(3.2, 3.4]","[1, 1.5]","[0.1, 0.2]",Iris-setosa
"[4.3, 5]","(2.7, 2.9]","[1, 1.5]","[0.1, 0.2]",Iris-setosa



# Learning with pyAgrum

In [5]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): the star import pulls every pylab name into the notebook
# namespace; no cell shown here appears to rely on one. Consider removing it
# once that is confirmed for the rest of the notebook.
from pylab import *
import matplotlib.pyplot as plt

import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
# Print the pyAgrum version and licence banner.
gum.about()


pyAgrum version 0.10.3.9
(c) Pierre-Henri Wuillemin, Christophe Gonzales, Lionel Torti
    UPMC 2015

    This is free software; see the source code for copying conditions.
    There is ABSOLUTELY NO WARRANTY; not even for MERCHANTABILITY or
    FITNESS FOR A PARTICULAR PURPOSE.  For details, see 'pyAgrum.warranty'.
    


In [6]:
# Learn a Bayesian network from the discretized CSV, selecting
# local search with a tabu list on the learner.
learner = gum.BNLearner("irisTreated.csv")
learner.useLocalSearchWithTabuList()
bn_iris = learner.learnBN()

# The value returned by currentTime() is scaled by 1000 and reported as milliseconds.
elapsed_ms = 1000 * learner.currentTime()
print("Learned in {0}ms".format(elapsed_ms))

gnb.showBN(bn_iris)

Learned in 17.141ms


In [7]:
# Learn a second network from the same CSV, this time selecting
# greedy hill climbing on a fresh learner (the name `learner` is rebound).
learner = gum.BNLearner("irisTreated.csv")
learner.useGreedyHillClimbing()
bn_iris2 = learner.learnBN()

# The value returned by currentTime() is scaled by 1000 and reported as milliseconds.
elapsed_ms = 1000 * learner.currentTime()
print("Learned in {0}ms".format(elapsed_ms))

gnb.showBN(bn_iris2)

Learned in 2.17ms


In [8]:
# Variable names known to the learner; the output matches the CSV header.
learner.names()

('sepal length in cm',
 'sepal width in cm',
 'petal length in cm',
 'petal width in cm',
 'class of iris')

In [9]:
# Show one learned variable: its labels are the interval strings produced by the discretization.
print(bn_iris.variableFromName('sepal length in cm'))

sepal length in cm<"(5, 5.4]","[4.3, 5]","(5.4, 5.8]","(6.7, 7.9]","(6.3, 6.7]","(5.8, 6.3]">
