In [1]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch, K2Score, BicScore
from pgmpy.sampling.Sampling import BayesianModelSampling
from pgmpy.factors.discrete.CPD import TabularCPD
from pgmpy.estimators import BayesianEstimator, ParameterEstimator, MaximumLikelihoodEstimator

def JSD(P, Q):
    _P = P / np.linalg.norm(P, ord=1)
    _Q = Q / np.linalg.norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [2]:
data = pd.DataFrame.from_csv('crash_sample_2018.csv', index_col=None)
data.drop(data.columns[0], axis=1, inplace=True)
nodes = list(data.columns)
cat_to_num = {'AvgSpeed' : {'low' :0, 'high':1}, \
              'Country' : {'US':0, 'UK':1, 'Europe':2}, \
              'DangerLvl' : {'low':0, 'high':1}, \
              'NoAccidents' : {'low':0, 'medium':1, 'high':2}, \
              'NoFatalities' : {'low':0, 'medium':1, 'high':2}, \
              'NoJourneys' : {'low':0, 'medium':1, 'high':2}, \
              'PoliceActivity' : {'regular':0, 'increased':1}, \
              'RoadCond' : {'bad':0,'good':1}, \
              'Season' : {'winter':0, 'spring':1, 'summer':2, 'fall':3}, \
              'Weather' : {'bad':0, 'good':1}, \
              'Weekend' : {'working':0, 'weekend':1, 'holiday':2},
             }

num_to_cat = {}
for k1,v1 in cat_to_num.items():
    num_to_cat[k1] = { v2 : k2 for k2, v2 in v1.items()}

  """Entry point for launching an IPython kernel.


In [3]:
# splitting data

# select rows with missing data only
test_data = data[data.isnull().any(axis=1)]

train_data = data.dropna(axis=0)
# get count of each combination in the data
# data.groupby(nodes).size()

# replace strings with numbers and back
# data = data.replace(num_to_cat).replace(cat_to_num)
# data.sort_values('NoJourneys', ascending=False)

connections = [('Weekend', 'NoJourneys'), ('Season', 'NoJourneys'), ('Weather', 'RoadCond'), ('RoadCond', 'NoFatalities'),
               ('PoliceActivity', 'NoFatalities'), ('NoJourneys', 'NoAccidents'), ('Country', 'NoFatalities'), ('AvgSpeed', 'NoFatalities'),
               ('DangerLvl', 'NoFatalities'), ('NoAccidents', 'NoFatalities')]


In [6]:
# def EM(data, iter=1000):
import time
data = test_data.head(100)
iters = 1

null_idx = pd.isnull(data)
predicting = [] #.copy()
filling = data[data.isnull().any(axis=1)] #data[null_idx].copy()
model = BayesianModel(connections)
# model.fit(predicting, estimator=MaximumLikelihoodEstimator)
it = 0

#predrop
na_removed = {}
for k,row in filling.iterrows():
    #if null
    na_removed[k] = row.dropna(axis=0).to_frame().T

while it < iters and not filling.equals(predicting):
    it += 1
    predicting = filling.copy()
    
    mle = MaximumLikelihoodEstimator(model,data)
    [model.add_cpds(cpd) for cpd in mle.get_parameters()]
    
    filling[null_idx] = np.nan
    t0 = time.clock()
    print('fit: ', time.clock() - t0)
    for k,v in filling.iterrows():
#         v.dropna(axis=0, inplace=True)
        t0 = time.clock()
        value = model.predict(na_removed[k])
        print(na_removed[k])
        print('predict: ', time.clock() - t0)

        for col_name in value.columns:
            filling.at[k, col_name] = value[col_name].values[0]
    print('iteration: ', it, time.clock() - t0)
                
filling

fit:  4.999999999810711e-06
   Weekend Season Weather RoadCond PoliceActivity NoJourneys Country  \
0  holiday   fall    good     good        regular        low      UK   

  DangerLvl NoAccidents NoFatalities  
0       low        high       medium  
predict:  0.128737000000001
   Weekend Weather RoadCond PoliceActivity NoJourneys Country AvgSpeed  \
1  weekend     bad      bad        regular     medium  Europe      low   

  DangerLvl NoAccidents NoFatalities  
1       low      medium          low  
predict:  0.143478
   Weekend  Season RoadCond PoliceActivity NoJourneys Country DangerLvl  \
3  weekend  spring     good        regular     medium      UK       low   

  NoAccidents NoFatalities  
3         low          low  
predict:  0.14496300000000062
   Weekend  Season RoadCond PoliceActivity NoJourneys Country AvgSpeed  \
8  weekend  winter      bad        regular     medium      UK      low   

  DangerLvl NoAccidents NoFatalities  
8       low         low         high  
predict: 

    Weekend  Season Weather RoadCond PoliceActivity Country AvgSpeed  \
88  working  winter    good      bad      increased  Europe      low   

   DangerLvl NoAccidents NoFatalities  
88       low         low          low  
predict:  0.12578500000000048
    Weekend  Season Weather RoadCond PoliceActivity NoJourneys Country  \
89  working  winter     bad      bad        regular     medium      US   

   AvgSpeed NoAccidents NoFatalities  
89      low         low          low  
predict:  0.19879199999999742
    Season Weather RoadCond PoliceActivity Country AvgSpeed DangerLvl  \
94  summer     bad      bad        regular  Europe      low       low   

   NoAccidents NoFatalities  
94         low          low  
predict:  0.15721099999999666
    Weekend Season Weather RoadCond NoJourneys Country AvgSpeed DangerLvl  \
95  working   fall     bad      bad     medium  Europe      low       low   

   NoAccidents NoFatalities  
95         low          low  
predict:  0.1484240000000021
    Wee

KeyboardInterrupt: 

In [154]:
filling

Unnamed: 0,Weekend,Season,Weather,RoadCond,PoliceActivity,NoJourneys,Country,AvgSpeed,DangerLvl,NoAccidents,NoFatalities
0,holiday,fall,good,good,regular,low,UK,high,low,high,medium
1,weekend,fall,bad,bad,regular,medium,Europe,low,low,medium,low
3,weekend,spring,bad,good,regular,medium,UK,high,low,low,low
8,weekend,winter,bad,bad,regular,medium,UK,low,low,low,high
9,weekend,fall,good,bad,regular,low,Europe,low,high,low,medium
11,working,fall,good,good,increased,medium,US,high,high,low,low
18,weekend,winter,bad,bad,regular,low,US,low,high,medium,low
27,weekend,winter,bad,bad,increased,low,UK,low,low,medium,high
31,weekend,summer,good,good,regular,medium,US,low,low,high,low
33,weekend,fall,bad,bad,regular,low,Europe,high,low,low,low


In [186]:
model = BayesianModel(connections)
model.fit(data, estimator=BayesianEstimator, prior_type="BDeu")
# train_data.drop(train_data.columns[0], axis=1, inplace=True)
first_row = test_data.copy().head(1)
idx = first_row.isnull()
first_row.dropna(axis=1, inplace=True)
val = model.predict(first_row)
# print(val)
# print(idx)
# print(first_row)
first = test_data.head(1).copy()

for k,v in idx.items():
    print(k)
    if v[0]:
        first.at[0, k] = val
print(first)

# print(model.is_active_trail('Season', 'NoFatalities'))
# print(model.is_active_trail('Weather', 'NoAccidents', 'RoadCond'))
# print(model.is_active_trail('Season', 'Weekend', 'NoAccidents'))

# for c in connections:
#     for c2 in connections:
#         if c == c2:
#             continue
#         print(model.is_active_trail(c[0],c2[0]), c[0], c2[0])

Weekend
Season
Weather
RoadCond
PoliceActivity
NoJourneys
Country
AvgSpeed
DangerLvl
NoAccidents
NoFatalities
   Weekend Season Weather RoadCond PoliceActivity NoJourneys Country  \
0  holiday   fall    good     good        regular        low      UK   

                AvgSpeed DangerLvl NoAccidents NoFatalities  
0    AvgSpeed
0      low       low        high       medium  


In [354]:

import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                      columns=['A', 'B', 'C', 'D', 'E'])
train_data = values[:800]

predict_data = values[800:]
model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])
model.fit(values)
predict_data = predict_data.copy()
predict_data.drop('E', axis=1, inplace=True)
y_pred = model.predict(predict_data)
y_pred


Unnamed: 0,E
800,0
801,0
802,0
803,1
804,1
805,0
806,1
807,0
808,1
809,1


In [279]:
fat = 'NoFatalities'
# pe = ParameterEstimator(model, data)
# print(pe.state_counts(fat))

# mle = MaximumLikelihoodEstimator(model, data) #.sample(200)
# print(mle.estimate_cpd(fat))
# model.fit(data, estimator=BayesianEstimator, prior_type="BDeu")
# model.get_independencies()
# cpds = model.get_cpds()
model.predict(test_data)

# print(len(cpds))
# print(cpds[6])

# for cpd in cpds:
#     cpd.
#     print(cpd.get_values())


# q = mle.estimate_cpd(fat)
# be = BayesianEstimator(model, data)
# print(be.estimate_cpd('NoFatalities',prior_type='BDeu', equivalent_sample_size=10))

# print(q.get_values())
# flat = [x for sublist in q.get_values() for x in sublist]

# from pgmpy.factors.discrete import JointProbabilityDistribution as JPD
# prob = JPD(q.variables, q.cardinality, q.get_values())


ValueError: Data has variables which are not in the model