In [11]:
import pymongo
import operator
import numpy as np
from pymongo import MongoClient
import pandas as pd

In [12]:
# Connect to the local MongoDB server and select the terrain collection
# of the "nesoi" database.
client = MongoClient(host='localhost', port=27017)
db = client['nesoi']
terrain = db['terrain']
# NOTE(review): newer PyMongo releases deprecate Collection.count() in favour
# of count_documents({}) - confirm against the installed version.
terrain.count() # Should be 22639

22639

In [13]:
def idxToCoord(idx, width=16):
    """Convert a flat cell index into (x, y) grid coordinates.

    Parameters:
        idx: integer cell index, counted row by row.
        width: number of cells per row (defaults to 16).

    Returns:
        Tuple (x, y) where x is the column (idx modulo width) and y is the
        row (idx floor-divided by width).
    """
    # divmod performs floor division, so y stays an integer on Python 3 as
    # well as Python 2 (plain `/` on ints yields a float on Python 3).
    y, x = divmod(idx, width)
    return x, y

def getPairs(idx, idx2):
    """Collect the terrain pairs observed for two cell indices.

    For every document in the `terrain` collection whose "idx" equals `idx`,
    look up the document located at the x/y offset that separates `idx2`
    from `idx`, and record the terrain of both documents.

    Returns:
        DataFrame with columns "Cell0" (terrain of the `idx` document),
        "Cell1" (terrain of the document at the offset position) and
        "Count" (always 1, so rows can be summed when pivoting).
    """
    # The x/y offset between idx and idx2 is the same for every document,
    # so compute it once instead of on each loop iteration.
    x, y = idxToCoord(idx)
    x2, y2 = idxToCoord(idx2)
    delx = x2 - x
    dely = y2 - y

    data = []
    for cell in terrain.find({"idx": idx}):
        # Look up the document at the same relative offset from this one.
        cell2 = terrain.find_one({"x": cell["x"] + delx, "y": cell["y"] + dely})
        if cell2 is None:
            # Nothing stored at the offset position: no pair to record.
            continue

        data.append((cell["terrain"], cell2["terrain"]))

    # Naming the columns in the constructor also works when `data` is empty,
    # where assigning df.columns afterwards would raise a length mismatch.
    df = pd.DataFrame(data, columns=["Cell0", "Cell1"])
    df['Count'] = 1

    return df

In [14]:
def createObservationDisribution(idx, idx2):
    """Build the observation count table for a pair of cell indices.

    Parameters:
        idx: cell index whose terrain forms the rows ("Cell0").
        idx2: cell index whose terrain forms the columns ("Cell1").

    Returns:
        Pivot table of the pairs returned by getPairs(idx, idx2), indexed by
        "Cell0" with one column per "Cell1" value; each entry is the summed
        "Count" for that combination.
    """
    # Use the requested indices rather than a hard-coded (0, 1) pair.
    df = getPairs(idx, idx2)
    df_pivot = df.pivot_table(index='Cell0', columns='Cell1', values='Count', aggfunc='sum')
    return df_pivot
    

In [15]:
# Observation counts for cell index 0 (rows) against cell index 1 (columns)
df_pivot = createObservationDisribution(0, 1)

In [16]:
# Display the observation count table
df_pivot

Cell1,bushGreen,bushRed,dirt,mountainGreen,mountainRed,mountainWhite,sand,waterGreen,waterRed
Cell0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bushGreen,15.0,,3.0,,,,,,
bushRed,,10.0,,,,,,,
mountainGreen,,,,14.0,,,,,
mountainRed,,,,,64.0,,1.0,,
mountainWhite,,,,,,8.0,,,
waterGreen,,,,,,,,6.0,
waterRed,,,,,3.0,,,,4.0


In [17]:
# Change observations into probabilities: divide every count by the grand
# total and treat combinations that were never observed as probability 0.
total_observations = df_pivot.sum().sum()
df_pivot_prob = (df_pivot / total_observations).fillna(0)
df_pivot_prob

Cell1,bushGreen,bushRed,dirt,mountainGreen,mountainRed,mountainWhite,sand,waterGreen,waterRed
Cell0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bushGreen,0.117188,0.0,0.023438,0.0,0.0,0.0,0.0,0.0,0.0
bushRed,0.0,0.078125,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mountainGreen,0.0,0.0,0.0,0.109375,0.0,0.0,0.0,0.0,0.0
mountainRed,0.0,0.0,0.0,0.0,0.5,0.0,0.007812,0.0,0.0
mountainWhite,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0
waterGreen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046875,0.0
waterRed,0.0,0.0,0.0,0.0,0.023438,0.0,0.0,0.0,0.03125


## Background

If *A* and *B* are dependent events, the probability of both events happening can be calculated as:

**P(A ∩ B) = P(A) * P(B|A)**

If *A* and *B* are independent events, the probability of both events happening can be calculated as:

**P(A ∩ B) = P(A) * P(B)**

Conditional Probability for two dependent events is

**P(B | A) = P(B ∩ A) / P(A)**

Conditional probability for two independent events is

**P(B | A) = P(B)**


## Experiment 1

The cell0 value is inspected. The cell1 value is inspected. What is the probability that cell0 value will be mountainRed and cell1 value will be mountainRed?

We can use the observation distribution values to see how often this happens

In [18]:
# Number of observations where cell0 and cell1 are both mountainRed
df_pivot.loc["mountainRed", "mountainRed"] 

64.0

Turn this into a probability by dividing by the total number of observed pairs

In [19]:
# Joint probability P(cell0=mountainRed, cell1=mountainRed): the count of that
# combination divided by the sum of all counts in the table.
c0mr_intersection_c1m = df_pivot.loc["mountainRed", "mountainRed"]  / df_pivot.sum().sum()
c0mr_intersection_c1m

0.5

Now we know the probability from the observation distribution. But are these two cells dependent or independent?

If the cells are independent then P(cell0=mountainRed ∩ cell1=mountainRed) = P(cell0=mountainRed) * P(cell1=mountainRed)

### What is P(A=a)
Let's find the probability of just cell0=mountainRed. Looking at the observations we have

In [20]:
# Per-terrain totals for cell0 (sum across each row), then the grand total.
# The parenthesised print form runs on both Python 2 and Python 3.
print(df_pivot.sum(1))
print(df_pivot.sum(1).sum())

Cell0
bushGreen        18.0
bushRed          10.0
mountainGreen    14.0
mountainRed      65.0
mountainWhite     8.0
waterGreen        6.0
waterRed          7.0
dtype: float64
128.0


In [21]:
# P(cell0 = mountainRed): the mountainRed row total over the grand total
cell0_totals = df_pivot.sum(1)
c0_mr = cell0_totals["mountainRed"] / cell0_totals.sum()
c0_mr

0.5078125

### What is P(B=b)

We can do the same to find when *cell1* = *mountainRed*

In [22]:
# Per-terrain totals for cell1 (sum down each column), then the grand total.
# The parenthesised print form runs on both Python 2 and Python 3.
print(df_pivot.sum(0))
print(df_pivot.sum(0).sum())

Cell1
bushGreen        15.0
bushRed          10.0
dirt              3.0
mountainGreen    14.0
mountainRed      67.0
mountainWhite     8.0
sand              1.0
waterGreen        6.0
waterRed          4.0
dtype: float64
128.0


In [23]:
# P(cell1 = mountainRed): the mountainRed column total over the grand total
cell1_totals = df_pivot.sum(0)
c1_mr = cell1_totals["mountainRed"] / cell1_totals.sum()
c1_mr

0.5234375

### What is P(B=b|A=a)

We can look at the observations of cell1 when cell0 = mountainRed and calculate when cell1 is mountainRed

In [123]:
# Observations of cell1 restricted to the rows where cell0 is mountainRed.
# Look the row up once and reuse it; the parenthesised print form runs on
# both Python 2 and Python 3.
c0mr_row = df_pivot.loc["mountainRed"]
print(c0mr_row)
print(c0mr_row.sum())

# P(cell1=mountainRed | cell0=mountainRed): share of that row that is mountainRed
c1mr_given_c0mr = c0mr_row["mountainRed"] / c0mr_row.sum()
print(c1mr_given_c0mr)


Cell1
bushGreen         NaN
bushRed           NaN
dirt              NaN
mountainGreen     NaN
mountainRed      64.0
mountainWhite     NaN
sand              1.0
waterGreen        NaN
waterRed          NaN
Name: mountainRed, dtype: float64
65.0
0.984615384615


### Are P(A=a) and P(B=b) independent

If they are independent then P(A ∩ B) = P(A) * P(B)

Multiplying the two does not give the joint probability of 0.5 that we observed, so the two cells are not independent

In [24]:
# If the cells were independent, the joint probability would equal
# P(cell0=mountainRed) * P(cell1=mountainRed).
independent_estimate = c0_mr * c1_mr
print(independent_estimate)
# Compare floats with a tolerance; exact == on floating-point products can
# fail (or pass) because of rounding alone.
print(np.isclose(independent_estimate, c0mr_intersection_c1m))

0.265808105469
False


### Are P(A=a) and P(B=b) dependent

If they are dependent then P(A ∩ B) = P(A) * P(B|A)

Multiplying P(cell0=mountainRed) and P(cell1=mountainRed | cell0=mountainRed) gives:

In [126]:
# P(cell0=mountainRed) * P(cell1=mountainRed | cell0=mountainRed) should
# reproduce the observed joint probability. Compare with a tolerance because
# exact == on floating-point products is sensitive to rounding.
np.isclose(c0_mr * c1mr_given_c0mr, c0mr_intersection_c1m)

True