In [66]:
import sympy as sy
import scipy as sp
from scipy import optimize as opt
import numpy as np
import warnings
from IPython.core.interactiveshell import InteractiveShell
warnings.filterwarnings('ignore')
InteractiveShell.ast_node_interactivity = "all"

## Cann a linear model learn the AND and XOR functions?

\begin{equation}
\mathbf{X} = \begin{bmatrix}
1 & 1 & 1\\
1 & 1 & 0\\
1 & 0 & 1\\
1 & 0 & 0\\
\end{bmatrix},
\mathbf{y}_{and} = \begin{bmatrix}
1 \\
0 \\
0 \\
0 \\ \end{bmatrix}
\end{equation}

\begin{equation}
\mathbf{X} = \begin{bmatrix}
1 & 1 & 1\\
1 & 1 & 0\\
1 & 0 & 1\\
1 & 0 & 0\\
\end{bmatrix},
\mathbf{y}_{xor} = \begin{bmatrix}
0 \\
1 \\
1 \\
0 \\ \end{bmatrix}
\end{equation}

Will use mean squared error for now, but notice binary ouput should be measured with binary cross entropy

In [67]:
X = sy.Matrix([[1,1,1],
               [1,1,0],
               [1,0,1],
               [1,0,0]])
y_xor = sy.Matrix([[0],
                   [1],
                   [1],
                   [0]])
y_and = sy.Matrix([[1],
                   [0],
                   [0],
                   [0]])
w1,w2,w3 = sy.symbols("w_1,w_2,w_3")
w_xor = sy.Matrix([[w1],[w2],[w3]])

The assumed model is then:

\begin{equation}
\mathbf{X}\mathbf{w} = \begin{bmatrix}
1 & 1 & 1\\
1 & 1 & 0\\
1 & 0 & 1\\
1 & 0 & 0\\
\end{bmatrix} 
\begin{bmatrix}
w_1 \\
w_2 \\
w_3 \\ \end{bmatrix} =\mathbf{y}
\end{equation}

defining cost function for linear model:

In [77]:
def J(w,X, Y):
    '''Cost function for linear model with mse'''
    m = (1/4 * (X@w -Y).T @ (X@w - Y)).mean(0) 
    return sum(m) /len(m)


def opts(J,w0, X,Y):
    return opt.minimize(J, w0, args=(X,Y))


def predictor(x,w_s):
    return X@w_s

def results(y, fixed_init=True):
    w0_r = np.random.uniform(low=0.0, high=1.0, size=(3,1))
    w0_f = np.array([[1/2],
                     [1/2],
                     [1/2]])
    w0 = w0_f if fixed_init else w0_r
    res = opts(J,
             w0,
             np.array(X).astype(np.float),
             np.array(y).astype(np.float))
    y_hat = predictor(X,res.x)
    print("True:\n", y)
    print("Pred:\n", sy.Matrix(y_hat))
    print("\n")
    
for gate,y in zip(["AND","XOR"],[y_and, y_xor]):
    print("*"*10,"Gate:", gate,"*"*10)
    results(y)
# Grad(J(X,w_xor,y_xor), w_xor)

********** Gate: AND **********
True:
 Matrix([[1], [0], [0], [0]])
Pred:
 Matrix([[0.499999998947900], [0.249999995373210], [0.249999995373210], [-8.20147927349524e-9]])


********** Gate: XOR **********
True:
 Matrix([[0], [1], [1], [0]])
Pred:
 Matrix([[0.833333337170653], [0.500000001636226], [0.500000001636226], [0.166666666101800]])




### Notice!
no matter how you initialise the optimization process, the XOR function will always have a degeneracy (equal output values for different inputs) and hence it can not be modelled with a shallow model.  

Think about it this way: Look the the values of the pred matrix. Where can you cut the axis on which this values sit, in order to group and separate the outputs to either side of the cut? Impossible for the XOR gate with this model.