In [1]:
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
%matplotlib notebook

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [23]:
h = 1
sd = 1
n = 5

def gen_data(n, h, sd1, sd2):
    x1 = ss.norm.rvs(-h, sd1, n)
    y1 = ss.norm.rvs(0, sd1, n)
    x2 = ss.norm.rvs(h, sd2, n)
    y2 = ss.norm.rvs(0, sd2, n)
    return (x1, y1, x2, y2)

In [24]:
(x1, y1, x2, y2) = gen_data(50, 1, 1, 1.5)

In [25]:
(x1, y1, x2, y2) = gen_data(1000, 1.5, 1, 1.5)

In [66]:
def plot_data(x1, y1, x2, y2):
    plt.figure()
    plt.plot(x1, y1, "o", ms=2)
    plt.plot(x1, y1, "o", ms=2)
    plt.xlabel("$X_1$")
    plt.ylabel("$X_2$")

In [67]:
plot_data(x1, y1, x2, y2)

<IPython.core.display.Javascript object>

In [62]:
def prob_to_odds(p):
    if p <= 0 or p >= 1:
        print("Probabilities must be between 0 and 1.")
    return p / (1-p) 

In [63]:
prob_to_odds(0.2)

0.25

In [64]:
prob_to_odds(0.8)

4.000000000000001

In [65]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [32]:
np.vstack((x1, y1,))

array([[-0.51325014, -2.93108707, -1.25555141, ...,  2.56600176,
        -1.10705187, -2.51373317],
       [ 0.66816332,  0.58049432,  1.05374566, ..., -0.80583291,
        -0.24854016, -0.4581896 ]])

In [33]:
np.vstack((x1, y1,)).shape

(2, 1000)

In [34]:
np.vstack((x1, y1,)).T.shape

(1000, 2)

In [35]:
X = np.vstack((np.vstack((x1, y1,)).T, np.vstack((x2, y2,)).T))

In [36]:
X.shape

(2000, 2)

In [39]:
n = 1000
y = np.hstack((np.repeat(1,n), np.repeat(2,n)))

In [40]:
y.shape

(2000,)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=1)

In [42]:
X_train.shape

(1000, 2)

In [43]:
X_test.shape

(1000, 2)

In [44]:
y_test.shape

(1000,)

In [45]:
y_train.shape

(1000,)

In [46]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
clf.score(X_test, y_test)

0.882

In [50]:
clf.predict_proba(np.array([-2, 0]).reshape(1, -1))

array([[0.96846351, 0.03153649]])

In [51]:
clf.predict(np.array([-2, 0]).reshape(1, -1))

array([1])

In [52]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
def plot_probs(ax, clf, class_no):
    xx1, xx2 = np.meshgrid(np.arange(-5, 5,0, 0.1), np.arange(-5,5,0.1))
    probs = clf.predict_proba(np.stack((xx1.ravel(), xx2.ravel()), axis=1))
    z = probs[:,class_no]
    z = z.reshape(xx1.shape)
    CS = ax.contourf(xx1, xx2, z)
    cbar = plt.colorbar(CS)
    plt.xlabel("$X_1$")
    plt.ylabel("$X_2$")

In [69]:
plt.figure(figsize=(5,8))
ax = plt.subplot(211)
plot_probs(ax, clf, 0)
plt.title("Pred. prob for class 1")
ax = plt.subplot(212)
plot_probs(ax, clf, 1)
plot_probs(ax, clf, 1)
plt.title("Pred. prob for class 2")

<IPython.core.display.Javascript object>

TypeError: data type not understood