In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "ticks" style as the plotting theme for this notebook
sns.set(style="ticks")

In [None]:
# Load seaborn's example dataset for Anscombe's quartet.
# The cells below rely on its `dataset`, `x` and `y` columns.
df = sns.load_dataset("anscombe")

In [None]:
# Show the loaded frame as this cell's output
df

In [None]:
# Raw scatter of every dataset in its own panel. All panels share the same
# axis limits so they can be compared directly.
fig, ax = plt.subplots(2,2,figsize=(8,6))
ax = ax.ravel()

for i,val in enumerate(df.dataset.unique()):

    # select this dataset's rows once instead of evaluating the mask per column
    subset = df.loc[df.dataset == val]
    x = subset['x']
    y = subset['y']

    ax[i].plot(x, y, 'ro')
    ax[i].set_title('Dataset %s' % val)  # say which dataset the panel shows
    ax[i].set_xlabel('x')
    ax[i].set_ylabel('y')
    ax[i].set_xlim([0,20])
    ax[i].set_ylim([0,15])

# keep the added titles and axis labels from overlapping neighbouring panels
plt.tight_layout()

In [None]:
# One linear-regression panel per dataset, coloured by dataset,
# with all other lmplot options left at their defaults
sns.lmplot(
    x="x",
    y="y",
    col="dataset",
    hue="dataset",
    data=df,
);

In [None]:
# Same per-dataset regression as above, restyled: panels wrapped two per row,
# confidence interval turned off (ci=None), "muted" palette, and larger,
# fully opaque scatter markers.
# The trailing ';' keeps the FacetGrid repr out of the cell output.
sns.lmplot(data=df, x="x", y="y",
           col="dataset", hue="dataset",
           col_wrap=2, ci=None,
           palette="muted", height=4,
           scatter_kws={"s": 50, "alpha": 1});

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

In [None]:
# Dataset I only, with x and y as (n, 1) column vectors for scikit-learn
first = df[df.dataset == 'I']
x = first['x'].to_numpy().reshape(-1, 1)
y = first['y'].to_numpy().reshape(-1, 1)

# fit a straight line to the points
model = LinearRegression().fit(x, y)

# evaluate the fitted line on an even 100-point grid covering the plotted x-range
x_new = np.linspace(0, 20, 100)[:, np.newaxis]
y_new = model.predict(x_new)

# data as a scatter, fitted line drawn over it
fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(x, y)
ax.plot(x_new, y_new)
ax.set(xlabel='x', ylabel='y', xlim=[0, 20], ylim=[0, 15])

plt.show()

In [None]:
# Fit and draw a separate straight-line model for each dataset, one per panel
fig, ax = plt.subplots(2,2,figsize=(8, 6))
ax = ax.ravel()

# make new data for a line; the grid is the same for every panel,
# so it is built once outside the loop
x_new = np.linspace(0, 20, 100).reshape(-1, 1)

for i,val in enumerate(df.dataset.unique()):

    # this dataset's rows, selected once, as column vectors for scikit-learn
    subset = df.loc[df.dataset == val]
    x = subset['x'].values.reshape(-1, 1)
    y = subset['y'].values.reshape(-1, 1)

    # modeling a best-fit line
    model = LinearRegression()
    model.fit(x, y)

    y_new = model.predict(x_new)

    # plot the results
    ax[i].scatter(x, y)
    ax[i].plot(x_new, y_new)
    ax[i].set_title('Dataset %s' % val)  # say which dataset the panel shows
    ax[i].set_xlabel('x')
    ax[i].set_ylabel('y')
    ax[i].set_xlim([0,20])
    ax[i].set_ylim([0,15])

plt.tight_layout()
plt.show()

In [None]:
import ipywidgets
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def findline(dataset='I'):
    """Fit a straight line to one Anscombe dataset, print the fit and plot it.

    Prints the fitted slope and intercept, the mean squared error and R^2 of
    the fit on the same points, then draws the points and the fitted line on
    a new 5x5-inch figure with fixed axis limits.

    Parameters
    ----------
    dataset : str, default 'I'
        Value of the ``dataset`` column of the global ``df`` that selects
        the rows to fit.
    """

    fig, ax = plt.subplots(figsize=(5, 5))

    x = df.loc[df.dataset == dataset, 'x'].values.reshape(-1, 1)
    y = df.loc[df.dataset == dataset, 'y'].values.reshape(-1, 1)

    if dataset == 'IV':
        # For dataset IV the recorded y values are replaced by the exact line
        # y = 3 + 0.5x before fitting and plotting.
        # NOTE(review): the notebook does not say why -- confirm this is intended.
        # A new array is bound instead of assigning through `y[:]`, so nothing
        # is written in place into an array obtained from the DataFrame (such
        # arrays can be read-only when pandas copy-on-write is active).
        y = 3 + 0.5*x

    # modeling a best-fit line
    model = LinearRegression()
    model.fit(x, y)

    # make new data for a line
    x_new = np.linspace(0, 20, 100).reshape(-1, 1)
    y_new = model.predict(x_new)

    y_pred = model.predict(x)

    # y is 2-D, so coef_ and intercept_ come back as one-element arrays.
    # Pull the scalar out explicitly: letting '%.2f' convert an array with
    # ndim > 0 to float is deprecated in recent NumPy releases.
    slope = float(np.ravel(model.coef_)[0])
    intercept = float(np.ravel(model.intercept_)[0])
    print('The coefficients of the line are: %.2f, %.2f' % (slope, intercept))
    print('Mean squared error: %.2f' % mean_squared_error(y, y_pred))
    print('R^2: %.2f' % r2_score(y, y_pred))

    # plot the results
    ax.scatter(x, y)
    ax.plot(x_new, y_new)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_xlim([0,20])
    ax.set_ylim([0,15])

# dropdown over the dataset names; the function re-runs on every selection
ipywidgets.interactive(findline, dataset=df.dataset.unique())

In [None]:
from IPython.display import clear_output
out = ipywidgets.Output()

def findline(dataset='I'):
    """Fit a straight line to one Anscombe dataset and overlay it on a jointplot.

    Prints the fitted slope and intercept, the mean squared error and R^2 of
    the fit, then draws a seaborn jointplot of the selected rows of the global
    ``df`` with the fitted line added to the joint axes.

    Parameters
    ----------
    dataset : str, default 'I'
        Value of the ``dataset`` column of the global ``df`` that selects
        the rows to fit and plot.
    """

    # NOTE(review): `out` is created above but is not displayed anywhere in
    # this cell; output captured by `with out:` may therefore not be visible.
    # Confirm, and display `out` next to the control if so.
    with out:
        clear_output(wait=True)

        x = df.loc[df.dataset == dataset, 'x'].values.reshape(-1, 1)
        y = df.loc[df.dataset == dataset, 'y'].values.reshape(-1, 1)

        if dataset == 'IV':
            # For dataset IV the model is fitted to the exact line
            # y = 3 + 0.5x instead of the recorded y values, so the printed
            # metrics describe that substituted data, while the jointplot
            # below still shows the recorded values from `df`.
            # NOTE(review): the notebook does not say why -- confirm intended.
            # A new array is bound instead of assigning through `y[:]`, so
            # nothing is written in place into an array obtained from the
            # DataFrame (such arrays can be read-only when pandas
            # copy-on-write is active).
            y = 3 + 0.5*x

        # modeling a best-fit line
        model = LinearRegression()
        model.fit(x, y)

        # make new data for a line
        x_new = np.linspace(0, 20, 100).reshape(-1, 1)
        y_new = model.predict(x_new)

        y_pred = model.predict(x)

        # y is 2-D, so coef_ and intercept_ come back as one-element arrays.
        # Pull the scalar out explicitly: letting '%.2f' convert an array
        # with ndim > 0 to float is deprecated in recent NumPy releases.
        slope = float(np.ravel(model.coef_)[0])
        intercept = float(np.ravel(model.intercept_)[0])
        print('The coefficients of the line are: %.2f, %.2f' % (slope, intercept))
        print('Mean squared error: %.2f' % mean_squared_error(y, y_pred))
        print('R^2: %.2f' % r2_score(y, y_pred))

        # plot the results: joint scatter with marginals, fitted line on top.
        # The grid gets its own name so the data array `x` is not shadowed.
        grid = sns.jointplot(data=df[df.dataset == dataset], x='x', y='y')
        ax = grid.ax_joint
        ax.set_xlim([0,20])
        ax.set_ylim([0,15])

        ax.plot(x_new,y_new)

# dropdown over the dataset names; the function re-runs on every selection
ipywidgets.interactive(findline, dataset=df.dataset.unique())

In [None]:
import pandas as pd

In [None]:
# Read the tab-separated Datasaurus Dozen table from the local data/ folder
dinodf = pd.read_csv('data/DatasaurusDozen.tsv',sep='\t')

In [None]:
# Show the loaded table as this cell's output
dinodf

In [None]:
# Scatter of only the rows labelled 'dino', on a square 5x5-inch figure
dino_points = dinodf.loc[dinodf.dataset == 'dino']
dino_fig, dino_ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(x='x', y='y', data=dino_points, ax=dino_ax);

In [None]:
out = ipywidgets.Output()
def snsdino(dataset='dino'):
    """Draw a jointplot of one Datasaurus dataset annotated with summary statistics.

    Computes the mean and standard deviation of x and y and their correlation
    coefficient for the selected rows of the global ``dinodf``, and writes
    them as text at the right-hand edge of the jointplot's figure.

    Parameters
    ----------
    dataset : str, default 'dino'
        Value of the ``dataset`` column of ``dinodf`` that selects the rows
        to plot and summarise.
    """

    # NOTE(review): `out` is created above but is not displayed anywhere in
    # this cell; output captured by `with out:` may therefore not be visible.
    # Confirm, and display `out` next to the control if so.
    with out:
        clear_output(wait=True)

        # rows of the requested dataset, selected once and reused below
        subset = dinodf[dinodf.dataset==dataset]

        g = sns.jointplot(data=subset, x='x', y='y', marginal_kws=dict(bins=15))
        x = subset['x'].values
        y = subset['y'].values
        xmean = x.mean()
        ymean = y.mean()
        # ndarray.std() with NumPy's default ddof=0
        xstd = x.std()
        ystd = y.std()
        corr = np.corrcoef(x,y)[0,1]
        msg = '''
        X Mean: {:.2f}
        Y Mean: {:.2f}
        X StdDev: {:.2f}
        Y StdDev: {:.2f}
        Corr: {:.2f}'''.format(xmean,ymean,xstd,ystd,corr)

        # Attach the text to the jointplot's own figure instead of going
        # through pyplot's implicit "current" axes/figure. Figure.text places
        # text in figure coordinates, so x=1.0 is the figure's right edge.
        ax = g.ax_joint
        ax.figure.text(1.0, 0.5, msg, fontsize=14)

# dropdown over the dataset names; the function re-runs on every selection
ipywidgets.interactive(snsdino, dataset = dinodf.dataset.unique())