In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.DataFrame({"x0": [1, 2, 3, 4, 5], 
                    "x1": [0.01, -0.01, 0.25, -4.1, 0], 
                    "y": [-1.5, 0, 3.6, 1.3, -2]})
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [3]:
data.columns

Index(['x0', 'x1', 'y'], dtype='object')

In [4]:
data.to_numpy

<bound method DataFrame.to_numpy of    x0    x1    y
0   1  0.01 -1.5
1   2 -0.01  0.0
2   3  0.25  3.6
3   4 -4.10  1.3
4   5  0.00 -2.0>

In [7]:
pd.DataFrame(data.to_numpy(), 
            columns=["one", "two", "three"])

Unnamed: 0,one,two,three
0,1.0,0.01,-1.5
1,2.0,-0.01,0.0
2,3.0,0.25,3.6
3,4.0,-4.1,1.3
4,5.0,0.0,-2.0


In [8]:
df3 = data.copy()
df3["string"] = list("abcde")
df3

Unnamed: 0,x0,x1,y,string
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,c
3,4,-4.1,1.3,d
4,5,0.0,-2.0,e


In [9]:
df3.to_numpy()

array([[1, 0.01, -1.5, 'a'],
       [2, -0.01, 0.0, 'b'],
       [3, 0.25, 3.6, 'c'],
       [4, -4.1, 1.3, 'd'],
       [5, 0.0, -2.0, 'e']], dtype=object)

In [10]:
data["category"] = pd.Categorical(["a", "b", "a", "a", "b"], 
                                 categories=["a", "b"])
data

Unnamed: 0,x0,x1,y,category
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,a
3,4,-4.1,1.3,a
4,5,0.0,-2.0,b


In [11]:
dummies = pd.get_dummies(data.category, prefix = "category")
dummies

Unnamed: 0,category_a,category_b
0,True,False
1,False,True
2,True,False
3,True,False
4,False,True


In [12]:
data_with_dummies = data.drop("category", axis = 1).join(dummies)
data_with_dummies

Unnamed: 0,x0,x1,y,category_a,category_b
0,1,0.01,-1.5,True,False
1,2,-0.01,0.0,False,True
2,3,0.25,3.6,True,False
3,4,-4.1,1.3,True,False
4,5,0.0,-2.0,False,True


## Creating Model Descriptions with Patsy

In [16]:
data = pd.DataFrame({
    "x0": [1, 2, 3, 4, 5],
    "x1": [0.01, -0.01, 0.25, -4.1, 0],
    "y": [-1.5, 0, 3.6, 1.3, -2]
})
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [17]:
import patsy

In [19]:
y, X = patsy.dmatrices("y ~ x0 + x1", data)

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)


In [20]:
y

DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

In [21]:
X

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

In [22]:
np.asarray(y)

array([[-1.5],
       [ 0. ],
       [ 3.6],
       [ 1.3],
       [-2. ]])

In [23]:
np.array(X)

array([[ 1.  ,  1.  ,  0.01],
       [ 1.  ,  2.  , -0.01],
       [ 1.  ,  3.  ,  0.25],
       [ 1.  ,  4.  , -4.1 ],
       [ 1.  ,  5.  ,  0.  ]])

In [24]:
coef, resid, _, _ = np.linalg.lstsq(X, y)

  coef, resid, _, _ = np.linalg.lstsq(X, y)


In [25]:
coef

array([[ 0.31290976],
       [-0.07910564],
       [-0.26546384]])

In [26]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [28]:
rng = np.random.default_rng(seed = 12345)


def dnorm(mean, variance, size = 1):
    if isinstance(size, int):
        size = size
    return mean + np.sqrt(variance) * rng.standard_normal(size)

N = 100
X = np.c_[dnorm(0, 0.4, size = N), 
         dnorm(0, 0.6, size = N), 
         dnorm(0, 0.2, size = N)]

eps = dnorm(0, 0.1, size = N)
beta = [0.1, 0.3, 0.5]

y = np.dot(X, beta) + eps

In [30]:
len(X)

100

In [32]:
X[:5]

array([[-0.90050602, -0.18942958, -1.0278702 ],
       [ 0.79925205, -1.54598388, -0.32739708],
       [-0.55065483, -0.12025429,  0.32935899],
       [-0.16391555,  0.82403985,  0.20827485],
       [-0.04765129, -0.21314698, -0.04824364]])

In [34]:
X_model = sm.add_constant(X)
X_model[:5]

array([[ 1.        , -0.90050602, -0.18942958, -1.0278702 ],
       [ 1.        ,  0.79925205, -1.54598388, -0.32739708],
       [ 1.        , -0.55065483, -0.12025429,  0.32935899],
       [ 1.        , -0.16391555,  0.82403985,  0.20827485],
       [ 1.        , -0.04765129, -0.21314698, -0.04824364]])

In [35]:
model = sm.OLS(y, X)

In [36]:
results = model.fit()
results

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f0d858a9090>

In [37]:
results.params

array([0.06681503, 0.26803235, 0.45052319])

In [38]:
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.469
Model:,OLS,Adj. R-squared (uncentered):,0.452
Method:,Least Squares,F-statistic:,28.51
Date:,"Sun, 29 Oct 2023",Prob (F-statistic):,2.66e-13
Time:,14:53:54,Log-Likelihood:,-25.611
No. Observations:,100,AIC:,57.22
Df Residuals:,97,BIC:,65.04
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0668,0.054,1.243,0.217,-0.040,0.174
x2,0.2680,0.042,6.313,0.000,0.184,0.352
x3,0.4505,0.068,6.605,0.000,0.315,0.586

0,1,2,3
Omnibus:,0.435,Durbin-Watson:,1.869
Prob(Omnibus):,0.805,Jarque-Bera (JB):,0.301
Skew:,0.134,Prob(JB):,0.86
Kurtosis:,2.995,Cond. No.,1.64


In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("../../github/pydata-book/datasets/titanic/train.csv")
test = pd.read_csv("../../github/pydata-book/datasets/titanic/test.csv")

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [11]:
impute_value = train["Age"].median()
train["Age"] = train["Age"].fillna(impute_value)
test["Age"] = test["Age"].fillna(impute_value)

In [12]:
train["IsFemale"] = (train["Sex"] == "female").astype(int)
test["IsFemale"] = (train["Sex"] == "female").astype(int)

In [17]:
predictors = ["Pclass", "IsFemale", "Age"]
X_train = train[predictors].to_numpy()
X_test = test[predictors].to_numpy()

In [27]:
y_train = train["Survived"].to_numpy()
y_test = test["Survived"].to_numpy()
X_train[:5]

KeyError: 'Survived'

In [19]:
y_train[:5]

array([0, 1, 1, 1, 0])

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
model = LogisticRegression()

In [23]:
model.fit(X_train, y_train)

In [24]:
y_predict = model.predict(X_test)
y_predict[:5]

array([0, 0, 1, 1, 0])