# Chapter 12: Introduction to Modeling Libraries in Python
## Load the Libraries


In [None]:
import numpy as np
import pandas as pd
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf

## 12.1 Interfacing Between pandas and Model Code

In [None]:
# Toy dataset: two predictor columns (x0, x1) and one response column (y).
data = pd.DataFrame(dict(
    x0=[1, 2, 3, 4, 5],
    x1=[0.01, -0.01, 0.25, -4.1, 0.0],
    y=[-1.5, 0.0, 3.6, 1.3, -2.0],
))
data

In [None]:
# Convert the DataFrame to a NumPy array (every column here is numeric).
data.to_numpy()

In [None]:
# Derive a new frame with an extra string column; `data` itself is untouched.
df3 = data.assign(strings=['a', 'b', 'c', 'd', 'e'])
print(df3)
# Array view of the frame that now mixes numeric and string columns.
df3.to_numpy()

In [None]:
# Restrict the array to the columns that feed the model.
model_cols = ['x0', 'x1']
data[model_cols].to_numpy()

In [None]:
# Attach a categorical column with an explicit category set
# (this adds the column to `data` in place).
data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],
                                  categories=['a', 'b'])
data

In [None]:
# One-hot encode the categorical column as float indicator columns (named
# with the 'category' prefix), then join them in place of the original column.
dummies = pd.get_dummies(data['category'], prefix='category', dtype=float)
data_with_dummies = data.drop(columns='category').join(dummies)
data_with_dummies

## 12.2 Creating Model Descriptions with Patsy

In [None]:
# Rebuild the example frame with only the numeric columns x0, x1 and y.
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})

# The formula's left-hand side becomes the response matrix `y`; the
# right-hand side becomes the design matrix `X`.
y, X = patsy.dmatrices('y ~ x0 + x1', data=data)

In [None]:
y

In [None]:
X

In [None]:
np.asarray(y)

In [None]:
np.asarray(X)

In [None]:
# Adding '+ 0' to the formula suppresses the intercept column;
# [1] selects the design matrix from the (y, X) pair.
patsy.dmatrices('y ~ x0 + x1 + 0', data=data)[1]

In [None]:
# Ordinary least squares via NumPy. lstsq returns (solution, residuals,
# rank, singular values); only the first two are kept here.
coef, resid, _, _ = np.linalg.lstsq(X, y, rcond=None)
coef

In [None]:
# Label each fitted coefficient with the name of its design-matrix column;
# squeeze() drops the length-1 axis so the values form a 1-D Series.
coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)
coef

### Data Transformations in Patsy Formulas

In [None]:
# Python expressions (here NumPy calls) can be used directly inside a formula;
# the second design column is log(|x1| + 1).
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data=data)
X

In [None]:
# standardize() and center() are patsy built-in transforms applied to x0 and
# x1 respectively while the design matrix is being built.
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data=data)
X

In [None]:
# Out-of-sample rows with the same column names as the original data.
new_data = pd.DataFrame({
    'x0': [6, 7, 8, 9],
    'x1': [3.1, -0.5, 0, 2.3],
    'y': [1, 2, 3, 4]})

# Re-apply the design (including any transform state saved in
# X.design_info) to the new rows. A list of design infos goes in, so the
# result is a list of design matrices (a single element here).
new_X = patsy.build_design_matrices([X.design_info], new_data)

new_X

In [None]:
# Inside a formula '+' means "add a term"; wrapping the expression in I()
# makes it ordinary arithmetic, giving a single column equal to x0 + x1.
y, X = patsy.dmatrices('y ~ I(x0 + x1)', data=data)
X

### Categorical Data and Patsy

In [None]:
# Example frame with a string key (key1), an integer key (key2) and two
# numeric value columns.
data = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
    'key2': [0, 1, 0, 1, 0, 1, 0, 0],
    'v1': [1, 2, 3, 4, 5, 6, 7, 8],
    'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]
})

# A non-numeric term such as key1 is expanded into dummy (indicator) columns.
y, X = patsy.dmatrices('v2 ~ key1', data=data)
X

In [None]:
# Same model as 'v2 ~ key1' but with the intercept removed by '+ 0'.
y, X = patsy.dmatrices('v2 ~ key1 + 0', data=data)
X

In [None]:
# C() tells patsy to treat the numeric key2 column as categorical.
y, X = patsy.dmatrices('v2 ~ C(key2)', data=data)
X

In [None]:
# Replace the numeric key2 codes with string labels.
data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})
data

In [None]:
# Model with two categorical terms (main effects only).
y, X = patsy.dmatrices('v2 ~ key1 + key2', data=data)
X

In [None]:
# 'key1:key2' adds the interaction term between the two categorical keys.
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data=data)
X

## 12.3 Introduction to statsmodels

In [None]:
# Seeded generator so that the simulated data is reproducible.
rng = np.random.default_rng(12345)


def dnorm(mean, variance, size=1):
    """Draw samples from a normal distribution with given mean and variance.

    Parameters
    ----------
    mean : float
        Mean of the distribution.
    variance : float
        Variance of the distribution (its square root is the scale).
    size : int or tuple of int, default 1
        Output shape. An integer ``n`` yields a 1-D array of length ``n``;
        a tuple yields an array of that shape.

    Returns
    -------
    numpy.ndarray
        Samples drawn from the module-level generator ``rng``.
    """
    # Pass `size` as a single argument: Generator.standard_normal takes
    # (size, dtype, out), so unpacking a multi-element tuple with *size
    # would send the second dimension in as `dtype` and raise.
    return mean + np.sqrt(variance) * rng.standard_normal(size)


N = 100
# One column per predictor, drawn in order with variances 0.4, 0.6 and 0.2.
X = np.column_stack([dnorm(0, variance, size=N)
                     for variance in (0.4, 0.6, 0.2)])
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]

# Linear response with additive noise and no intercept term.
y = X.dot(beta) + eps

In [None]:
X[:5]

In [None]:
y[:5]

In [None]:
# add_constant returns a copy of X with an intercept column of ones added.
X_model = sm.add_constant(X)
X_model[:5]

In [None]:
# Fit ordinary least squares of y on the three predictor columns.
# NOTE(review): the model is fit on X, not on the X_model built with
# sm.add_constant, so no intercept is estimated. That matches how y was
# simulated (X times beta plus noise, no intercept) -- confirm it is intended.
model = sm.OLS(y, X)
results = model.fit()

In [None]:
results.params

In [None]:
print(results.summary())

In [None]:
# Package the simulated predictors and response into a single DataFrame so
# the model can be described with a formula.
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2']).assign(y=y)
data.head(5)

In [None]:
# Formula interface: columns are referenced by name from `data`.
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
results.params

In [None]:
results.tvalues

In [None]:
# Predicted values for the first five rows of the frame.
results.predict(data[:5])

### Estimating Time Series Processes

In [None]:
# Simulate an AR(2) series: each value depends on the previous two values
# (coefficients b0 and b1) plus a noise term.
init_x = 4
values = [init_x, init_x]  # two starting values seed the recursion
N = 1000

b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
for shock in noise:
    new_x = values[-1] * b0 + values[-2] * b1 + shock
    values.append(new_x)

In [None]:
# Fit an autoregressive model that allows up to MAXLAGS lags -- more than
# the two lags used to generate `values`.
from statsmodels.tsa.ar_model import AutoReg
MAXLAGS = 5
model = AutoReg(values, lags=MAXLAGS)
results = model.fit()
results.params

## 12.4 Introduction to scikit-learn

In [None]:
# Load the Titanic training and test sets.
# The dataset directory defaults to the original hard-coded location, but can
# be overridden through the TITANIC_DATA_DIR environment variable so that the
# notebook also runs on machines with a different directory layout.
import os
from pathlib import Path

titanic_dir = Path(os.environ.get(
    "TITANIC_DATA_DIR",
    r"F:\books\pydata-book-3rd-edition\datasets\titanic"))
train = pd.read_csv(titanic_dir / "train.csv")
test = pd.read_csv(titanic_dir / "test.csv")
train.head()

In [None]:
# Fill missing ages in both sets with the median age of the TRAINING set,
# so that no statistic is computed from the test data.
impute_value = train['Age'].median()
for frame in (train, test):
    frame['Age'] = frame['Age'].fillna(impute_value)

In [None]:
# Encode 'Sex' as a 0/1 integer indicator column in both sets.
for frame in (train, test):
    frame['IsFemale'] = (frame['Sex'] == 'female').astype(int)

In [None]:
# Pull the feature columns out of both frames as NumPy arrays, plus the
# training labels.
predictors = ['Pclass', 'IsFemale', 'Age']
X_train, X_test = (frame[predictors].to_numpy() for frame in (train, test))
y_train = train['Survived'].to_numpy()

X_train[:5]

In [None]:
y_train[:5]

In [None]:
# Create a logistic regression classifier with default settings
# (it is not fitted here).
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)

In [103]:
# Predict labels for the test features and show the first ten predictions.
y_predict = model.predict(X_test)
y_predict[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [106]:
# Logistic regression with built-in cross-validation over the regularization
# strength C. An integer Cs asks for that many candidate values (5 here);
# max_iter=1000 sets the solver's iteration limit.
from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(Cs=5, max_iter=1000)
model_cv.fit(X_train, y_train)


In [107]:
# Score a classifier with 5-fold cross-validation on the training data.
# Note: `model` is rebound here to a new, unfitted LogisticRegression(C=10),
# replacing the previously assigned `model`.
from sklearn.model_selection import cross_val_score
model = LogisticRegression(C=10)
scores = cross_val_score(model, X_train, y_train, cv=5)
print(scores)

[0.7877095  0.78651685 0.79213483 0.7752809  0.79213483]
