## Alternatively

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the credit dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='credit.csv')
df.head(n=5)

In [None]:
# 'Balance' is the response; every other column is a candidate predictor.
y = df['Balance']
x = df.drop(columns='Balance')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Collect the names of the numeric predictor columns.
# Scanning `x` (the frame with 'Balance' already dropped) rather than slicing
# `df.columns[:-1]` means the result does not depend on the response being the
# last column of the CSV.
num = [col for col in x.columns if pd.api.types.is_numeric_dtype(x[col])]
num

In [None]:
### edTest(test_model1) ###
# Baseline: ordinary least squares on the numeric predictors only.
numeric_features = num
model1 = LinearRegression()
model1.fit(x_train[numeric_features], y_train)

# R^2 on the rows the model was fit to (train) and on the held-out rows (test).
train_score = model1.score(x_train[numeric_features], y_train)
test_score = model1.score(x_test[numeric_features], y_test)
print('Train R2:', train_score)
print('Test R2:', test_score)

In [None]:
### edTest(test_design) ###
# Create x train and test design matrices, replacing each categorical (object
# dtype) column with dummy variables.
#
# - Each encoder is fit on the training split only and then applied to the test
#   split, so both design matrices get the same dummy columns in the same order.
# - New frames are built instead of editing x_train / x_test in place, so this
#   cell can be re-run and the original splits stay intact.
# - drop='first' omits the first level of every categorical; the remaining
#   dummy columns are named '<column>_<level>'.
cat_cols = [col for col in x_train.columns
            if pd.api.types.is_object_dtype(x_train[col])]

# Start from the untouched non-categorical columns, then append the dummies.
train_parts = [x_train.drop(columns=cat_cols)]
test_parts = [x_test.drop(columns=cat_cols)]
for col in cat_cols:
    enc = OneHotEncoder(drop='first').fit(x_train[[col]].values)
    # categories_[0] holds the levels seen in training; index 0 is the dropped one.
    dummy_names = [f'{col}_{level}' for level in enc.categories_[0][1:]]
    train_parts.append(pd.DataFrame(
        enc.transform(x_train[[col]].values).toarray(),
        index=x_train.index,
        columns=dummy_names,
    ))
    test_parts.append(pd.DataFrame(
        enc.transform(x_test[[col]].values).toarray(),
        index=x_test.index,
        columns=dummy_names,
    ))

x_train_design = pd.concat(train_parts, axis=1)
x_test_design = pd.concat(test_parts, axis=1)
x_train_design.head()

In [None]:
# Confirm that all data types are now numeric.
# After dummy encoding, no column of the training design matrix should still be
# listed with an object dtype.
x_train_design.dtypes

In [None]:
### edTest(test_model2) ###
# Ordinary least squares on the full design matrix (numeric + dummy columns).
model2 = LinearRegression()
model2.fit(x_train_design, y_train)

# R^2 on the rows the model was fit to (train) and on the held-out rows (test).
train_score = model2.score(x_train_design, y_train)
test_score = model2.score(x_test_design, y_test)
print('Train R2:', train_score)
print('Test R2:', test_score)

In [None]:
# Tabulate the fitted slopes, one row per design-matrix column.
# The intercept is stored separately in .intercept_, so it does not appear in .coef_.
coefs = pd.DataFrame(
    {'beta_value': model2.coef_},
    index=x_train_design.columns,
)
coefs

In [None]:
# Horizontal bar chart of the coefficients as a rough gauge of feature importance.
ax = sns.barplot(data=coefs.T, orient='h')
ax.set(title='Model Coefficients');

Fit a model to predict `Balance` from two predictors: `Income` and `Student`.

In [None]:
### edTest(test_model3) ###
# Specify best categorical feature.
# If 'Student' was dummy-encoded, the design matrix no longer has a column named
# 'Student'; it has 'Student_<level>' instead, and indexing with the raw name
# would raise a KeyError. Look the column up so either form works.
student_cols = [col for col in x_train_design.columns
                if col == 'Student' or col.startswith('Student_')]
if not student_cols:
    raise KeyError("no 'Student' column found in x_train_design")
best_cat_feature = student_cols[0]

# Define the model.
features = ['Income', best_cat_feature]
model3 = LinearRegression()
model3.fit(x_train_design[features], y_train)

# Collect betas from fitted model.
# .coef_ follows the column order of `features`; the intercept is kept separately.
beta0 = model3.intercept_
beta1 = model3.coef_[features.index('Income')]
beta2 = model3.coef_[features.index(best_cat_feature)]

# Display betas in a DataFrame.
coefs = pd.DataFrame([beta0, beta1, beta2], index=['Intercept']+features, columns=['beta_value'])
coefs

In [None]:
# Horizontal bar chart of the coefficients as a rough gauge of feature importance.
ax = sns.barplot(data=coefs.T, orient='h')
ax.set(title='Model Coefficients');