In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import statsmodels.formula.api as smf

#### Example dataframe

In [9]:
# Toy data: a three-level categorical column ("cat") plus two independent
# standard-normal numeric columns ("x" and "y").
# The draws come from a seeded generator so the frame -- and everything
# fitted from it later in the notebook -- is identical after
# Restart Kernel -> Run All.
SEED = 0
rng = np.random.default_rng(SEED)

df = pd.DataFrame(
    {
        "cat": ["a", "b", "a", "c", "b", "c", "a", "c", "b", "b"],
        "x": rng.normal(size=10),  # x is drawn before y; keep this order
        "y": rng.normal(size=10),
    }
)
df

Unnamed: 0,cat,x,y
0,a,0.80451,0.992378
1,b,-2.755097,-1.441087
2,a,1.913863,0.51515
3,c,-0.369528,0.793434
4,b,-0.168803,1.338463
5,c,-0.395155,0.318126
6,a,0.012188,0.49536
7,c,1.359083,0.090747
8,b,-0.456275,0.300795
9,b,-0.003572,0.182514


#### Method 1: pandas `get_dummies`

In [10]:
# Append one indicator column per level of "cat" to the original frame.
cat_dummies = pd.get_dummies(df["cat"])
d2 = df.join(cat_dummies)

# All three indicator columns are used as regressors, so the model is
# fitted without a separate intercept term.
model = LinearRegression(fit_intercept=False)
model.fit(X=d2[["a", "b", "c", "x"]], y=d2["y"])

# Coefficients follow the column order passed to fit: a, b, c, x.
model.coef_

array([0.32035272, 0.41793367, 0.32517241, 0.38154429])

#### Method 2: scikit-learn `OneHotEncoder`

In [16]:
# Regression without a separate intercept, fed by the encoder below.
model = LinearRegression(fit_intercept=False)

# One-hot encode "cat" inside the pipeline; every other input column
# (here only "x") is passed through unchanged.
transform = make_column_transformer(
    (OneHotEncoder(), ["cat"]), remainder="passthrough"
)
pipe = make_pipeline(transform, model)

pipe.fit(df[["cat", "x"]], df["y"])

# `pipe` fits this same `model` object, so its coefficients are read directly.
model.coef_

array([0.32035272, 0.41793367, 0.32517241, 0.38154429])

## `get_dummies` with polynomial features

In [12]:
# Expand the dummy-coded columns of d2 into degree-2 polynomial terms,
# then regress y on the expanded features without a separate intercept.
poly = PolynomialFeatures(degree=2)
model = LinearRegression(fit_intercept=False)
pipe = make_pipeline(poly, model)

pipe.fit(d2[["a", "b", "c", "x"]], d2["y"])

# One coefficient per generated polynomial feature.
model.coef_

array([ 0.35126888,  0.08845413,  0.10029103,  0.16252372,  0.13321931,
        0.08845413,  0.        ,  0.        ,  0.69791192,  0.10029103,
        0.        , -0.58679957,  0.16252372,  0.02210696, -0.42946344])

## `OneHotEncoder` with polynomial features

In [19]:
# Same pipeline as the OneHotEncoder method, with a degree-2 polynomial
# expansion inserted between the encoding step and the regression.
poly = PolynomialFeatures(degree=2)
model = LinearRegression(fit_intercept=False)

# One-hot encode "cat"; the remaining input column ("x") passes through.
transform = make_column_transformer(
    (OneHotEncoder(), ["cat"]), remainder="passthrough"
)

pipe = make_pipeline(transform, poly, model)
pipe.fit(df[["cat", "x"]], df["y"])

# One coefficient per generated polynomial feature.
model.coef_

array([ 0.35126888,  0.08845413,  0.10029103,  0.16252372,  0.13321931,
        0.08845413,  0.        ,  0.        ,  0.69791192,  0.10029103,
        0.        , -0.58679957,  0.16252372,  0.02210696, -0.42946344])