In [4]:
import numpy as np
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import great_tables as tg
import altair as alt

In [7]:
# Location of the startups CSV, relative to the notebook's working directory.
dataset_path = '50_Startups.csv'

In [8]:
# Read the CSV into a polars DataFrame; column dtypes are inferred by polars.
dataset = pl.read_csv(source=dataset_path)

In [9]:
# Show the loaded frame (polars truncates the middle rows in its rich repr).
dataset

R&D Spend,Administration,Marketing Spend,State,Profit
f64,f64,f64,str,f64
165349.2,136897.8,471784.1,"""New York""",192261.83
162597.7,151377.59,443898.53,"""California""",191792.06
153441.51,101145.55,407934.54,"""Florida""",191050.39
144372.41,118671.85,383199.62,"""New York""",182901.99
142107.34,91391.77,366168.42,"""Florida""",166187.94
…,…,…,…,…
1000.23,124153.04,1903.93,"""New York""",64926.08
1315.46,115816.21,297114.46,"""Florida""",49490.75
0.0,135426.92,0.0,"""California""",42559.73
542.05,51743.15,0.0,"""New York""",35673.41


In [14]:
# Feature matrix: every column except the 'Profit' target, in original order.
X = dataset.drop('Profit')

In [15]:
# Target vector: the 'Profit' column as a polars Series.
y = dataset.get_column('Profit')

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [26]:
# One-hot encode the categorical 'State' column into dense indicator columns;
# all remaining (numeric) columns are passed through unchanged.
# Categories not seen during fit are encoded as all zeros rather than raising.
column_transformer = ColumnTransformer(
    [
        (
            'encoder',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['State'],
        ),
    ],
    remainder='passthrough',
)

In [27]:
# Learn the 'State' categories and produce the encoded feature matrix in one step.
# NOTE(review): the encoder is fitted on all rows, before the train/test split
# further down — consider fitting on the training rows only.
X_encoded = column_transformer.fit_transform(X=X)

In [32]:
# Output column names of the fitted transformer, prefixed with the step name
# (e.g. 'encoder__State_California', 'remainder__R&D Spend').
feature_names = column_transformer.get_feature_names_out()

In [36]:
# Wrap the encoded matrix back into a polars DataFrame so the columns carry
# the transformer's feature names. Rebinds `X_encoded` from matrix to frame.
X_encoded = pl.DataFrame(X_encoded, schema=feature_names.tolist())

In [37]:
# Inspect the encoded features: three State indicator columns followed by
# the passed-through numeric columns.
X_encoded

encoder__State_California,encoder__State_Florida,encoder__State_New York,remainder__R&D Spend,remainder__Administration,remainder__Marketing Spend
f64,f64,f64,f64,f64,f64
0.0,0.0,1.0,165349.2,136897.8,471784.1
1.0,0.0,0.0,162597.7,151377.59,443898.53
0.0,1.0,0.0,153441.51,101145.55,407934.54
0.0,0.0,1.0,144372.41,118671.85,383199.62
0.0,1.0,0.0,142107.34,91391.77,366168.42
…,…,…,…,…,…
0.0,0.0,1.0,1000.23,124153.04,1903.93
0.0,1.0,0.0,1315.46,115816.21,297114.46
1.0,0.0,0.0,0.0,135426.92,0.0
0.0,0.0,1.0,542.05,51743.15,0.0


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
from sklearn.linear_model import LinearRegression

In [45]:
# Ordinary least-squares regressor with an intercept term.
# NOTE(review): per the scikit-learn docs, n_jobs only helps for multi-target
# or sparse problems, so it likely has no effect on this dataset — confirm.
linear_model = LinearRegression(fit_intercept=True, n_jobs=-1)

In [46]:
# Fit the regression on the training split only.
linear_model.fit(X=X_train, y=y_train)

In [47]:
# Predict profit for the held-out rows.
predictions = linear_model.predict(X=X_test)

In [66]:
# Side-by-side comparison on the test split; Error is the residual
# (actual minus predicted), so positive values mean the model under-predicted.
pl.DataFrame(
    {'Actual': y_test, 'Predicted': predictions}
).with_columns(
    Error=pl.col('Actual') - pl.col('Predicted')
)

Actual,Predicted,Error
f64,f64,f64
134307.35,126362.879083,7944.470917
81005.76,84608.453836,-3602.693836
99937.59,99677.494252,260.095748
64926.08,46357.460686,18568.619314
125370.37,128750.482885,-3380.112885
35673.41,50912.417419,-15239.007419
105733.54,109741.350327,-4007.810327
107404.34,100643.242816,6761.097184
97427.84,97599.275746,-171.435746
122776.86,113097.425244,9679.434756


In [57]:
# Show the raw dataset again for reference; none of the visible cells
# reassign `dataset`, so this matches the frame displayed after loading.
dataset

R&D Spend,Administration,Marketing Spend,State,Profit
f64,f64,f64,str,f64
165349.2,136897.8,471784.1,"""New York""",192261.83
162597.7,151377.59,443898.53,"""California""",191792.06
153441.51,101145.55,407934.54,"""Florida""",191050.39
144372.41,118671.85,383199.62,"""New York""",182901.99
142107.34,91391.77,366168.42,"""Florida""",166187.94
…,…,…,…,…
1000.23,124153.04,1903.93,"""New York""",64926.08
1315.46,115816.21,297114.46,"""Florida""",49490.75
0.0,135426.92,0.0,"""California""",42559.73
542.05,51743.15,0.0,"""New York""",35673.41
