In [9]:
import seaborn as sns
import pandas as pd
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model

In [10]:
# Load the restaurant "tips" example dataset that ships with seaborn.
data = sns.load_dataset("tips")

# Predictors (bill amount and party size) and the response (tip amount).
two_features = data.loc[:, ["total_bill", "size"]]
tip = data.loc[:, "tip"]

# Linear model without an intercept term: tip ~ a * total_bill + b * size.
# The fit call stays the last expression so the estimator repr is displayed.
f2 = linear_model.LinearRegression(fit_intercept=False)
f2.fit(X=two_features, y=tip)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [11]:
# Fitted weights, in the column order of two_features: [total_bill, size].
f2.coef_

array([0.1007119 , 0.36209717])

In [12]:
# Predict the tip for a total bill of 10 and a party of 3.
# The query row is wrapped in a DataFrame that carries the training column
# names: f2 was fitted on a DataFrame, so scikit-learn >= 1.0 warns about
# missing feature names when predict() is handed a bare list, and a bare list
# also hides which number is the bill and which is the party size.
f2.predict(pd.DataFrame([[10, 3]], columns=two_features.columns))

array([2.09341054])

In [13]:
from mpl_toolkits.mplot3d import Axes3D
%matplotlib widget
fig=plt.figure()
ax=fig.add_subplot(111, projection='3d')
ax.scatter(data['total_bill'], data['size'], data['tip'])
plt.xlabel('total bill')
plt.ylabel('size')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'size')

In [14]:
# 3D scatter of the data with the fitted two-feature plane drawn through it.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data["total_bill"], data["size"], data["tip"])

# Grid of (total_bill, size) points over which the plane is evaluated.
xx, yy = np.meshgrid(range(50), range(6))

# Read the weights from the fitted model instead of retyping them by hand
# (previously 0.1007119 and a differently rounded 0.3621), so the surface
# always matches f2 exactly, even if the model is refit.
bill_weight, size_weight = f2.coef_
zz = bill_weight * xx + size_weight * yy
ax.plot_surface(xx, yy, zz, alpha=0.2)

# Label all three axes on the Axes object; the z (tip) axis was unlabeled.
ax.set_xlabel('total bill')
ax.set_zlabel('tip')
# Kept as the last expression so the cell still displays the y-label Text.
ax.set_ylabel('size')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'size')

### Computing the MSE for two vs. one feature

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Mean squared error of the two-feature model, measured on the same rows it
# was fitted on (true tips vs. the model's predictions).
mean_squared_error(y_true=data['tip'], y_pred=f2.predict(two_features))

1.06482122862577

What this tells us: Our model has an MSE of 1.06.

Let's recompute the mean squared error for a model that uses the total bill as its only feature.

In [16]:
# Baseline for comparison: the same no-intercept linear model, but fitted on
# total_bill alone. Double brackets keep one_feature a one-column DataFrame.
one_feature = data[["total_bill"]]
tip = data["tip"]

f1 = linear_model.LinearRegression(fit_intercept=False)
f1.fit(one_feature, tip)

# Predict on the same DataFrame the model was fitted on. The earlier
# `.values.reshape(-1, 1)` was a no-op on an already (n, 1)-shaped frame and
# only stripped the column name, which makes scikit-learn >= 1.0 warn about
# missing feature names. The resulting MSE is unchanged.
mean_squared_error(data['tip'], f1.predict(one_feature))

1.1781161154513171

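As we can see, the MSE of the one-feature model (about 1.18) is worse than that of the two-feature model (about 1.06), so the model that also uses the party size fits this data better. Note: Both errors are measured on the same data the models were fitted on, and we haven't yet talked about a very important idea known as "overfitting". We will discuss it in a future lecture.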

### Using Non-numeric Features

Suppose we also want to use information about the day of the week on which a table was served. To do this, we need to somehow convert the day into a numeric feature.

In [17]:
# A few rows picked by position so that several different days are visible.
sample_rows = [193, 90, 25, 26, 190]
data.iloc[sample_rows]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
193,15.48,2.02,Male,Yes,Thur,Lunch,2
90,28.97,3.0,Male,Yes,Fri,Dinner,2
25,17.81,2.34,Male,No,Sat,Dinner,4
26,13.37,2.0,Male,No,Sat,Dinner,2
190,15.69,1.5,Male,Yes,Sun,Dinner,2


We can achieve our goal by using the `pd.get_dummies` function, which creates one 0/1 indicator column for each distinct day (a "one-hot" encoding).

In [18]:
# Restrict the data to the three columns used by the next model.
three_features = ['total_bill', 'size', 'day']
three_features_data = data.loc[:, three_features]

# One indicator column per distinct value of `day`; preview the same sample
# rows that were shown from the raw data.
dummies = pd.get_dummies(three_features_data.loc[:, 'day'])
dummies.iloc[[193, 90, 25, 26, 190]]

Unnamed: 0,Thur,Fri,Sat,Sun
193,1,0,0,0
90,0,1,0,0
25,0,0,1,0
26,0,0,1,0
190,0,0,0,1


In [19]:
# Rebuild the three-column frame and its day indicators in this cell.
three_features = ['total_bill', 'size', 'day']
three_feature_data = data.loc[:, three_features]
dummies = pd.get_dummies(three_feature_data['day'])

# Place the indicator columns to the right of the original columns
# (rows are aligned on the shared index), then preview the sample rows.
data_w_dummies = pd.concat((three_feature_data, dummies), axis='columns')
data_w_dummies.iloc[[193, 90, 25, 26, 190]]

Unnamed: 0,total_bill,size,day,Thur,Fri,Sat,Sun
193,15.48,2,Thur,1,0,0,0
90,28.97,2,Fri,0,1,0,0
25,17.81,4,Sat,0,0,1,0
26,13.37,2,Sat,0,0,1,0
190,15.69,2,Sun,0,0,0,1


In [20]:
# Remove the original text-valued `day` column now that it is represented by
# the indicator columns. Because this cell overwrites data_w_dummies with its
# own result, a plain drop raises KeyError when the cell is run a second time;
# errors='ignore' makes the cell safe to re-run.
data_w_dummies = data_w_dummies.drop(columns=['day'], errors='ignore')
data_w_dummies.head(5)

Unnamed: 0,total_bill,size,Thur,Fri,Sat,Sun
0,16.99,2,0,0,0,1
1,10.34,3,0,0,0,1
2,21.01,3,0,0,0,1
3,23.68,2,0,0,0,1
4,24.59,4,0,0,0,1


In [21]:
# Every remaining column is numeric, so the frame can be used as the design
# matrix. There is still no intercept term; since each row has exactly one
# day indicator set, each day's weight acts as a per-day offset.
f_with_day = linear_model.LinearRegression(fit_intercept=False)
f_with_day.fit(X=data_w_dummies, y=tip)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [22]:
# Fitted weights, in the column order of data_w_dummies:
# [total_bill, size, Thur, Fri, Sat, Sun].
f_with_day.coef_

array([0.09299361, 0.18713231, 0.66829361, 0.74578683, 0.62112858,
       0.73228865])

In [23]:
## Prediction for a party of 3, with a $50 total bill, eating on a Thursday
# Values follow the data_w_dummies column order:
# total_bill, size, Thur, Fri, Sat, Sun.
# The row is wrapped in a DataFrame with the training column names so the
# positions are tied to named features, and so scikit-learn >= 1.0 does not
# warn about missing feature names (the model was fitted on a DataFrame).
f_with_day.predict(
    pd.DataFrame([[50, 3, 1, 0, 0, 0]], columns=data_w_dummies.columns)
)

array([5.87937107])

In [24]:
## Prediction for a party of 3 with a $50 total bill, eating on a Saturday
# Values follow the data_w_dummies column order:
# total_bill, size, Thur, Fri, Sat, Sun.
# The row is wrapped in a DataFrame with the training column names so the
# positions are tied to named features, and so scikit-learn >= 1.0 does not
# warn about missing feature names (the model was fitted on a DataFrame).
f_with_day.predict(
    pd.DataFrame([[50, 3, 0, 0, 1, 0]], columns=data_w_dummies.columns)
)

array([5.83220605])

In [25]:
## MSE of the model that also uses the day indicators, measured on the
## same rows it was fitted on.
mean_squared_error(
    y_true=data['tip'],
    y_pred=f_with_day.predict(data_w_dummies),
)

1.0121298853078435

Above, we see that the MSE (about 1.01) is even lower than that of the two-feature model (about 1.06)!

### Visualizing Non-numeric Features with Seaborn

In [26]:
# Tip against total bill, with points grouped (via hue) by the non-numeric
# `smoker` column.
sns.lmplot(data=data, x="total_bill", y="tip", hue="smoker")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<seaborn.axisgrid.FacetGrid at 0x1b061f1c348>

See the seaborn tutorial for more: https://seaborn.pydata.org/generated/seaborn.lmplot.html