# Linear Regression

In [1]:
import pandas as pd

# Read the tips dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='tips.csv')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
from sklearn.model_selection import train_test_split

# Single-feature design matrix (kept 2-D by selecting with a list) and the target column.
X = df.loc[:, ['total_bill']]
y = df.loc[:, 'tip']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [3]:
from sklearn import linear_model

# Ordinary least squares on the training split (fit() returns the estimator itself).
lm = linear_model.LinearRegression().fit(X_train, y_train)

# Coefficient of determination R^2 (best is 1.0), reported as (train, test).
tuple(
    lm.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.4150096506457663, 0.5906895098589039)

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Predict tips for the held-out rows.
y_pred = lm.predict(X_test)

# Mean absolute error, and root mean squared error (square root of the MSE).
(
    'MAE', mean_absolute_error(y_test, y_pred),
    'RMSE', mean_squared_error(y_test, y_pred) ** 0.5,
)

('MAE', 0.6564074900962108, 'RMSE', 0.9062610353957787)

In [5]:
# Fitted slope(s) and intercept of the linear model.
(
    'Coefficients', lm.coef_,
    'Intercept', lm.intercept_,
)

('Coefficients', array([0.0968534]), 'Intercept', 1.0285439454607277)

# Polynomial

Transform $x$ into the polynomial features $x^0$, $x^1$, $x^2$, $x^3$ (degree 3).

In [6]:
from sklearn.preprocessing import PolynomialFeatures

# Expand total_bill into the degree-3 polynomial terms x^0, x^1, x^2, x^3.
# The expansion is derived from the original DataFrame column rather than from
# the current value of `X`, so this cell is idempotent: re-running it (or running
# it after another cell has reassigned `X`) never expands already-expanded features.
X = PolynomialFeatures(3).fit_transform(df[['total_bill']])

# Same 80/20 split and seed as the plain linear model, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

Fit the same `LinearRegression` model, now on the polynomial features.

In [7]:
# Refit ordinary least squares on the current training split (fit() returns the estimator).
lm = linear_model.LinearRegression().fit(X_train, y_train)

# R^2 reported as (train, test).
tuple(
    lm.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.4249071570176152, 0.629047965264137)

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Predict tips for the held-out rows with the current model.
y_pred = lm.predict(X_test)

# Mean absolute error, and root mean squared error (square root of the MSE).
(
    'MAE', mean_absolute_error(y_test, y_pred),
    'RMSE', mean_squared_error(y_test, y_pred) ** 0.5,
)

('MAE', 0.6242949938156378, 'RMSE', 0.8627515518279183)

# One Hot Encoding

In [9]:
# One indicator column per category of `sex`.
pd.get_dummies(df.loc[:, 'sex'])

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0
5,0,1
6,0,1
7,0,1
8,0,1
9,0,1


In [10]:
# Build the indicator columns explicitly instead of reading IPython's `_`
# (the most recent cell output): `_` only holds the dummies when the
# get_dummies cell happened to be the last one executed, so relying on it
# makes this cell depend on hidden execution order.
sex_dummies = pd.get_dummies(df['sex'])

# Append the indicator columns to the original frame, aligned on the row index.
df1 = pd.concat([df, sex_dummies], axis=1)
df1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,Female,Male
0,16.99,1.01,Female,No,Sun,Dinner,2,1,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,1
2,21.01,3.5,Male,No,Sun,Dinner,3,0,1
3,23.68,3.31,Male,No,Sun,Dinner,2,0,1
4,24.59,3.61,Female,No,Sun,Dinner,4,1,0


In [11]:
# 'Male' and 'Female' always sum to 1, so together with the model's intercept
# they are perfectly collinear (the dummy-variable trap) and their individual
# coefficients are not identifiable. Keep a single indicator; 'Female' becomes
# the baseline category absorbed by the intercept.
X = df1[['total_bill', 'Male']]
y = df1['tip']

# Same 80/20 split and seed as the other models, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)

# R^2 reported as (train, test).
(lm.score(X_train, y_train), lm.score(X_test, y_test))

(0.41508770106168047, 0.5907712610956712)

In [12]:
# Predict tips for the held-out rows with the current model.
y_pred = lm.predict(X_test)

# Mean absolute error, and root mean squared error (square root of the MSE).
(
    'MAE', mean_absolute_error(y_test, y_pred),
    'RMSE', mean_squared_error(y_test, y_pred) ** 0.5,
)

('MAE', 0.6574086675031693, 'RMSE', 0.9061705275026107)

# Conclusion

$R^2$ score as (train, test); higher is better
- (0.4150, 0.5907) linear
- (0.4249, 0.6290) poly
- (0.4151, 0.5908) onehot

Error on the test split; lower is better
- ('MAE', 0.6564, 'RMSE', 0.9063) linear
- ('MAE', 0.6243, 'RMSE', 0.8628) poly
- ('MAE', 0.6574, 'RMSE', 0.9062) onehot