# Linear Regression

In [22]:
import pandas as pd

# Read the tips dataset from the notebook's working directory.
df = pd.read_csv('tips.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [23]:
from sklearn.model_selection import train_test_split

# Single-feature design matrix (the double brackets keep X two-dimensional)
# and the target column.
X, y = df[['total_bill']], df['tip']

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [24]:
from sklearn import linear_model

# Least-squares fit on the training rows (fit() returns the estimator itself).
lm = linear_model.LinearRegression().fit(X_train, y_train)

# Report the fitted parameters.
print('Coefficients: ', lm.coef_)
print('Intercept: ', lm.intercept_)

Coefficients:  [0.0968534]
Intercept:  1.0285439454607277


In [25]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Predict tips for the held-out rows.
y_pred = lm.predict(X_test)

# First line: mean absolute error. Second line: root mean squared error.
print(
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred) ** 0.5,
    sep='\n',
)


0.6564074900962108
0.9062610353957787


# Normalization (Feature Scaling)

In [44]:
# Two-feature baseline on the raw (unscaled) columns.
X, y = df[['total_bill', 'size']], df['tip']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Fit on the training rows and report the parameters.
lm = linear_model.LinearRegression().fit(X_train, y_train)
print('Coefficients: ', lm.coef_)
print('Intercept: ', lm.intercept_)

# Held-out error: mean absolute error first, then root mean squared error.
y_pred = lm.predict(X_test)
print(
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred) ** 0.5,
    sep='\n',
)

Coefficients:  [0.0840736 0.2115783]
Intercept:  0.738754592796008
0.6424339446924844
0.9035287077964982


In [41]:
from sklearn.preprocessing import MinMaxScaler

# Tiny worked example: each column is rescaled independently so that its
# smallest value maps to 0 and its largest to 1.
X = [[1, 2], [2, 4], [3, 5]]

# Learn the per-column range, then apply it to the same rows.
MinMaxScaler().fit(X).transform(X)

array([[0.        , 0.        ],
       [0.5       , 0.66666667],
       [1.        , 1.        ]])

In [42]:
from sklearn.preprocessing import MinMaxScaler

# Same two features as the unscaled baseline, plus the target.
X = df[['total_bill', 'size']]
y = df['tip']

# Split BEFORE scaling so the held-out rows never influence the scaler
# (fitting the scaler on all rows leaks test-set statistics into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Learn each column's min/max from the training rows only, then apply that
# same mapping to the test rows; test values may land slightly outside [0, 1].
mms = MinMaxScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

In [37]:
from sklearn.preprocessing import StandardScaler

# Same two features as the unscaled baseline, plus the target.
X = df[['total_bill', 'size']]
y = df['tip']

# Split BEFORE scaling so the held-out rows never influence the scaler
# (fitting the scaler on all rows leaks test-set statistics into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# NOTE: `mms` holds a StandardScaler in this cell (the name is kept as-is).
# The mean/std are learned from the training rows only and then reused
# unchanged for the test rows.
# This cell rebinds X_train/X_test, so a later fit uses whichever scaling
# cell was executed most recently.
mms = StandardScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

In [43]:
# Fit on whatever X_train / y_train are currently bound in the namespace
# (this cell does not build them itself).
lm = linear_model.LinearRegression().fit(X_train, y_train)
print('Coefficients: ', lm.coef_)
print('Intercept: ', lm.intercept_)

# Held-out error: mean absolute error first, then root mean squared error.
y_pred = lm.predict(X_test)
print(
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred) ** 0.5,
    sep='\n',
)

Coefficients:  [4.01367359 1.05789148]
Intercept:  1.2084388350190454
0.6424339446924846
0.9035287077964983


# One Hot Encoding

In [8]:
# One indicator column per distinct value of `sex`.
pd.get_dummies(df.sex)

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0
5,0,1
6,0,1
7,0,1
8,0,1
9,0,1


In [9]:
# Build the indicator columns explicitly instead of reading them from `_`
# (IPython's "last output" variable), so this cell gives the same result
# regardless of which cell was executed just before it.
df1 = pd.concat([df, pd.get_dummies(df['sex'])], axis=1)
df1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,Female,Male
0,16.99,1.01,Female,No,Sun,Dinner,2,1,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,1
2,21.01,3.5,Male,No,Sun,Dinner,3,0,1
3,23.68,3.31,Male,No,Sun,Dinner,2,0,1
4,24.59,3.61,Female,No,Sun,Dinner,4,1,0


In [10]:
# Features: the bill plus both sex indicator columns; target: the tip.
X, y = df1[['total_bill', 'Male', 'Female']], df1['tip']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Fit on the training rows (fit() returns the estimator itself).
lm = linear_model.LinearRegression().fit(X_train, y_train)

# Report the fitted parameters, one coefficient per feature column.
print('Coefficients: ', lm.coef_)
print('Intercept: ', lm.intercept_)

Coefficients:  [ 0.0969831 -0.013015   0.013015 ]
Intercept:  1.0305293508731472


In [11]:
# Predict tips for the held-out rows.
y_pred = lm.predict(X_test)

# First line: mean absolute error. Second line: root mean squared error.
print(
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred) ** 0.5,
    sep='\n',
)

0.6574086675031693
0.9061705275026107


In [12]:
# One indicator column per distinct value of `time`.
pd.get_dummies(df.time)

Unnamed: 0,Dinner,Lunch
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,1,0
7,1,0
8,1,0
9,1,0


In [13]:
# Build the indicator columns explicitly instead of reading them from `_`
# (IPython's "last output" variable), so this cell gives the same result
# regardless of which cell was executed just before it.
df1 = pd.concat([df, pd.get_dummies(df['time'])], axis=1)
df1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,Dinner,Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,1,0
1,10.34,1.66,Male,No,Sun,Dinner,3,1,0
2,21.01,3.5,Male,No,Sun,Dinner,3,1,0
3,23.68,3.31,Male,No,Sun,Dinner,2,1,0
4,24.59,3.61,Female,No,Sun,Dinner,4,1,0


In [14]:
# Features: the bill plus both meal-time indicator columns; target: the tip.
X, y = df1[['total_bill', 'Dinner', 'Lunch']], df1['tip']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Fit on the training rows (fit() returns the estimator itself).
lm = linear_model.LinearRegression().fit(X_train, y_train)

# Report the fitted parameters, one coefficient per feature column.
print('Coefficients: ', lm.coef_)
print('Intercept: ', lm.intercept_)

Coefficients:  [ 0.09772534 -0.05797379  0.05797379]
Intercept:  1.036796564556316


In [15]:
# Predict tips for the held-out rows.
y_pred = lm.predict(X_test)

# First line: mean absolute error. Second line: root mean squared error.
print(
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred) ** 0.5,
    sep='\n',
)

0.6736885206900818
0.9190476232264408


# Polynomial Regression

In [16]:
# Indicator columns for each categorical field used below.
# dtype=int keeps them numeric: from pandas 2.0 get_dummies returns bool
# columns by default, and select_dtypes(include='number') further down
# would then drop every indicator column without any warning.
d_sex = pd.get_dummies(df.sex, dtype=int)
d_smoker = pd.get_dummies(df.smoker, dtype=int)
d_time = pd.get_dummies(df.time, dtype=int)

# Append the indicator columns to the original frame, side by side.
df1 = pd.concat([df, d_sex, d_smoker, d_time], axis=1)

# Keep only numeric columns, which discards the original string-valued fields.
df1 = df1.select_dtypes(include='number')
df1.head()

Unnamed: 0,total_bill,tip,size,Female,Male,No,Yes,Dinner,Lunch
0,16.99,1.01,2,1,0,1,0,1,0
1,10.34,1.66,3,0,1,1,0,1,0
2,21.01,3.5,3,0,1,1,0,1,0
3,23.68,3.31,2,0,1,1,0,1,0
4,24.59,3.61,4,1,0,1,0,1,0


In [20]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

# Target column.
y = df1.tip

# Every column except the target is a feature. Index.difference normally
# returns the labels sorted, so the feature order follows that sorted order
# rather than df1's own column order.
# The features are min-max scaled and then expanded into all polynomial
# terms up to degree 5.
# NOTE(review): the number of generated columns grows quickly with the degree
# and the feature count; confirm the size with X.shape before fitting.
X = PolynomialFeatures(degree=5).fit_transform(
    MinMaxScaler().fit_transform(df1[df1.columns.difference(['tip'])])
)

# Preview the first five expanded rows.
X[:5]

array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        9.91586152e-04, 1.44563042e-03, 2.10758017e-03],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        5.65035808e-04, 2.15113653e-04, 8.18954891e-05],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        8.49063272e-03, 7.97664175e-03, 7.49376587e-03],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        3.21844967e-03, 6.94724003e-03, 1.49960847e-02],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        3.29747915e-02, 2.47736878e-02, 1.86122665e-02]])

In [18]:
# Hold out 20% of the expanded rows, using the same seed as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Fit on the training rows and predict the held-out rows.
lm = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# First line: mean absolute error. Second line: root mean squared error.
# NOTE(review): the errors recorded for this cell are orders of magnitude
# larger than those of the earlier models; this looks like severe overfitting
# from the polynomial expansion. Confirm, and consider a lower degree.
print(
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred) ** 0.5,
    sep='\n',
)

279479537.9735072
1036969557.2491044
