In [12]:
import numpy as np
import pandas as pd
from seaborn import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import pickle

In [13]:
data = load_dataset("tips")

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [15]:
data.drop_duplicates()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [16]:
def convert(x):
  if x == "Female" or x == "No":
    return 0
  if x == "Male" or x == "Yes":
    return 1
  return x

data = data.map(convert)
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,Sun,Dinner,2
1,10.34,1.66,1,0,Sun,Dinner,3
2,21.01,3.50,1,0,Sun,Dinner,3
3,23.68,3.31,1,0,Sun,Dinner,2
4,24.59,3.61,0,0,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,Sat,Dinner,3
240,27.18,2.00,0,1,Sat,Dinner,2
241,22.67,2.00,1,1,Sat,Dinner,2
242,17.82,1.75,1,0,Sat,Dinner,2


In [17]:
data = pd.get_dummies(data, columns=["day", 'time'])

In [18]:
data

Unnamed: 0,total_bill,tip,sex,smoker,size,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,0,0,2,False,False,False,True,False,True
1,10.34,1.66,1,0,3,False,False,False,True,False,True
2,21.01,3.50,1,0,3,False,False,False,True,False,True
3,23.68,3.31,1,0,2,False,False,False,True,False,True
4,24.59,3.61,0,0,4,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,3,False,False,True,False,False,True
240,27.18,2.00,0,1,2,False,False,True,False,False,True
241,22.67,2.00,1,1,2,False,False,True,False,False,True
242,17.82,1.75,1,0,2,False,False,True,False,False,True


In [19]:
y = data["tip"]
x = data.drop("tip", axis=1)

x_test, x_train, y_test, y_train, = train_test_split(x, y, train_size=.1, random_state=5432)

y_test.shape, x_test.shape, y_train.shape, x_train.shape

((24,), (24, 10), (220,), (220, 10))

In [20]:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(x_train[["total_bill"]])
x_train["total_bill"] = min_max_scaler.transform(x_train[["total_bill"]])
x_test["total_bill"] = min_max_scaler.transform(x_test[["total_bill"]])


In [21]:
x_train["total_bill"], x_test["total_bill"]

(74     0.244240
 110    0.228948
 80     0.342899
 160    0.386049
 66     0.280268
          ...   
 43     0.138458
 227    0.364055
 118    0.196062
 92     0.056137
 72     0.498324
 Name: total_bill, Length: 220, dtype: float64,
 32     0.251152
 85     0.665270
 100    0.173439
 125    0.559908
 40     0.271680
 218    0.097822
 126    0.114160
 242    0.308965
 223    0.270423
 166    0.370549
 13     0.321743
 54     0.471093
 69     0.250105
 202    0.208002
 19     0.368245
 97     0.187683
 161    0.200880
 3      0.431713
 63     0.318810
 59     0.946795
 203    0.279221
 89     0.378928
 17     0.276917
 64     0.304147
 Name: total_bill, dtype: float64)

In [22]:
model = LinearRegression()
model.fit(x_train, y_train)
print(model.score(x_test, y_test))
model

0.8310643538324443


In [24]:
fileModel = open('model.pkl','wb')
pickle.dump(model, fileModel)

fileScaler =open('scaler.pkl','wb')
pickle.dump(min_max_scaler, fileScaler)