In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load seaborn's bundled "tips" example dataset and preview the first rows.
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# One-hot encode the two-level categorical columns into indicator columns.
onehot_cols = ['sex', 'smoker', 'time']
tips = pd.get_dummies(df[onehot_cols])
tips.head(n=5)

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner
0,0,1,0,1,0,1
1,1,0,0,1,0,1
2,1,0,0,1,0,1
3,1,0,0,1,0,1
4,0,1,0,1,0,1


In [None]:
pip install --upgrade category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


In [None]:
from category_encoders import BinaryEncoder

In [None]:
# Binary-encode the four-level 'day' column into a small set of 0/1 columns.
day_col = 'day'
binary_encoder = BinaryEncoder(cols=[day_col])
df_enc = binary_encoder.fit_transform(df[day_col])

In [None]:
# List the distinct day labels that the encoder had to represent.
df['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [None]:
# Display the binary-encoded 'day' columns produced by the encoder.
df_enc

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


In [None]:
# Prepend the binary-encoded day columns to the one-hot frame.
# `tips` is rebuilt from itself here, so any day_* columns it already carries
# are dropped first; otherwise re-running this cell would append a second copy.
tips = pd.concat(
    [df_enc, tips.drop(columns=df_enc.columns, errors='ignore')],
    axis=1,
)
tips.head()

Unnamed: 0,day_0,day_1,day_2,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner
0,0,0,1,0,1,0,1,0,1
1,0,0,1,1,0,0,1,0,1
2,0,0,1,1,0,0,1,0,1
3,0,0,1,1,0,0,1,0,1
4,0,0,1,0,1,0,1,0,1


In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
# Scaler instance with default settings; it is fit on total_bill in the next cell.
robust = RobustScaler()

In [None]:
# Scale total_bill with the RobustScaler and store it next to the encoded features.
# NOTE(review): the scaler is fit on every row of df, before the later
# train_test_split call — confirm that is acceptable for this exercise.
scaled_total_bill = robust.fit_transform(df[['total_bill']])
tips['total_bill'] = scaled_total_bill

In [None]:
# Re-inspect the raw frame; the scaled values were written to `tips`, not `df`.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
# Copy the regression target over unchanged (it is not scaled).
tips['tip'] = df.loc[:, 'tip']
tips.head(n=5)

Unnamed: 0,day_0,day_1,day_2,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner,total_bill,tip
0,0,0,1,0,1,0,1,0,1,-0.074675,1.01
1,0,0,1,1,0,0,1,0,1,-0.691558,1.66
2,0,0,1,1,0,0,1,0,1,0.298237,3.5
3,0,0,1,1,0,0,1,0,1,0.545918,3.31
4,0,0,1,0,1,0,1,0,1,0.630334,3.61


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Separate the target column from the feature matrix.
target_col = 'tip'
y = tips[target_col]
X = tips.drop(columns=[target_col])

In [None]:
# Hold out 30% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Fit an ordinary linear regression on the training rows, then predict the
# held-out rows. `fit` returns the estimator itself, so the call can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Mean squared error of the predictions on the held-out rows.
mean_squared_error(y_true=y_test, y_pred=y_pred)

1.0701384159269316

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Toy frame for the SimpleImputer examples: x1/x2 are numeric with one NaN
# each, x3 is numeric and complete, x4-x6 hold string labels with NaNs.
df = pd.DataFrame(dict(
    x1=[4, 5, np.nan, 6, 7, 9],
    x2=[3, 5, 6, 5, np.nan, 5],
    x3=[10, 11, 12, 9, 8, 11],
    x4=['A', 'A', 'C', 'C', 'D', np.nan],
    x5=['X', 'Y', 'X', 'X', np.nan, 'Y'],
    x6=['M', 'M', np.nan, 'M', 'N', np.nan],
))

In [None]:
# Display the whole six-row frame, including the NaN entries to be imputed.
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10,A,X,M
1,5.0,5.0,11,A,Y,M
2,,6.0,12,C,X,
3,6.0,5.0,9,C,X,M
4,7.0,,8,D,,N
5,9.0,5.0,11,,Y,


In [None]:
# Fill the gaps in the numeric columns with each column's median.
# NOTE(review): the saved output shows 6.2 (x1) and 4.8 (x2), which are the
# column means; the medians are 6.0 and 5.0. The output looks stale —
# re-run the cell, or confirm which strategy was intended.
numeric_cols = ['x1', 'x2', 'x3']
imp = SimpleImputer(strategy='median')
df[numeric_cols] = imp.fit_transform(df[numeric_cols])
df.head(n=5)

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10.0,A,X,M
1,5.0,5.0,11.0,A,Y,M
2,6.2,6.0,12.0,C,X,
3,6.0,5.0,9.0,C,X,M
4,7.0,4.8,8.0,D,,N


In [None]:
# Replace missing labels in x4/x5 with each column's most frequent value.
mode_cols = ['x4', 'x5']
imp_mode = SimpleImputer(strategy='most_frequent')
df[mode_cols] = imp_mode.fit_transform(df[mode_cols])
df.head(n=5)

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10.0,A,X,M
1,5.0,5.0,11.0,A,Y,M
2,6.2,6.0,12.0,C,X,
3,6.0,5.0,9.0,C,X,M
4,7.0,4.8,8.0,D,X,N


In [None]:
# Fill missing x6 entries with the fixed label 'N'.
imp_constant = SimpleImputer(fill_value='N', strategy='constant')
x6_filled = imp_constant.fit_transform(df[['x6']])
df[['x6']] = x6_filled
df.head(n=5)

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10.0,A,X,M
1,5.0,5.0,11.0,A,Y,M
2,6.2,6.0,12.0,C,X,N
3,6.0,5.0,9.0,C,X,M
4,7.0,4.8,8.0,D,X,N


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Fresh toy frame for the IterativeImputer example: three float columns with
# NaNs (row 2 lacks x1 and x3, row 4 lacks x2) plus a complete label column.
df = pd.DataFrame(dict(
    x1=[4.3, 5.1, np.nan, 6.3, 7.4, 9.1],
    x2=[2.9, 5.1, 6.3, 4.9, np.nan, 5.4],
    x3=[9, 11.1, np.nan, 8.9, 9.1, 11.0],
    x4=['A', 'A', 'C', 'C', 'D', 'D'],
))

In [None]:
# Display the whole frame so the NaN positions are visible before imputation.
df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,,6.3,,C
3,6.3,4.9,8.9,C
4,7.4,,9.1,D
5,9.1,5.4,11.0,D


In [None]:
# Impute the three numeric columns with IterativeImputer (default settings).
iter_cols = ['x1', 'x2', 'x3']
imp_iter = IterativeImputer()
df[iter_cols] = imp_iter.fit_transform(df[iter_cols])
df.head(n=5)

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,7.18363,6.3,9.823389,C
3,6.3,4.9,8.9,C
4,7.4,5.073866,9.1,D


In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Rebuild the same toy frame (NaNs restored) for the KNN imputation example.
df = pd.DataFrame(
    data={
        'x1': [4.3, 5.1, np.nan, 6.3, 7.4, 9.1],
        'x2': [2.9, 5.1, 6.3, 4.9, np.nan, 5.4],
        'x3': [9, 11.1, np.nan, 8.9, 9.1, 11.0],
        'x4': ['A', 'A', 'C', 'C', 'D', 'D'],
    },
)

In [None]:
# Impute the three numeric columns with KNNImputer (default settings).
# This cell previously instantiated IterativeImputer, so it only repeated the
# preceding section's result instead of exercising the KNNImputer that is
# imported just above. The variable name is kept for compatibility.
knn_iter = KNNImputer()
df[['x1', 'x2', 'x3']] = knn_iter.fit_transform(df[['x1', 'x2', 'x3']])
df.head()

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,7.18363,6.3,9.823389,C
3,6.3,4.9,8.9,C
4,7.4,5.073866,9.1,D
