In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv("csv_file/tips.csv")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.shape

(244, 7)

In [4]:
df.isna().sum()

total_bill     0
tip            0
sex           28
smoker         0
day            0
time           0
size           0
dtype: int64

In [5]:
# Now features encoding
df2 = pd.get_dummies(data=df,columns=['smoker','day','time'],drop_first=True)
df2

Unnamed: 0,total_bill,tip,sex,size,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,1.01,Female,2,0,0,1,0,0
1,10.34,1.66,Male,3,0,0,1,0,0
2,21.01,3.50,Male,3,0,0,1,0,0
3,23.68,3.31,Male,2,0,0,1,0,0
4,24.59,3.61,Female,4,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,,3,0,1,0,0,0
240,27.18,2.00,Female,2,1,1,0,0,0
241,22.67,2.00,Male,2,1,1,0,0,0
242,17.82,1.75,Male,2,0,1,0,0,0


In [6]:
notNull = df2[df2['sex'].notna()]
notNull.head()

Unnamed: 0,total_bill,tip,sex,size,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,1.01,Female,2,0,0,1,0,0
1,10.34,1.66,Male,3,0,0,1,0,0
2,21.01,3.5,Male,3,0,0,1,0,0
3,23.68,3.31,Male,2,0,0,1,0,0
4,24.59,3.61,Female,4,0,0,1,0,0


In [7]:
notNull.shape

(216, 9)

In [8]:
notNull.isnull().sum()

total_bill    0
tip           0
sex           0
size          0
smoker_Yes    0
day_Sat       0
day_Sun       0
day_Thur      0
time_Lunch    0
dtype: int64

In [9]:
isNull = df2[df2['sex'].isnull()]
isNull.head()

Unnamed: 0,total_bill,tip,sex,size,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
14,14.83,3.02,,2,0,0,1,0,0
21,20.29,2.75,,2,0,1,0,0,0
25,17.81,2.34,,4,0,1,0,0,0
29,19.65,3.0,,2,0,1,0,0,0
105,15.36,1.64,,2,1,1,0,0,0


In [10]:
isNull.shape

(28, 9)

In [11]:
isNull.isnull().sum()

total_bill     0
tip            0
sex           28
size           0
smoker_Yes     0
day_Sat        0
day_Sun        0
day_Thur       0
time_Lunch     0
dtype: int64

### For NotNull datasets

In [12]:
# for notNull datasets
features = notNull.drop(columns=['sex'])
features

Unnamed: 0,total_bill,tip,size,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,1.01,2,0,0,1,0,0
1,10.34,1.66,3,0,0,1,0,0
2,21.01,3.50,3,0,0,1,0,0
3,23.68,3.31,2,0,0,1,0,0
4,24.59,3.61,4,0,0,1,0,0
...,...,...,...,...,...,...,...,...
238,35.83,4.67,3,0,1,0,0,0
240,27.18,2.00,2,1,1,0,0,0
241,22.67,2.00,2,1,1,0,0,0
242,17.82,1.75,2,0,1,0,0,0


In [13]:
features.shape

(216, 8)

In [14]:
features.isna().sum()

total_bill    0
tip           0
size          0
smoker_Yes    0
day_Sat       0
day_Sun       0
day_Thur      0
time_Lunch    0
dtype: int64

In [15]:
label = notNull.sex
label.head()

0    Female
1      Male
2      Male
3      Male
4    Female
Name: sex, dtype: object

In [16]:
label.shape

(216,)

In [17]:
label.isna().sum()

0

In [18]:
col_names = ['total_bill','tip','size']
my_features = features[col_names]
my_features.head()

Unnamed: 0,total_bill,tip,size
0,16.99,1.01,2
1,10.34,1.66,3
2,21.01,3.5,3
3,23.68,3.31,2
4,24.59,3.61,4


In [19]:
my_features.shape

(216, 3)

In [20]:
scaler = MinMaxScaler()

In [21]:
features[col_names] = scaler.fit_transform(my_features.values)

In [22]:
features.head()

Unnamed: 0,total_bill,tip,size,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,0.291579,0.001111,0.2,0,0,1,0,0
1,0.152283,0.073333,0.4,0,0,1,0,0
2,0.375786,0.277778,0.4,0,0,1,0,0
3,0.431713,0.256667,0.2,0,0,1,0,0
4,0.450775,0.29,0.6,0,0,1,0,0


In [23]:
features.shape

(216, 8)

In [24]:
label.shape

(216,)

In [25]:
(features.isna().sum().sum(),label.isna().sum())

(0, 0)

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
knn_model = KNeighborsClassifier()

In [28]:
knn_model.fit(features,label)

KNeighborsClassifier()

### For null datasets

In [29]:
# for notNull datasets
features2 = isNull.drop(columns=['sex'])
features2

Unnamed: 0,total_bill,tip,size,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
14,14.83,3.02,2,0,0,1,0,0
21,20.29,2.75,2,0,1,0,0,0
25,17.81,2.34,4,0,1,0,0,0
29,19.65,3.0,2,0,1,0,0,0
105,15.36,1.64,2,1,1,0,0,0
106,20.49,4.06,2,1,1,0,0,0
107,25.21,4.29,2,1,1,0,0,0
108,18.24,3.76,2,0,1,0,0,0
109,14.31,4.0,2,1,1,0,0,0
110,14.0,3.0,2,0,1,0,0,0


In [30]:
features2.shape

(28, 8)

In [31]:
features2.isnull().sum().sum()

0

In [32]:
label2 = isNull.sex
label2.head()

14     NaN
21     NaN
25     NaN
29     NaN
105    NaN
Name: sex, dtype: object

In [34]:
label2.shape

(28,)

In [35]:
col_names2 = ['total_bill','tip','size']
my_features2 = features2[col_names2]
my_features2.head()

Unnamed: 0,total_bill,tip,size
14,14.83,3.02,2
21,20.29,2.75,2
25,17.81,2.34,4
29,19.65,3.0,2
105,15.36,1.64,2


In [36]:
my_features2.shape

(28, 3)

In [37]:
scaler2 = MinMaxScaler()

In [38]:
features2[col_names2] = scaler2.fit_transform(my_features2.values)

In [39]:
features2.head()

Unnamed: 0,total_bill,tip,size,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
14,0.223335,0.354386,0.2,0,0,1,0,0
21,0.384207,0.307018,0.2,0,1,0,0,0
25,0.311137,0.235088,0.6,0,1,0,0,0
29,0.365351,0.350877,0.2,0,1,0,0,0
105,0.238951,0.112281,0.2,1,1,0,0,0


In [40]:
y_pred = knn_model.predict(X=features2)

In [41]:
y_pred

array(['Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Female',
       'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male',
       'Male', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male',
       'Male', 'Male', 'Male', 'Male', 'Male', 'Male'], dtype=object)

In [43]:
df.isna().sum()

total_bill     0
tip            0
sex           28
smoker         0
day            0
time           0
size           0
dtype: int64

In [44]:
df3 = df.copy()

In [45]:
# Now fill-up the the missing value those our model is predict
x = [np.nan for x in range(28)]
df3 = df3.replace(to_replace= x, value= y_pred)

In [46]:
df3.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64