In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('weight-height.csv') # to import and read out the file

In [3]:
df.head() # check the file

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


# Encoding

In [4]:
# Label encoder

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [6]:
df.Gender = le.fit_transform(df[['Gender']]) # fit and transform in a line to encide the string

In [7]:
df.head()

Unnamed: 0,Gender,Height,Weight
0,1,73.847017,241.893563
1,1,68.781904,162.310473
2,1,74.110105,212.740856
3,1,71.730978,220.04247
4,1,69.881796,206.349801


# Seperate data 

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# seperate the data and assigned into x and y  variable
x = df.drop('Weight', axis = 1) # x contain Gender and height, it is being done using drop function
y = df[['Weight']] # y contain weight data

In [10]:
x.head() # check x column data

Unnamed: 0,Gender,Height
0,1,73.847017
1,1,68.781904
2,1,74.110105
3,1,71.730978
4,1,69.881796


In [11]:
y.head() # check y column data

Unnamed: 0,Weight
0,241.893563
1,162.310473
2,212.740856
3,220.04247
4,206.349801


In [12]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size = .30, random_state = 1) # train and test data and split into 70:30 ratio

In [13]:
xtrain.head()

Unnamed: 0,Gender,Height
1277,1,72.385301
5361,0,60.960147
5728,0,69.119292
4276,1,68.27105
3112,1,64.991405


In [14]:
ytrain.head()

Unnamed: 0,Weight
1277,189.165592
5361,124.876796
5728,168.864403
4276,172.941248
3112,144.551044


In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
reg = LinearRegression() # creat an object

In [17]:
reg.fit(xtrain, ytrain) # fit the xtrain and ytrain data

LinearRegression()

In [18]:
reg.coef_ # find out the coefficient value

array([[19.34359322,  5.96887438]])

In [19]:
reg.intercept_ # find out the intercept value

array([-244.55280903])

In [20]:
xtest.head()

Unnamed: 0,Gender,Height
7154,0,65.068038
3258,1,70.824643
2188,1,68.663594
5609,0,65.98761
3186,1,70.901461


In [21]:
ytest.head()

Unnamed: 0,Weight
7154,160.800726
3258,184.516688
2188,184.262796
5609,166.844114
3186,200.466282


In [22]:
reg.predict(xtest) # predict the weight value considering the height and gender value

array([[143.83013436],
       [197.53417989],
       [184.63515112],
       ...,
       [160.7078525 ],
       [186.82852694],
       [201.4226542 ]])

In [23]:
pred = reg.predict(xtest) 

In [24]:
pred

array([[143.83013436],
       [197.53417989],
       [184.63515112],
       ...,
       [160.7078525 ],
       [186.82852694],
       [201.4226542 ]])

In [25]:
reg.score(xtrain, ytrain) # training accuracy

0.8972135459668118

In [26]:
reg.score(xtest, ytest)  # testing accuracy assessment

0.905995960709116

In [27]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [28]:
# actual - predicted
mean_squared_error(ytest, reg.predict(xtest)) # find MSE value

99.67250998070034

In [29]:
from sklearn.metrics import r2_score

In [30]:
r2_score(ytest, pred) # testing accuracy/ alternative method

0.905995960709116

# KNN Regression

In [31]:
from sklearn.neighbors import KNeighborsRegressor # import knn regressor

In [32]:
knn = KNeighborsRegressor(n_neighbors = 100) # k value is 100

In [33]:
knn.fit(xtrain, ytrain) 

KNeighborsRegressor(n_neighbors=100)

In [34]:
pred2 = knn.predict(xtest) # make prediction as before

In [35]:
pred2

array([[142.83375838],
       [198.29567688],
       [185.1495741 ],
       ...,
       [159.8721488 ],
       [185.95631787],
       [201.0705108 ]])

In [36]:
mean_squared_error(ytest, pred2) # find MSE value

104.01856360504236

In [37]:
reg.score(xtest, ytest) # accuracy assessment

0.905995960709116

In [38]:
from sklearn.metrics import r2_score # accuracy assessment

In [39]:
r2_score(ytest, pred2) 

0.9018970712987657

In [40]:
# so in prediction, Linear Regression  perform better compared to KNN regression as the MSE is lower in LR (99.67) compared to KNN (104)

# KNN classifier

In [41]:
x1 = df.drop('Gender', axis = 1) # seperating data
y1 = df[['Gender']]

In [42]:
x1.head()

Unnamed: 0,Height,Weight
0,73.847017,241.893563
1,68.781904,162.310473
2,74.110105,212.740856
3,71.730978,220.04247
4,69.881796,206.349801


In [43]:
y1.head()

Unnamed: 0,Gender
0,1
1,1
2,1
3,1
4,1


In [44]:
x1train, x1test, y1train, y1test = train_test_split(x1,y1, test_size = .30, random_state = 1)

In [45]:
x1train.head()

Unnamed: 0,Height,Weight
1277,72.385301,189.165592
5361,60.960147,124.876796
5728,69.119292,168.864403
4276,68.27105,172.941248
3112,64.991405,144.551044


In [46]:
y1train.head()

Unnamed: 0,Gender
1277,1
5361,0
5728,0
4276,1
3112,1


In [47]:
x1test.head()

Unnamed: 0,Height,Weight
7154,65.068038,160.800726
3258,70.824643,184.516688
2188,68.663594,184.262796
5609,65.98761,166.844114
3186,70.901461,200.466282


In [48]:
y1test.head()

Unnamed: 0,Gender
7154,0
3258,1
2188,1
5609,0
3186,1


In [49]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors = 100)

In [50]:
#Train the model using the training set
model.fit(x1train, y1train)

KNeighborsClassifier(n_neighbors=100)

In [51]:
pred3 = model.predict([[70.901461, 200.466282]]) # height 70.901461 and Weight 200.466282

In [52]:
pred3 # predicted result

array([1])

In [53]:
pred3 = model.predict(x1test)

In [54]:
pred3

array([1, 1, 1, ..., 0, 1, 1])

In [55]:
r2_score(y1test, pred3) # accuracy assessment 

0.6636682213696432