# STEPS:
    1- IMPORT LIBRARIES
    2- LOAD DATA
    3- INSIGHTS FROM DATA
    4- PREPROCESSING:
        (A) NULL VALUE TREATMENT
        (B) ENCODING:- CONVERT NUMERICAL INTO CONTINOUS
        (C) SPLIT DATA:- TRAIN AND TEST
        (D) SCALING:- WE CREATE ALL FEATURES ON SAME SCALE
    5- CREATE MODEL
    6- FIT MODEL WITH SPLITTED DATA
    7- PREDICTION
    8- EVALUATION

# (1) IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd


# (2) LOAD DATA

In [2]:
df=pd.read_excel("Data_practice.XLSX")

# (3) INSIGHTS FROM DATA

In [4]:
df.shape

(23, 4)

In [5]:
df.sample(4)

Unnamed: 0,Weight,Height,BMI,look
11,65.0,1.7,22.491349,Fit
10,60.0,1.7,20.761246,Fit
0,60.0,1.5,26.666667,Overweight
13,75.0,1.7,25.951557,Overweight


In [6]:
df.head()

Unnamed: 0,Weight,Height,BMI,look
0,60.0,1.5,26.666667,Overweight
1,65.0,1.5,28.888889,Overweight
2,70.0,1.5,31.111111,Overweight
3,75.0,1.5,33.333333,Overweight
4,80.0,1.5,35.555556,Overweight


In [7]:
df.dtypes

Weight    float64
Height    float64
BMI       float64
look       object
dtype: object

In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Weight,22.0,69.772727,6.982508,60.0,65.0,70.0,75.0,80.0
Height,22.0,1.672727,0.13159,1.5,1.6,1.7,1.8,1.9
BMI,23.0,25.018386,5.075479,16.620499,21.183092,24.691358,28.285275,35.555556


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Weight  22 non-null     float64
 1   Height  22 non-null     float64
 2   BMI     23 non-null     float64
 3   look    23 non-null     object 
dtypes: float64(3), object(1)
memory usage: 864.0+ bytes


# (4) PREPROCESSING

# (a) null value treatment

In [12]:
df.isnull().sum()

Weight    1
Height    1
BMI       0
look      0
dtype: int64

# we can impute or drop
(a) mean or median:- if feature are in continous
(b) mode:- feature is categorical

# by imputing

In [13]:
p=df['Weight'].mean()

In [14]:
p

69.77272727272727

In [15]:
df['Weight'].fillna(p,inplace=True)

In [16]:
df.isnull().sum()

Weight    0
Height    1
BMI       0
look      0
dtype: int64

In [17]:
q=df['Height'].mean()

In [18]:
q

1.6727272727272726

In [19]:
df['Height'].fillna(q,inplace=True)

In [20]:
df.isnull().sum()

Weight    0
Height    0
BMI       0
look      0
dtype: int64

# by dropping


In [21]:
df1=pd.read_excel('Data_practice.xlsx')

In [23]:
df1.isnull().sum()

Weight    1
Height    1
BMI       0
look      0
dtype: int64

In [24]:
df1=df1.dropna()

In [25]:
df1.isnull().sum()

Weight    0
Height    0
BMI       0
look      0
dtype: int64

# (b)encoding: converting categorical into numerical/continous

In [26]:
# sklearn library gives us functions for preprocessing, train test splitting,model building,evaluation

In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
le=LabelEncoder()

In [30]:
df=df.apply(le.fit_transform)

In [31]:
df.dtypes

Weight    int64
Height    int64
BMI       int64
look      int32
dtype: object

In [32]:
df.sample(5)

Unnamed: 0,Weight,Height,BMI,look
17,3,4,6,0
10,0,3,5,0
14,5,3,16,1
13,4,3,13,1
16,1,4,4,0


# (c) splitting data

# Split data into features(independent variables) and labels(dependent/target variable)

In [33]:
df_new=df.drop(columns=['look'])

In [34]:
df_new.head()

Unnamed: 0,Weight,Height,BMI
0,0,0,14
1,1,0,17
2,3,0,19
3,4,0,21
4,5,0,22


In [37]:
features=df_new.columns[:-1]

In [38]:
features

Index(['Weight', 'Height'], dtype='object')

In [42]:
label=df_new.columns[-1]

In [43]:
label

'BMI'

In [44]:
x=df_new[features]
y=df_new[label]

In [46]:
x,y

(    Weight  Height
 0        0       0
 1        1       0
 2        3       0
 3        4       0
 4        5       0
 5        0       1
 6        1       1
 7        3       1
 8        4       1
 9        5       1
 10       0       3
 11       1       3
 12       3       3
 13       4       3
 14       5       3
 15       0       4
 16       1       4
 17       3       4
 18       4       4
 19       5       4
 20       2       5
 21       1       2
 22       3       5,
 0     14
 1     17
 2     19
 3     21
 4     22
 5      9
 6     12
 7     15
 8     18
 9     20
 10     5
 11     7
 12    10
 13    13
 14    16
 15     2
 16     4
 17     6
 18     8
 19    11
 20     0
 21     1
 22     3
 Name: BMI, dtype: int64)

# split data into train and test

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=4)

In [49]:
x_train.shape

(16, 2)

In [50]:
x_test.shape

(7, 2)

In [51]:
y_train.shape

(16,)

In [52]:
y_test.shape

(7,)

# (d) scaling

In [53]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [54]:
scaler=MinMaxScaler()

In [55]:
x_train=scaler.fit_transform(x_train)

In [56]:
x_train

array([[0.2, 0.6],
       [0.6, 0. ],
       [1. , 0. ],
       [0.8, 0.8],
       [0.6, 0.8],
       [0. , 0.8],
       [0.2, 0.8],
       [0.6, 1. ],
       [0.8, 0.6],
       [0.6, 0.2],
       [1. , 0.2],
       [1. , 0.8],
       [0.8, 0.2],
       [0.2, 0. ],
       [0. , 0.2],
       [1. , 0.6]])

In [58]:
x_test=scaler.fit_transform(x_test)

In [59]:
x_test

array([[0.25, 0.4 ],
       [0.5 , 1.  ],
       [0.  , 0.  ],
       [1.  , 0.  ],
       [0.75, 0.6 ],
       [0.  , 0.6 ],
       [0.25, 0.2 ]])

# (5) CREATE MODEL

In [60]:
from sklearn.linear_model import LinearRegression

In [61]:
model=LinearRegression()

# (6) MODEL FITTING

In [62]:
s1=model.fit(x_train,y_train)

# model attributes

In [63]:
s=s1.score(x_train,y_train)

In [64]:
print("coeff of determination",s)

coeff of determination 0.9730200128930135


In [65]:
print("intrerceptt",s1.intercept_)

intrerceptt 13.481266801075268


In [66]:
print('slope',s1.coef_)

slope [  9.39936156 -15.00714046]


# (7) PREDICTION

In [68]:
y_pred=s1.predict(x_test)

In [69]:
y_pred

array([ 9.82825101,  3.17380712, 13.4812668 , 22.88062836, 11.5265037 ,
        4.47698253, 12.8296791 ])

In [71]:
y_test

21     1
20     0
0     14
3     21
12    10
10     5
6     12
Name: BMI, dtype: int64

In [72]:
df_com=pd.DataFrame({'Actual':y_test,'Predicted':y_pred})

In [73]:
df_com

Unnamed: 0,Actual,Predicted
21,1,9.828251
20,0,3.173807
0,14,13.481267
3,21,22.880628
12,10,11.526504
10,5,4.476983
6,12,12.829679


# (8) MODEL EVALUATION

In [74]:
from sklearn.metrics import mean_squared_error,r2_score

In [75]:
mse=mean_squared_error(y_test,y_pred)

In [76]:
mse

13.587006128624083

In [77]:
r2=r2_score(y_test,y_pred)

In [78]:
r2

0.7202675208812689