In [251]:
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data.csv")

In [252]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


In [253]:
# Summary statistics of the numeric columns. Left as the cell's last expression
# (no print) so the notebook renders it as a table like the other cells do.
df.describe()

         Duration       Pulse    Maxpulse    Calories
count   32.000000   32.000000   32.000000   30.000000
mean    68.437500  103.500000  128.500000  304.680000
std     70.039591    7.832933   12.998759   66.003779
min     30.000000   90.000000  101.000000  195.100000
25%     60.000000  100.000000  120.000000  250.700000
50%     60.000000  102.500000  127.500000  291.200000
75%     60.000000  106.500000  132.250000  343.975000
max    450.000000  130.000000  175.000000  479.000000


In [254]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Duration,0
Date,1
Pulse,0
Maxpulse,0
Calories,2


In [255]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated().sum()

np.int64(1)

In [256]:
# Eyeball the rows directly. A bare expression renders as a table, and pandas
# truncates the view on its own once the frame exceeds `display.max_rows`,
# so this stays readable even if the dataset grows.
df

    Duration          Date  Pulse  Maxpulse  Calories
0         60  '2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45  '2020/12/05'    117       148     406.0
5         60  '2020/12/06'    102       127     300.0
6         60  '2020/12/07'    110       136     374.0
7        450  '2020/12/08'    104       134     253.3
8         30  '2020/12/09'    109       133     195.1
9         60  '2020/12/10'     98       124     269.0
10        60  '2020/12/11'    103       147     329.3
11        60  '2020/12/12'    100       120     250.7
12        60  '2020/12/12'    100       120     250.7
13        60  '2020/12/13'    106       128     345.3
14        60  '2020/12/14'    104       132     379.3
15        60  '2020/12/15'     98       123     275.0
16        60  '2020/12/16'     98       120     215.2
17        60  '2020/12/17'  

In [257]:
# Impute missing Calories with the column mean (NaN entries are skipped by .mean()).
mean_calories = df["Calories"].mean()

# Write the mean only into the rows where Calories is missing.
df.loc[df["Calories"].isna(), "Calories"] = mean_calories

df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [258]:
# Remove exact duplicate rows, then discard the Date column.
# The result is rebound rather than mutated in place, and `errors="ignore"`
# turns the column drop into a no-op when `Date` is already gone, so the cell
# can be re-run without raising KeyError.
df = df.drop_duplicates().drop(columns="Date", errors="ignore")

In [259]:
def remove_outliers_iqr(df, columns=None, factor=1.5):
    """Return a copy of ``df`` without rows that fall outside the IQR fences.

    For each checked column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive). Columns are processed one after
    another on the already-filtered data, so the quartiles of later columns
    are computed from the rows that survived the earlier ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str, optional
        Columns to check. Defaults to every numeric column of ``df``.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        Filtered copy; surviving rows keep their original index labels.

    Notes
    -----
    Rows holding NaN in a checked column are dropped as well, because NaN
    never satisfies the fence comparison.
    """
    df_clean = df.copy()
    if columns is None:
        columns = df_clean.select_dtypes(include="number").columns
    for col in columns:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1
        # Named `within_fences` rather than `filter` so the builtin is not shadowed.
        within_fences = df_clean[col].between(q1 - factor * iqr, q3 + factor * iqr)
        df_clean = df_clean[within_fences]
    return df_clean

In [260]:
# Apply the IQR filter to every numeric column; the function works on a copy, so `df` itself is left as is.
df_without_outliers = remove_outliers_iqr(df)

df_without_outliers

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
2,60,103,135,340.0
5,60,102,127,300.0
6,60,110,136,374.0
9,60,98,124,269.0
11,60,100,120,250.7
13,60,106,128,345.3
14,60,104,132,379.3
15,60,98,123,275.0
16,60,98,120,215.2


In [261]:
def min_max_scaling(df, columns):
    """Return a copy of ``df`` with ``columns`` linearly rescaled to [0, 1].

    Each listed column is transformed as ``(x - min) / (max - min)`` using
    that column's own minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the columns to rescale.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which the listed columns hold the scaled values;
        all other columns are unchanged.

    Notes
    -----
    A column whose values are all equal has ``max - min == 0``. Such a
    column is mapped to 0.0 instead of being divided by zero, which would
    fill it with NaN. Missing values stay missing.
    """
    df_scaled = df.copy()
    for col in columns:
        min_val = df_scaled[col].min()
        max_val = df_scaled[col].max()
        value_range = max_val - min_val
        shifted = df_scaled[col] - min_val
        if value_range == 0:
            # Constant column: `shifted` is already all zeros (NaN preserved);
            # only the dtype is aligned with the scaled columns.
            df_scaled[col] = shifted.astype(float)
        else:
            df_scaled[col] = shifted / value_range
    return df_scaled

In [262]:
# Only the two pulse columns are rescaled; Duration and Calories are not passed to the scaler.
columns_to_normalization = ["Pulse", "Maxpulse"]

df_scaled = min_max_scaling(df_without_outliers, columns_to_normalization)

df_scaled

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,1.0,0.625,409.1
2,60,0.416667,0.9375,340.0
5,60,0.333333,0.4375,300.0
6,60,1.0,1.0,374.0
9,60,0.0,0.25,269.0
11,60,0.166667,0.0,250.7
13,60,0.666667,0.5,345.3
14,60,0.5,0.75,379.3
15,60,0.0,0.1875,275.0
16,60,0.0,0.0,215.2


In [263]:
# Check row count, dtypes and non-null counts of the cleaned and scaled frame.
df_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18 entries, 0 to 30
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  18 non-null     int64  
 1   Pulse     18 non-null     float64
 2   Maxpulse  18 non-null     float64
 3   Calories  18 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 720.0 bytes


In [264]:
# Summary statistics after cleaning and scaling. Left as the cell's last
# expression (no print) so the notebook renders it as a table.
df_scaled.describe()

       Duration      Pulse   Maxpulse    Calories
count      18.0  18.000000  18.000000   18.000000
mean       60.0   0.384259   0.444444  316.348889
std         0.0   0.315890   0.334785   53.647351
min        60.0   0.000000   0.000000  215.200000
25%        60.0   0.166667   0.187500  276.250000
50%        60.0   0.333333   0.468750  313.840000
75%        60.0   0.479167   0.734375  359.475000
max        60.0   1.000000   1.000000  409.100000
