### Wczytanie zbiór danych Diet_R.csv

In [28]:
import pandas as pd
import numpy as np

data = pd.read_csv("data/Diet_R.csv", na_values=["", " ", "?", "NA", "NaN", None])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Person        78 non-null     int64  
 1   gender        76 non-null     float64
 2   Age           78 non-null     int64  
 3   Height        78 non-null     int64  
 4   pre.weight    78 non-null     int64  
 5   Diet          78 non-null     int64  
 6   weight6weeks  78 non-null     float64
dtypes: float64(2), int64(5)
memory usage: 4.4 KB


### Wykrywanie obserwacji odstających metodą rozstępu międzykwartylowego (IQR)

In [29]:
def detect_outliers(df):
    outlier_counts = {}
    for col in df.select_dtypes(include=[float,int]).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_counts[col] = outliers.shape[0]
    print("Outlier counts per column:")
    print(outlier_counts)

detect_outliers(data)

Outlier counts per column:
{'Person': 0, 'gender': 0, 'Age': 0, 'Height': 8, 'pre.weight': 1, 'Diet': 0, 'weight6weeks': 1}


### Wykrywanie braków danych

In [25]:
missing_values = data.isnull().sum()
print("Missing values per column:\n", missing_values)

Missing values per column:
 Person          0
gender          2
Age             0
Height          0
pre.weight      0
Diet            0
weight6weeks    0
dtype: int64


### Zastąpnienie brakujących wartości najczęściej występującą wartością kolumny

In [26]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
data['gender'] = imputer.fit_transform(data[['gender']])

data


Unnamed: 0,Person,gender,Age,Height,pre.weight,Diet,weight6weeks
0,25,0.0,41,171,60,2,60.0
1,26,0.0,32,174,103,2,103.0
2,1,0.0,22,159,58,1,54.2
3,2,0.0,46,192,60,1,54.0
4,3,0.0,55,170,64,1,63.3
...,...,...,...,...,...,...,...
73,74,1.0,35,183,83,3,80.2
74,75,1.0,49,177,84,3,79.9
75,76,1.0,28,164,85,3,79.7
76,77,1.0,40,167,87,3,77.8


Dla wczytanego zbioru policzyć podstawowe statystyki:
* średnia, mediana, odchylenie standardowe, mediana, 1 i 3 kwartyl
* statystki policzyć dla zbioru jako całości i z podziałem na płeć
* wyniki zapisać do pliku

In [27]:
def q1(x): return x.quantile(0.25)
def q3(x): return x.quantile(0.75)

num_cols = data.select_dtypes(include=[np.number]).columns.tolist()

# overall statistics
overall_stats = data[num_cols].agg(['mean', 'median', 'std', q1, q3]).T
overall_stats.index.name = 'variable'
overall_stats.columns = ['mean', 'median', 'std', 'q1', 'q3']

# statistics by gender
group_stats = data.groupby('gender')[num_cols].agg(['mean', 'median', 'std', q1, q3])
group_stats.columns = [f"{var}_{stat}" for var, stat in group_stats.columns]
group_stats.index.name = 'gender'


overall_stats.to_csv('data/results_overall_stats.csv')
group_stats.to_csv('data/results_stats_by_gender.csv')


group_stats

Unnamed: 0_level_0,Person_mean,Person_median,Person_std,Person_q1,Person_q3,gender_mean,gender_median,gender_std,gender_q1,gender_q3,...,Diet_mean,Diet_median,Diet_std,Diet_q1,Diet_q3,weight6weeks_mean,weight6weeks_median,weight6weeks_std,weight6weeks_q1,weight6weeks_q3
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,33.555556,33.0,21.36326,12.0,55.0,0.0,0.0,0.0,0.0,0.0,...,2.022222,2.0,0.811533,1.0,3.0,64.035556,62.4,8.46367,60.0,68.1
1.0,47.606061,47.0,22.149971,23.0,70.0,1.0,1.0,0.0,1.0,1.0,...,2.060606,2.0,0.826869,1.0,3.0,75.015152,73.9,4.629398,71.6,79.1
