# 1. Базовые понятия статистики

### Задание 1. Загрузка данных
Изучить представленный набор данных на основе описания его столбцов, загрузить его и оставить 8 столбцов для дальнейшего изучения: surgery?, Age, rectal temperature, pulse, respiratory rate, temperature of extremities, pain, outcome.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the horse colic dataset and keep the eight columns under study.
#
# The raw file has no header row, so it is read with header=None; without it
# pandas would consume the first record as column labels and that observation
# would be lost once the labels are overwritten below. Missing values are
# encoded as '?' in the file, so they are parsed as NaN at read time.
# NOTE: column names are assigned after reading (not via `names=`) because the
# list contains the duplicated label 'type of lesion', which read_csv rejects.
f = pd.read_csv(
    'https://raw.githubusercontent.com/obulygin/pyda_homeworks/master/statistics_basics/horse_data.csv',
    header=None,
    na_values='?',
)
f.columns = ['surgery?', 'age', 'Hospital Number', 'rectal temperature', 'pulse', 'respiratory rate', 'temperature of extremities', 'peripheral pulse', 'mucous membranes', 'capillary refill time', 'pain', 'peristalsis', 'abdominal distension', 'nasogastric tube', 'nasogastric reflux', 'nasogastric reflux PH', 'rectal examination', 'abdomen', 'packed cell volume', 'total protein', 'abdominocentesis appearance', 'abdomcentesis total protein', 'outcome', 'surgical lesion?', 'type of lesion', 'type of lesion', 'type of lesion', 'cp_data']
stat = f[['surgery?', 'age', 'rectal temperature', 'pulse', 'respiratory rate', 'temperature of extremities', 'pain', 'outcome']]
# Work on an independent copy so later column assignments do not touch `f`.
df = stat.copy()
df

### Задание 2. Первичное изучение данных
Проанализировать значения по столбцам, рассчитать базовые статистики, найти выбросы.

In [None]:
# Show dtypes and non-null counts before any conversion.
df.info()

# Cast the listed columns to float64 one by one; 'age' is not converted here.
float_columns = [
    'surgery?',
    'rectal temperature',
    'pulse',
    'respiratory rate',
    'temperature of extremities',
    'pain',
    'outcome',
]
for column_name in float_columns:
    df[column_name] = df[column_name].astype('float64')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Mean, std, min, max, quartiles and median are already shown by describe().
# Here: the range (max - min) of each measured column. The ranges of surgery,
# pain and outcome are skipped because the author doubts they are informative.
def _value_range(column_name):
    """Return max minus min of the given `df` column."""
    values = df[column_name]
    return values.max() - values.min()


age_range = _value_range('age')
rect_range = _value_range('rectal temperature')
pulse_range = _value_range('pulse')
resp_range = _value_range('respiratory rate')
temp_range = _value_range('temperature of extremities')

for label, value in (
    ('Age', age_range),
    ('Rectal Temperature', rect_range),
    ('Pulse', pulse_range),
    ('Respiratory Rate', resp_range),
    ('Temperature of Extremities', temp_range),
):
    print(f'{label} range: {value}')

In [None]:
# Mode of every column, computed on values rounded to the nearest integer.
# When several values tie, the first one returned by mode() is reported.
mode_columns = {
    'Surgery': 'surgery?',
    'Age': 'age',
    'Rectal Temperature': 'rectal temperature',
    'Pulse': 'pulse',
    'Respiratory Rate': 'respiratory rate',
    'Temperature of Extremities': 'temperature of extremities',
    'Pain': 'pain',
    'Outcome': 'outcome',
}
for label, column_name in mode_columns.items():
    rounded_values = df[column_name].round()
    print(f'{label} mode: ', rounded_values.mode()[0])

In [None]:
# Sample variance ("dispersion") of every column.
# 'pain' and 'outcome' are rounded before the variance is taken; the other
# columns are used as they are.
variance_inputs = {
    'Surgery': df['surgery?'],
    'Age': df['age'],
    'Rectal Temperature': df['rectal temperature'],
    'Pulse': df['pulse'],
    'Respiratory Rate': df['respiratory rate'],
    'Temperature of Extremities': df['temperature of extremities'],
    'Pain': df['pain'].round(),
    'Outcome': df['outcome'].round(),
}
for label, values in variance_inputs.items():
    print(f'{label} dispersion: ', values.var())

In [None]:
#outliers:
#surgery - seems ok
#age - seems ok
#rect temp - seems ok
#pulse - seems odd
#resp rate - seems odd
#temp of extr - seems ok
#pain - seems ok
#outcome - seems ok

In [None]:
# Flag pulse outliers with the 1.5 * IQR rule: a value is an outlier when it
# lies strictly below Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR.
pulse_q1 = df['pulse'].quantile(0.25)
pulse_q3 = df['pulse'].quantile(0.75)
pulse_iqr = pulse_q3 - pulse_q1
pulse_lower_bound = pulse_q1 - (1.5 * pulse_iqr)
pulse_upper_bound = pulse_q3 + (1.5 * pulse_iqr)

# Explicit comparisons replace ~Series.between(..., inclusive=True):
#  * a boolean `inclusive` is rejected by pandas 2.x (it must be a string);
#  * negating between() also selected rows where pulse is NaN, because NaN is
#    never "between" anything. Comparisons with NaN are False, so missing
#    values are no longer reported as outliers.
pulse_is_outlier = (df['pulse'] < pulse_lower_bound) | (df['pulse'] > pulse_upper_bound)
pulse_outliers = df[pulse_is_outlier]

pulse_outliers['pulse']

In [None]:
# Flag respiratory rate outliers with the 1.5 * IQR rule: a value is an outlier
# when it lies strictly below Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR.
resp_q1 = df['respiratory rate'].quantile(0.25)
resp_q3 = df['respiratory rate'].quantile(0.75)
resp_iqr = resp_q3 - resp_q1
resp_lower_bound = resp_q1 - (1.5 * resp_iqr)
resp_upper_bound = resp_q3 + (1.5 * resp_iqr)

# Explicit comparisons replace ~Series.between(..., inclusive=True):
#  * a boolean `inclusive` is rejected by pandas 2.x (it must be a string);
#  * negating between() also selected rows where the rate is NaN, because NaN
#    is never "between" anything. Comparisons with NaN are False, so missing
#    values are no longer reported as outliers.
resp_is_outlier = (df['respiratory rate'] < resp_lower_bound) | (df['respiratory rate'] > resp_upper_bound)
resp_outliers = df[resp_is_outlier]

resp_outliers['respiratory rate']

### Задание 3. Работа с пропусками
Рассчитать количество пропусков для всех выбранных столбцов. Принять и обосновать решение о методе заполнения пропусков по каждому столбцу на основе рассчитанных статистик и возможной взаимосвязи значений в них. Сформировать датафрейм, в котором пропуски будут отсутствовать.

In [None]:
# Number of missing values in each selected column.
df.isnull().sum()

In [None]:
# Show the rows in which the 'surgery?' value is missing.
surgery_missing = df['surgery?'].isna()
df[surgery_missing]

In [None]:
# Author's rationale: the single row that lacks 'surgery?' also lacks
# 'outcome', so the gaps in both columns are filled with the column's mode.
for column_name in ('surgery?', 'outcome'):
    most_frequent = df[column_name].mode()[0]
    df[column_name] = df[column_name].fillna(most_frequent)
df.info()

In [None]:
# 'outcome' has no gaps any more, so it can drive the imputation of 'pain'
# (the author treats the two as related): each missing pain value receives
# the median pain of the rows sharing the same outcome.
pain_median_by_outcome = df.groupby('outcome')['pain'].transform('median')
df['pain'] = df['pain'].fillna(pain_median_by_outcome)
df.info()

In [None]:
# 'pain' has no gaps any more, so it can drive the imputation of
# 'temperature of extremities' (the author treats the two as related): each
# missing value receives the median of the rows with the same pain level.
extremities_median_by_pain = df.groupby('pain')['temperature of extremities'].transform('median')
df['temperature of extremities'] = df['temperature of extremities'].fillna(extremities_median_by_pain)
df.info()

In [None]:
# 'pain' has no gaps any more, so it can drive the imputation of 'pulse'
# (the author treats the two as related): each missing pulse receives the
# median pulse of the rows with the same pain level.
pulse_median_by_pain = df.groupby('pain')['pulse'].transform('median')
df['pulse'] = df['pulse'].fillna(pulse_median_by_pain)
df.info()

In [None]:
# Author's note: the initial idea was that rectal temperature depends on pain,
# but on reflection the two may not be comparable, so the choice of fill
# method for this column is uncertain.
#
# PS: 'pulse' was filled by pain level as well, even though a subjective pain
# scale cannot really be measured against beats per minute. The same
# per-pain-level median is nevertheless used here.
rectal_median_by_pain = df.groupby('pain')['rectal temperature'].transform('median')
df['rectal temperature'] = df['rectal temperature'].fillna(rectal_median_by_pain)
df.info()

In [None]:
# The dataset description says of respiratory rate: 'usefulness is doubtful
# due to the great fluctuations', so the whole column is dropped instead of
# being imputed.
#
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result must be assigned back; otherwise `df` keeps the column and its NaNs.
df = df.drop(columns=['respiratory rate'])
df