# Основы pandas. Манипуляция с данными

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('sleep75.csv')

In [4]:
df.dtypes

age           int64
black         int64
case          int64
clerical    float64
construc    float64
educ          int64
earns74       int64
gdhlth        int64
inlf          int64
leis1         int64
leis2         int64
leis3         int64
smsa          int64
lhrwage     float64
lothinc     float64
male          int64
marr          int64
prot          int64
rlxall        int64
selfe         int64
sleep         int64
slpnaps       int64
south         int64
spsepay       int64
spwrk75       int64
totwrk        int64
union         int64
worknrm       int64
workscnd      int64
exper         int64
yngkid        int64
yrsmarr       int64
hrwage      float64
agesq         int64
dtype: object

## Отбор столбцов в новый датафрейм

In [29]:
df_new = df[['sleep', 'totwrk', 'age', 'male', 'union']]
df_new

Unnamed: 0,sleep,totwrk,age,male,union
0,3113,3438,32,1,0
1,2920,5020,31,1,0
2,2670,2815,44,1,0
3,3083,3786,30,0,0
4,3448,2580,64,1,0
...,...,...,...,...,...
701,2985,2026,45,0,0
702,3520,675,34,1,1
703,3510,1851,37,0,0
704,2970,1961,54,0,1


In [10]:
# альтернативно через .loc. ':' означает полный диапазон по строкам (индексу)
df.loc[:,['sleep', 'totwrk', 'age', 'male', 'union']]

Unnamed: 0,sleep,totwrk,age,male,union
0,3113,3438,32,1,0
1,2920,5020,31,1,0
2,2670,2815,44,1,0
3,3083,3786,30,0,0
4,3448,2580,64,1,0
...,...,...,...,...,...
701,2985,2026,45,0,0
702,3520,675,34,1,1
703,3510,1851,37,0,0
704,2970,1961,54,0,1


## Фильтрация данных

Отберём те наблюдения, для которых `male` равно 1. Для этого используем логическтй опертор сраневния `==` (**не путать с `=` !**)

Два способа:
- `DataFrame[logic_condition]`
- `DataFrame.query(logic_condition)`

In [11]:
df_new[ df_new['male']==1 ]

Unnamed: 0,sleep,totwrk,age,male,union
0,3113,3438,32,1,0
1,2920,5020,31,1,0
2,2670,2815,44,1,0
4,3448,2580,64,1,0
5,4063,1205,41,1,0
...,...,...,...,...,...
692,3060,2603,50,1,0
694,3028,2513,44,1,0
695,3023,2418,29,1,0
699,2993,2725,61,1,0


In [14]:
# альтернативно
df_new.query(' `male`==1 ')

Unnamed: 0,sleep,totwrk,age,male,union
0,3113,3438,32,1,0
1,2920,5020,31,1,0
2,2670,2815,44,1,0
4,3448,2580,64,1,0
5,4063,1205,41,1,0
...,...,...,...,...,...
692,3060,2603,50,1,0
694,3028,2513,44,1,0
695,3023,2418,29,1,0
699,2993,2725,61,1,0


Сколько таких наблюдений? Посчитаем явно

In [24]:
df_new[ df_new['male']==1 ].shape

(400, 5)

Отберём теперь те наблюдения, для которых `male` равно 1 и `age`>50 (обратим внимание на индекс)

In [18]:
df_new[ (df_new['male']==1) & (df_new['age']>50) ]

Unnamed: 0,sleep,totwrk,age,male,union
4,3448,2580,64,1,0
18,3420,2300,53,1,0
22,3470,2506,53,1,0
25,2873,3588,53,1,1
29,3238,3173,59,1,0
...,...,...,...,...,...
670,3200,2363,58,1,0
680,3630,2300,54,1,0
681,3150,1188,64,1,0
688,3073,2351,57,1,1


In [19]:
df_new.query(' `male`==1 & `age`>50 ')

Unnamed: 0,sleep,totwrk,age,male,union
4,3448,2580,64,1,0
18,3420,2300,53,1,0
22,3470,2506,53,1,0
25,2873,3588,53,1,1
29,3238,3173,59,1,0
...,...,...,...,...,...
670,3200,2363,58,1,0
680,3630,2300,54,1,0
681,3150,1188,64,1,0
688,3073,2351,57,1,1


Вопрос: сколько наблюдения для которых male=1, union=0, totwrk<2500?

Ответ

In [22]:
df_new[ (df_new['male']==1) & (df_new['union']==0) & (df_new['totwrk']<2500)  ].shape

(157, 5)

## Добавление столбцов

добавим столбец с квадратом переменной age с названием age2

In [35]:
df_new.loc[:, 'age2'] = df_new['age']**2
df_new.head()

Unnamed: 0,sleep,totwrk,age,male,union,age2
0,3113,3438,32,1,0,1024
1,2920,5020,31,1,0,961
2,2670,2815,44,1,0,1936
3,3083,3786,30,0,0,900
4,3448,2580,64,1,0,4096


In [36]:
# альтернативно
df_new['totwrk2'] = df_new['totwrk']**2
df_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['totwrk2'] = df_new['totwrk']**2


Unnamed: 0,sleep,totwrk,age,male,union,age2,totwrk2
0,3113,3438,32,1,0,1024,11819844
1,2920,5020,31,1,0,961,25200400
2,2670,2815,44,1,0,1936,7924225
3,3083,3786,30,0,0,900,14333796
4,3448,2580,64,1,0,4096,6656400
