In [1]:
import numpy as np
import pandas as pd

In [2]:
# Print NumPy floats with two digits of precision.
np.set_printoptions(precision=2)

**Считываем данные из файла**

In [3]:
# Load the semicolon-separated CSV into a DataFrame.
# The path is relative to the notebook's working directory.
data = pd.read_csv('../../data/beauty.csv', sep=';')

In [4]:
# Confirm what kind of object read_csv returned.
type(data)

pandas.core.frame.DataFrame

**Смотрим на первые 5 строк**

In [5]:
# Preview the first five rows of the table.
data.head(n=5)

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks
0,5.73,30,0,1,0,1,1,1,14,4
1,4.28,28,0,1,0,1,1,0,12,3
2,7.96,35,0,1,0,1,0,0,10,4
3,11.57,38,0,1,0,0,1,1,16,3
4,11.42,27,0,1,0,0,1,0,16,3


In [6]:
# (number of rows, number of columns)
data.shape

(1260, 10)

**Краткая статистика – info и describe**

In [7]:
# Per-column dtype and non-null count, plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 10 columns):
wage        1260 non-null float64
exper       1260 non-null int64
union       1260 non-null int64
goodhlth    1260 non-null int64
black       1260 non-null int64
female      1260 non-null int64
married     1260 non-null int64
service     1260 non-null int64
educ        1260 non-null int64
looks       1260 non-null int64
dtypes: float64(1), int64(9)
memory usage: 98.5 KB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks
count,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0
mean,6.30669,18.206349,0.272222,0.933333,0.07381,0.346032,0.69127,0.27381,12.563492,3.185714
std,4.660639,11.963485,0.44528,0.249543,0.261564,0.475892,0.462153,0.446089,2.624489,0.684877
min,1.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0
25%,3.7075,8.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,3.0
50%,5.3,15.0,0.0,1.0,0.0,0.0,1.0,0.0,12.0,3.0
75%,7.695,27.0,1.0,1.0,0.0,1.0,1.0,1.0,13.0,4.0
max,77.72,48.0,1.0,1.0,1.0,1.0,1.0,1.0,17.0,5.0


**Индексация**

In [9]:
# Selecting a single column by name yields a Series; show its first five values.
data['exper'].head(n=5)

0    30
1    28
2    35
3    38
4    27
Name: exper, dtype: int64

**loc и iloc**

In [10]:
# .loc selects by label, and a label slice includes its end point:
# rows labelled 0 through 5 (six rows) of the two named columns.
data.loc[:5, ['wage', 'female']]

Unnamed: 0,wage,female
0,5.73,1
1,4.28,1
2,7.96,1
3,11.57,0
4,11.42,0
5,3.91,1


In [11]:
# .iloc selects by integer position with a half-open slice:
# all rows, columns at positions 2 and 3.
data.iloc[:, 2:4].head(n=5)

Unnamed: 0,union,goodhlth
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


**Логическая индексация**

In [12]:
# Mean wage for rows with female == 1, then for rows with female == 0.
(
    data.loc[data['female'] == 1, 'wage'].mean(),
    data.loc[data['female'] == 0, 'wage'].mean(),
)

(4.299357798165136, 7.3688228155339734)

In [13]:
# Median wage among rows with female == 0: married == 1 first, then married == 0.
(
    data.loc[(data['female'] == 0) & (data['married'] == 1), 'wage'].median(),
    data.loc[(data['female'] == 0) & (data['married'] == 0), 'wage'].median(),
)

(6.710000000000001, 5.0649999999999995)

**Groupby**

In [14]:
# Iterating over a groupby yields (group key, sub-DataFrame) pairs.
for look, sub_df in data.groupby('looks'):
    # Any per-group computation can go here; print the key and the
    # group's mean of `goodhlth`, each on its own line.
    print(look, sub_df['goodhlth'].mean(), sep='\n')

1
0.8461538461538461
2
0.9366197183098591
3
0.9210526315789473
4
0.9560439560439561
5
1.0


In [15]:
# Median of `wage` and `exper` within each value of `looks`.
# Call the built-in GroupBy.median instead of passing np.median to agg:
# recent pandas releases emit a FutureWarning for NumPy callables in agg
# and will change how they are dispatched. The result is the same.
data.groupby('looks')[['wage', 'exper']].median()

Unnamed: 0_level_0,wage,exper
looks,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.46,32.0
2,4.595,18.0
3,5.635,18.0
4,5.24,12.5
5,4.81,8.0


**Сводная таблица (таблица сопряжённости, crosstab)**

In [16]:
# Count rows for every combination of `female` (table rows) and `married` (table columns).
pd.crosstab(index=data['female'], columns=data['married'])

married,0,1
female,Unnamed: 1_level_1,Unnamed: 2_level_1
0,166,658
1,223,213


In [17]:
# Count rows for every combination of `female` (table rows) and `looks` (table columns).
pd.crosstab(index=data['female'], columns=data['looks'])

looks,1,2,3,4,5
female,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8,88,489,228,11
1,5,54,233,136,8


**Добавление столбцов (построение признаков)**

In [18]:
# New 0/1 column: 1 where the wage is strictly above the 75th percentile of all wages.
data['is_rich'] = data['wage'].gt(data['wage'].quantile(.75)).astype('int64')

In [19]:
# Check that the new `is_rich` column has been appended.
data.head(n=5)

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks,is_rich
0,5.73,30,0,1,0,1,1,1,14,4,0
1,4.28,28,0,1,0,1,1,0,12,3,0
2,7.96,35,0,1,0,1,0,0,10,4,1
3,11.57,38,0,1,0,0,1,1,16,3,1
4,11.42,27,0,1,0,0,1,0,16,3,1


In [20]:
# New column computed element-wise as a weighted sum of `wage` and `exper`.
data['rubbish'] = data['wage'].mul(.56).add(data['exper'].mul(0.32))

**map и apply**

In [21]:
def string_gender(female):
    """Return 'female' when the flag is truthy, otherwise 'male'."""
    if female:
        return 'female'
    return 'male'

In [22]:
# Lookup table from the 0/1 `union` flag to a readable label.
d = {
    1: 'union',
    0: 'non-union',
}

In [23]:
# Series.map with a dict replaces each value by its dictionary entry.
data['union'].map(d).head(n=5)

0    non-union
1    non-union
2    non-union
3    non-union
4    non-union
Name: union, dtype: object

In [24]:
# Series.apply calls the function once per element. Reuse the
# string_gender helper defined earlier in the notebook instead of
# repeating the same logic in an inline lambda.
data['female'].apply(string_gender).head()

0    female
1    female
2    female
3      male
4      male
Name: female, dtype: object