# Ex03 Selects and aggregations

In [22]:
import pandas as pd

## load json

In [23]:
# Read the records-oriented JSON file and key every row by its car number.
data = (
    pd.read_json('data/auto.json', orient='records')
      .set_index('CarNumber')
)
data.head()

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.0,Ford,Focus
E432XX77RUS,1,6500.0,Toyota,Camry
7184TT36RUS,1,2100.0,Ford,Focus
X582HE161RUS,2,2000.0,Ford,Focus
92918M178RUS,1,5700.0,Ford,Focus


## selects

In [24]:
# Rows whose fine is strictly greater than 2100.
data.loc[data['Fines'] > 2100]

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus
E432XX77RUS,1,6500.000000,Toyota,Camry
92918M178RUS,1,5700.000000,Ford,Focus
H234YH197RUS,2,6000.000000,Ford,Focus
E40577152RUS,1,8594.586466,Ford,Focus
...,...,...,...,...
O718MM163RUS,2,8594.586466,Ford,Focus
7065C8197RUS,2,11400.000000,Volkswagen,Passat
O22097197RUS,1,24300.000000,Ford,Focus
M0309X197RUS,1,22300.000000,Ford,Focus


In [25]:
# Rows with a fine above 2100 AND a Refund value of exactly 2.
high_fine = data['Fines'] > 2100
refund_is_two = data['Refund'] == 2
data.loc[high_fine & refund_is_two]

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus
H234YH197RUS,2,6000.000000,Ford,Focus
707987163RUS,2,2200.000000,Ford,Focus
K330T8197RUS,2,8200.000000,Skoda,Octavia
M592CH197RUS,2,8594.586466,Skoda,Octavia
...,...,...,...,...
O136HO197RUS,2,7800.000000,Toyota,Corolla
O68897197RUS,2,12300.000000,Ford,Focus
O718MM163RUS,2,8594.586466,Ford,Focus
7065C8197RUS,2,11400.000000,Volkswagen,Passat


In [26]:
# Rows whose Model is one of the listed names.
data.loc[data['Model'].isin(['Focus', 'Corolla'])]

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus
7184TT36RUS,1,2100.000000,Ford,Focus
X582HE161RUS,2,2000.000000,Ford,Focus
92918M178RUS,1,5700.000000,Ford,Focus
H234YH197RUS,2,6000.000000,Ford,Focus
...,...,...,...,...
Y163O8161RUS,2,1600.000000,Ford,Focus
M0309X197RUS,1,22300.000000,Ford,Focus
O673E8197RUS,2,600.000000,Ford,Focus
8610T8154RUS,1,2000.000000,Ford,Focus


In [27]:
# Car numbers to look up; the index is not unique, so one number can
# match several rows.
numbers = [
    'Y7689C197RUS',
    '92928M178RUS',
    '7788KT197RUS',
    'H115YO163RUS',
    'X758HY197RUS',
]
data.loc[data.index.isin(numbers)]

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92928M178RUS,1,8594.586466,Ford,Focus
H115YO163RUS,1,2200.0,Ford,Focus
7788KT197RUS,2,12000.0,Ford,Focus
X758HY197RUS,2,24200.0,Ford,Focus
X758HY197RUS,2,72600.0,Ford,Focus
Y7689C197RUS,1,27000.0,Ford,Focus
92928M178RUS,1,600.0,Ford,Focus
H115YO163RUS,1,8594.586466,Ford,Focus
H115YO163RUS,2,1100.0,Ford,Focus
7788KT197RUS,2,8594.586466,Ford,Focus


## aggregations by make and model

In [28]:
# Average fine per make.
data.groupby('Make')['Fines'].mean()

Make
Audi           4200.000000
BMW            6031.528822
Ford           8270.665045
Skoda         10586.710526
Toyota         9478.934100
Volkswagen     9929.970462
Volvo          8500.000000
Name: Fines, dtype: float64

In [29]:
# Average fine per (make, model) pair; as_index=False keeps the grouping
# keys as regular columns in the result.
data.groupby(['Make', 'Model'], as_index=False)['Fines'].mean()

Unnamed: 0,Make,Model,Fines
0,Audi,,4200.0
1,BMW,,6031.528822
2,Ford,Focus,8184.445897
3,Ford,Mondeo,16533.333333
4,Skoda,Octavia,10586.710526
5,Toyota,Camry,8236.823308
6,Toyota,Corolla,10583.032581
7,Volkswagen,,5533.333333
8,Volkswagen,Golf,15744.729323
9,Volkswagen,Jetta,10350.0


In [30]:
# Conclusion: several make/model groups contain too few samples for their
# mean fine to be a reliable estimate.
data.groupby(['Make', 'Model'], as_index=False)['Fines'].count()

Unnamed: 0,Make,Model,Fines
0,Audi,,1
1,BMW,,3
2,Ford,Focus,575
3,Ford,Mondeo,6
4,Skoda,Octavia,48
5,Toyota,Camry,16
6,Toyota,Corolla,18
7,Volkswagen,,3
8,Volkswagen,Golf,20
9,Volkswagen,Jetta,6


In [31]:
# Smallest and largest fine per (make, model) pair.
(
    data
    .groupby(['Make', 'Model'], as_index=False)
    .agg({'Fines': ['min', 'max']})
)

Unnamed: 0_level_0,Make,Model,Fines,Fines
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max
0,Audi,,4200.0,4200.0
1,BMW,,3000.0,8594.586466
2,Ford,Focus,100.0,180000.0
3,Ford,Mondeo,1100.0,46200.0
4,Skoda,Octavia,300.0,145000.0
5,Toyota,Camry,500.0,22400.0
6,Toyota,Corolla,900.0,34300.0
7,Volkswagen,,1300.0,7900.0
8,Volkswagen,Golf,200.0,168000.0
9,Volkswagen,Jetta,500.0,46000.0


In [32]:
# Standard deviation of fines per (make, model) pair.
data.groupby(['Make', 'Model'], as_index=False)['Fines'].std()

Unnamed: 0,Make,Model,Fines
0,Audi,,
1,BMW,,2826.561226
2,Ford,Focus,15041.269437
3,Ford,Mondeo,18987.329108
4,Skoda,Octavia,24339.742174
5,Toyota,Camry,6410.250654
6,Toyota,Corolla,9629.325617
7,Volkswagen,,3674.688195
8,Volkswagen,Golf,36950.83995
9,Volkswagen,Jetta,17743.026799


## aggregations by car number

In [33]:
# Number of fine records per car number, largest first.
(
    data
    .groupby('CarNumber')['Fines']
    .count()
    .sort_values(ascending=False)
)

CarNumber
Y7689C197RUS    4
7788KT197RUS    4
92928M178RUS    4
Y7129Y50RUS     3
X758HY197RUS    3
               ..
Y967O8197RUS    1
Y965O8197RUS    1
Y965EE197RUS    1
Y964O8197RUS    1
Y964EE197RUS    1
Name: Fines, Length: 531, dtype: int64

In [34]:
# Car number with the most fine records.
# NOTE: several cars tie at the maximum count, so the one reported here is
# whichever of them the descending sort happens to place first.
most_fines_number = (
    data
    .groupby('CarNumber')['Fines']
    .count()
    .sort_values(ascending=False)
    .idxmax()
)
most_fines_number

'Y7689C197RUS'

In [35]:
# Every record belonging to the car with the most fines.
data.loc[data.index == most_fines_number]

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y7689C197RUS,1,27000.0,Ford,Focus
Y7689C197RUS,2,9000.0,Ford,Focus
Y7689C197RUS,2,45000.0,Ford,Focus
Y7689C197RUS,1,36000.0,Ford,Focus


In [36]:
# Total amount of fines per car number, largest first.
(
    data
    .groupby('CarNumber')['Fines']
    .sum()
    .sort_values(ascending=False)
)

CarNumber
X758HY197RUS    242000.0
9020YC197RUS    217500.0
M0279X197RUS    216000.0
Y352O8197RUS    207200.0
Y778EE197RUS    192000.0
                  ...   
Y166O8161RUS       100.0
K326T8197RUS       100.0
Y195O8161RUS       100.0
C58078163RUS       100.0
705787163RUS       100.0
Name: Fines, Length: 531, dtype: float64

In [37]:
# Car number with the largest total amount of fines.
# idxmax() scans for the maximum itself, so sorting the series first is
# unnecessary work and has been dropped.
most_sum_fines_number = data.groupby('CarNumber')['Fines'].sum().idxmax()
most_sum_fines_number

'X758HY197RUS'

In [38]:
# Every record belonging to the car with the largest total of fines.
data.loc[data.index == most_sum_fines_number]

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
X758HY197RUS,2,24200.0,Ford,Focus
X758HY197RUS,2,72600.0,Ford,Focus
X758HY197RUS,2,145200.0,Ford,Focus


In [39]:
# Number of distinct Model values recorded per car number, largest first.
(
    data
    .groupby('CarNumber')['Model']
    .nunique()
    .sort_values(ascending=False)
)

CarNumber
E316EH197RUS    2
O39997197RUS    1
O421OT161RUS    1
O481OH77RUS     1
O49797197RUS    1
               ..
C420X938RUS     1
C422X938RUS     1
C476M7161RUS    1
C477M7161RUS    1
C313MM99RUS     1
Name: Model, Length: 531, dtype: int64

In [40]:
# This car number really does map to two distinct Model values, but one of
# them is blank -- presumably the model name is simply missing from the
# first record.
data.loc[data.index == 'E316EH197RUS']

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E316EH197RUS,1,1300.0,Volkswagen,
E316EH197RUS,1,1300.0,Volkswagen,Touareg
