In [8]:
import pandas as pd
import numpy as np

In [2]:
# Load the insurance dataset into a DataFrame.
df = pd.read_csv("/data/insurance.csv")
# Preview only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
5,31,female,25.740,0,no,southeast,3756.62160
6,46,female,33.440,1,no,southeast,8240.58960
7,37,female,27.740,3,no,northwest,7281.50560
8,37,male,29.830,2,no,northeast,6406.41070
9,60,female,25.840,0,no,northwest,28923.13692


In [3]:
# Show the Python type of the object returned by pd.read_csv.
type(df)

pandas.core.frame.DataFrame

In [4]:
# List the column labels of the frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [7]:
# A single column pulled out of a DataFrame is a Series.
type(df["age"])

pandas.core.series.Series

In [10]:
# Cast the age column to 32-bit floats and confirm the resulting dtype.
df["age"].astype("float32").dtype

dtype('float32')

# What would we like to do?

1. Columns
2. Column types
3. How many records?
4. Do we have nulls in the dataset?
5. Which rows or which columns have nulls?
6. Filtering
7. Sorting
8. Subsetting - by index, by name
9. Grouping
10. Merging or joining two dataframes
11. Enrichment - creating derived columns



In [11]:
# Checklist item 1: the column labels.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [13]:
# Checklist item 2: the dtype of each column.
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [14]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB


In [15]:
# Load the mobile sales dataset. NOTE: this rebinds `df`, so the insurance
# frame loaded earlier is no longer reachable under that name.
df = pd.read_csv("/data/mobile-sales-data.csv")
# The non-null counts reported by info() reveal which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      10 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [16]:
# Boolean frame of the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [17]:
# Count missing values per column by summing the boolean mask down the rows.
df.isnull().sum(axis="index")

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [18]:
# Count missing values per row by summing the boolean mask across the columns.
df.isnull().sum(axis="columns")

0    0
1    0
2    0
3    0
4    1
5    0
6    1
7    0
8    0
9    0
dtype: int64

In [19]:
# Keep only the rows that contain at least one missing value.
df[df.isnull().any(axis=1)]

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes
6,Spain,,52000.0,No


In [21]:
# Keep only the rows whose Age is missing.
df.loc[df["Age"].isnull()]

Unnamed: 0,Country,Age,Salary,Purchased
6,Spain,,52000.0,No


In [25]:
# Flag rows where Age or Salary is missing. The vectorized null check replaces
# a row-wise apply with np.isnan and yields the same boolean Series.
is_null = df[["Age", "Salary"]].isnull().any(axis=1)
is_null

0    False
1    False
2    False
3    False
4     True
5    False
6     True
7    False
8    False
9    False
dtype: bool

In [24]:
df[is_null]

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes
6,Spain,,52000.0,No


In [27]:
# Fill missing numeric values with their column mean. numeric_only=True is
# required because the frame also holds object columns (Country, Purchased);
# without it, recent pandas versions raise TypeError when averaging strings.
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [29]:
# Fill each column with its own constant; fillna returns a new frame and
# leaves df untouched.
df1 = df.fillna(
    value={"Age": 20, "Salary": 10000},
)
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,10000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,20.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [31]:
# Apply the per-column fill values and rebind df to the result. Rebinding
# (rather than inplace=True) avoids mutating the frame object behind cells
# that already displayed it.
df = df.fillna({"Age": 20, "Salary": 10000})
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,10000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,20.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [32]:
# Filter with a boolean mask: rows whose Country is France.
df.loc[df["Country"] == "France"]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
9,France,37.0,67000.0,Yes


In [33]:
# The same France filter, written as a query-string expression.
df.query("Country == 'France'")

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
9,France,37.0,67000.0,Yes


In [45]:
# Sort by Country descending, then by Salary ascending within each country.
df.sort_values(
    by=["Country", "Salary"],
    ascending=[False, True],
)

Unnamed: 0,Country,Age,Salary,Purchased
1,Spain,27.0,48000.0,Yes
6,Spain,20.0,52000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,10000.0,Yes
2,Germany,30.0,54000.0,No
8,Germany,50.0,83000.0,No
5,France,35.0,58000.0,Yes
9,France,37.0,67000.0,Yes
0,France,44.0,72000.0,No
7,France,48.0,79000.0,Yes


In [49]:
# Positional slice: rows 3 to 5 (end exclusive), all columns.
df.iloc[3:6]

Unnamed: 0,Country,Age,Salary,Purchased
3,Spain,38.0,61000.0,No
4,Germany,40.0,10000.0,Yes
5,France,35.0,58000.0,Yes


In [51]:
# Positional slice: rows 3 to 5 and the first three columns.
df.iloc[3:6, :3]

Unnamed: 0,Country,Age,Salary
3,Spain,38.0,61000.0
4,Germany,40.0,10000.0
5,France,35.0,58000.0


In [52]:
# Select a subset of columns by name.
df.loc[:, ["Country", "Age"]]

Unnamed: 0,Country,Age
0,France,44.0
1,Spain,27.0
2,Germany,30.0
3,Spain,38.0
4,Germany,40.0
5,France,35.0
6,Spain,20.0
7,France,48.0
8,Germany,50.0
9,France,37.0


In [56]:
# Mean Age within each Country.
df.groupby("Country")["Age"].mean()

Country
France     41.000000
Germany    40.000000
Spain      28.333333
Name: Age, dtype: float64

In [59]:
# Group size and mean Age per Country, with Country restored as a regular
# column. The mean is requested by name: passing the np.mean callable to agg
# is deprecated in recent pandas. `len` is kept so the column label stays "len".
df.groupby("Country").Age.agg([len, "mean"]).reset_index()

Unnamed: 0,Country,len,mean
0,France,4.0,41.0
1,Germany,3.0,40.0
2,Spain,3.0,28.333333


In [61]:
# Reload the insurance dataset, rebinding `df` (the mobile-sales frame is no
# longer reachable under this name).
df = pd.read_csv("/data/insurance.csv")
# Preview the first rows.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [62]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [64]:
# Derived column: True for policyholders aged 60 or older. The comparison
# already yields a boolean Series, so wrapping it in np.where(..., True, False)
# is unnecessary.
df["senior"] = df.age >= 60
# Fixed seed so the sampled preview is the same on every run.
df.sample(10, random_state=42)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,senior
895,61,female,44.0,0,no,southwest,13063.883,True
857,25,male,24.13,0,yes,northwest,15817.9857,False
374,20,male,33.33,0,no,southeast,1391.5287,False
906,27,male,32.585,3,no,northeast,4846.92015,False
256,56,male,33.63,0,yes,northwest,43921.1837,False
1078,28,male,31.68,0,yes,southeast,34672.1472,False
429,27,female,30.4,3,no,northwest,18804.7524,False
1035,54,female,23.0,3,no,southwest,12094.478,False
451,30,male,24.13,1,no,northwest,4032.2407,False
1149,42,male,34.1,0,no,southwest,5979.731,False


In [None]:
# Compute the senior flag element-wise with Series.apply (shown for comparison;
# a vectorized comparison on the column is faster on large frames).
df["senior"] = df.age.apply(lambda age: age >= 60)
df.sample(10)