## Pandas Series

In [76]:
import pandas as pd
import numpy as np

In [4]:
ser = pd.Series([7,5,3,1,5,9])
ser

0    7
1    5
2    3
3    1
4    5
5    9
dtype: int64

In [6]:
type(ser)

pandas.core.series.Series

In [5]:
ser.index

RangeIndex(start=0, stop=6, step=1)

In [8]:
ser.size

6

In [9]:
ser.ndim

1

In [10]:
ser.values

array([7, 5, 3, 1, 5, 9], dtype=int64)

In [12]:
type(ser.values)

numpy.ndarray

In [13]:
ser.head(2)

0    7
1    5
dtype: int64

## Quick Look at the Data

In [14]:
import seaborn as sns

In [15]:
df = sns.load_dataset("titanic")

In [16]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [17]:
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [19]:
df.shape

(891, 15)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [22]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [25]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [33]:
df.isna().values.any()  # Are there any null values anywhere in the frame?

True

In [36]:
df.isna().sum()  # Number of null values in each column

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [37]:
df["sex"].value_counts()  # Frequency of each value in a categorical column

male      577
female    314
Name: sex, dtype: int64

In [42]:
# Drop the rows labelled 0, 1 and 2. With inplace=False (the default) a new
# frame is returned and df itself is left untouched; inplace=True would modify
# df directly, much like writing df = df.drop(...).
df.drop(index=[0, 1, 2], inplace=False)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [None]:
# df.reset_index() moves the current index into a regular column, so it can be used as a feature.

## Selection 

In [44]:
"age" in df

True

In [47]:
type(df["age"])

pandas.core.series.Series

In [48]:
type(df[["age"]])

pandas.core.frame.DataFrame

In [56]:
df.loc[:,~df.columns.str.contains("age")]

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## loc and iloc

In [63]:
df.loc[0:1,["age"]]

Unnamed: 0,age
0,22.0
1,38.0


In [66]:
df.iloc[0:1,[4]]

Unnamed: 0,sibsp
0,1


In [80]:
df.loc[df["age"].gt(50), ["age"]].count()  # 64 passengers are older than 50

age    64
dtype: int64

In [83]:
df.loc[(df["age"] > 50) & (df["sex"] == "male"), ["age"]].count()  # 47 male passengers are older than 50

age    47
dtype: int64

## Aggregation and Grouping

In [86]:
df.loc[:,"age"].mean()

29.69911764705882

In [89]:
df.groupby("sex")["age"].count()

sex
female    261
male      453
Name: age, dtype: int64

In [90]:
df.groupby("sex")["age"].mean()

sex
female    27.915709
male      30.726645
Name: age, dtype: float64

In [98]:
# Per-class mean age, count of non-null ages, and mean of the 0/1 survived flag.
# "class" is a categorical column, so observed=False is passed explicitly: it
# keeps the existing behaviour (every category gets a row) and avoids the
# FutureWarning newer pandas versions emit when the argument is omitted.
df.groupby("class", observed=False).agg({"age": ["mean", "count"],
                                         "survived": "mean"})

Unnamed: 0_level_0,age,age,survived
Unnamed: 0_level_1,mean,count,mean
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,38.233441,186,0.62963
Second,29.87763,173,0.472826
Third,25.14062,355,0.242363


In [100]:
df.groupby(["sex","embark_town"]).agg({'age' : 'mean',
                                      'survived':'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,age,survived
sex,embark_town,Unnamed: 2_level_1,Unnamed: 3_level_1
female,Cherbourg,28.344262,0.876712
female,Queenstown,24.291667,0.75
female,Southampton,27.771505,0.689655
male,Cherbourg,32.998841,0.305263
male,Queenstown,30.9375,0.073171
male,Southampton,30.29144,0.174603


In [103]:
df.groupby('sex').agg({'age':'mean','survived': 'mean'})

Unnamed: 0_level_0,age,survived
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,27.915709,0.742038
male,30.726645,0.188908


## Pivot Table

In [10]:
# Mean of "survived" (pivot_table's default aggregation) for each sex,
# broken down by embarkation town and passenger class.
df.pivot_table(values="survived", index="sex", columns=["embark_town", "class"])

embark_town,Cherbourg,Cherbourg,Cherbourg,Queenstown,Queenstown,Queenstown,Southampton,Southampton,Southampton
class,First,Second,Third,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
female,0.976744,1.0,0.652174,1.0,1.0,0.727273,0.958333,0.910448,0.375
male,0.404762,0.2,0.232558,0.0,0.0,0.076923,0.35443,0.154639,0.128302


In [14]:
# Numerical -> categorical: bucket age into five right-closed intervals.
# Passengers whose age is missing keep NaN in the new column.
df["new_age"] = pd.cut(df["age"], bins=[0, 10, 18, 25, 40, 90])
df["new_age"]

0      (18.0, 25.0]
1      (25.0, 40.0]
2      (25.0, 40.0]
3      (25.0, 40.0]
4      (25.0, 40.0]
           ...     
886    (25.0, 40.0]
887    (18.0, 25.0]
888             NaN
889    (25.0, 40.0]
890    (25.0, 40.0]
Name: new_age, Length: 891, dtype: category
Categories (5, interval[int64, right]): [(0, 10] < (10, 18] < (18, 25] < (25, 40] < (40, 90]]

In [17]:
# Mean of "survived" for each sex across the age buckets created above.
df.pivot_table(values="survived", index="sex", columns="new_age")

new_age,"(0, 10]","(10, 18]","(18, 25]","(25, 40]","(40, 90]"
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0.612903,0.72973,0.759259,0.802198,0.770833
male,0.575758,0.131579,0.12037,0.22093,0.176471


## Apply and Lambda

In [20]:
df["fare_2"] = df["fare"]**2

In [23]:
# Select every column whose name contains "fare"; apply() then hands each
# selected column to the lambda, which scales it down by a factor of 10.
df.loc[:, df.columns.str.contains("fare")].apply(lambda col: col / 10)

Unnamed: 0,fare,fare_2
0,0.72500,5.256250
1,7.12833,508.130886
2,0.79250,6.280563
3,5.31000,281.961000
4,0.80500,6.480250
...,...,...
886,1.30000,16.900000
887,3.00000,90.000000
888,2.34500,54.990250
889,3.00000,90.000000


In [35]:
df["sex_2"] = df["sex"]

In [27]:
df.loc[:,df.columns.str.startswith('sex_')].apply(lambda x: x + "_2")

Unnamed: 0,sex_2
0,male_2
1,female_2
2,female_2
3,female_2
4,male_2
...,...
886,male_2
887,female_2
888,female_2
889,male_2


In [36]:
df.drop(columns="sex_2", inplace=True)

## Join 

In [40]:
# Build a small 5x3 demo frame of integers in [0, 100).
# A seeded generator is used so the same values come back on every run; the
# exact numbers therefore differ from output produced by an unseeded run.
# NOTE: this rebinds `df`, so the Titanic frame is no longer reachable by
# that name from this cell onward.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.integers(0, 100, (5, 3)), columns=["var1", "var2", "var3"])
df

Unnamed: 0,var1,var2,var3
0,35,61,18
1,18,53,49
2,80,98,78
3,96,20,62
4,6,53,76


In [44]:
df2 = df * 0.01
df2

Unnamed: 0,var1,var2,var3
0,0.35,0.61,0.18
1,0.18,0.53,0.49
2,0.8,0.98,0.78
3,0.96,0.2,0.62
4,0.06,0.53,0.76


In [49]:
# Stack df2's rows underneath df's. ignore_index=True discards both original
# indexes and renumbers the combined rows from 0.
df3 = pd.concat(objs=[df, df2], axis=0, ignore_index=True)
df3

Unnamed: 0,var1,var2,var3
0,35.0,61.0,18.0
1,18.0,53.0,49.0
2,80.0,98.0,78.0
3,96.0,20.0,62.0
4,6.0,53.0,76.0
5,0.35,0.61,0.18
6,0.18,0.53,0.49
7,0.8,0.98,0.78
8,0.96,0.2,0.62
9,0.06,0.53,0.76
