In [3]:
import numpy as np
import pandas as pd

In [2]:
# Demo frame 1: columns A-D hold '<column><row>' strings for row labels 0-3
df1 = pd.DataFrame(
    {col: [col + str(row) for row in range(0, 4)] for col in 'ABCD'},
    index=list(range(0, 4)),
)

In [3]:
# Demo frame 2: same A-D layout as df1, continuing with row labels 4-7
df2 = pd.DataFrame(
    {col: [col + str(row) for row in range(4, 8)] for col in 'ABCD'},
    index=list(range(4, 8)),
)

In [4]:
# Demo frame 3: same A-D layout again, with row labels 8-11
df3 = pd.DataFrame(
    {col: [col + str(row) for row in range(8, 12)] for col in 'ABCD'},
    index=list(range(8, 12)),
)

In [5]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [8]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [12]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


## Concatenation

Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use **pd.concat** and pass in a list of DataFrames to concatenate together:

In [10]:
# Stack the three frames on top of each other (default axis=0); the shared
# column names A-D line up and each frame keeps its own row labels.
pd.concat([df1,df2,df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [6]:
# axis=1 places the frames side by side and aligns them on row labels.
# df1, df2 and df3 share no labels, so every frame contributes NaN for the
# rows that belong to the other two.
pd.concat([df1,df2,df3],axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


# Operations

There are many pandas operations that are really useful but don't fall into any distinct category. Let's go through them in this lecture:

In [52]:
# Small mixed-type frame (two integer columns, one string column) used by the
# examples that follow
df = pd.DataFrame(dict(
    col1=[1, 2, 3, 4],
    col2=[444, 555, 666, 444],
    col3=['abc', 'def', 'ghi', 'xyz'],
))
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


### Info on Unique Values

In [53]:
# Distinct values of col2, returned as a NumPy array
df['col2'].unique()

array([444, 555, 666])

In [54]:
# Number of distinct values in col2
df['col2'].nunique()

3

In [55]:
# How many times each distinct col2 value occurs
df['col2'].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

### Selecting Data

In [56]:
# Filter on two columns at once: each comparison yields a boolean Series and
# `&` combines them element-wise (the parentheses are required because `&`
# binds tighter than the comparisons).
newdf = df.loc[(df['col2'] == 444) & (df['col1'] > 2)]

In [57]:
newdf

Unnamed: 0,col1,col2,col3
3,4,444,xyz


### Applying Functions

In [58]:
def times2(x):
    """Return ``x * 2``.

    Works for anything that supports multiplication by an integer: numbers
    are doubled, sequences such as strings and lists are repeated twice.
    """
    doubled = x * 2
    return doubled

In [59]:
# Call times2 on every element of col1 and collect the results in a new Series
df['col1'].apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [60]:
# apply accepts built-ins as well: len gives the length of each string in col3
df['col3'].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [61]:
df['col1'].sum()

10

**Permanently Removing a Column**

In [62]:
# Removes col1 from df in place. The cell is not re-runnable: a second run
# raises KeyError because the column no longer exists.
del df['col1']

In [63]:
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


**Get column and index names:**

In [64]:
df.columns

Index(['col2', 'col3'], dtype='object')

In [65]:
df.index

RangeIndex(start=0, stop=4, step=1)

**Sorting and Ordering a DataFrame:**

In [66]:
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


In [67]:
# Returns a new frame ordered by col2; df itself is untouched (inplace=False by default).
# Row labels travel with their rows, hence the 0, 3, 1, 2 order in the output.
df.sort_values(by='col2')

Unnamed: 0,col2,col3
0,444,abc
3,444,xyz
1,555,def
2,666,ghi


**Find Null Values or Check for Null Values**

In [68]:
df.isnull()

Unnamed: 0,col2,col3
0,False,False
1,False,False
2,False,False
3,False,False


In [69]:
# Drop rows with NaN Values
df.dropna()

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


In [9]:
# Frame with missing values: A has one NaN, B has two, C has none
df = pd.DataFrame(dict(
    A=[1, 2, np.nan],
    B=[5, np.nan, np.nan],
    C=[1, 2, 3],
))

In [10]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [12]:
# Drop every row that contains at least one NaN (returns a new frame)
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [13]:
# axis=1 drops columns instead of rows: only C has no NaN, so only C survives
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [14]:
# Keep only the rows that have at least 2 non-NaN values
# (row 2 has a single non-NaN value and is dropped)
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [15]:
# Return a new frame in which every NaN is replaced by the given value
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [17]:
# Fill the gaps in A with the column mean; mean() skips NaN,
# so the fill value is (1 + 2) / 2 = 1.5
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

**Filling in NaN values with something else:**

In [72]:
# Same three columns as before, now with one NaN in col1 and one in col2
df = pd.DataFrame(dict(
    col1=[1, 2, 3, np.nan],
    col2=[np.nan, 555, 666, 444],
    col3=['abc', 'def', 'ghi', 'xyz'],
))
df.head()

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [75]:
df.fillna('FILL')

Unnamed: 0,col1,col2,col3
0,1,FILL,abc
1,2,555,def
2,3,666,ghi
3,FILL,444,xyz


In [89]:
# Column-oriented records: three label columns (A, B, C) and one numeric column (D)
data = dict(
    A=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    B=['one', 'one', 'two', 'two', 'one', 'one'],
    C=['x', 'y', 'x', 'y', 'x', 'y'],
    D=[1, 3, 2, 5, 4, 1],
)

df = pd.DataFrame(data)

In [90]:
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


# Groupby

The groupby method allows you to group rows of data together and call aggregate functions on each group.

In [56]:
import pandas as pd
# Raw sales records, one list per column; the DataFrame itself is built in the next cell
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)

In [57]:
df = pd.DataFrame(data)

In [58]:
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


You can save the object returned by `groupby` as a new variable:

In [64]:
# Group the rows of df by their Company value. No aggregation happens here:
# the result is a DataFrameGroupBy object (see the repr below) on which
# aggregate methods are called in the following cells.
by_comp = df.groupby("Company")
by_comp

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0000025AC205F9B0>

And then call aggregate methods off the object:

In [61]:
# Average sales per company. Only the numeric Sales column is selected:
# older pandas silently dropped the text column Person, but pandas >= 2.0
# raises TypeError when asked to average it.
by_comp[['Sales']].mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [65]:
# Total sales per company. Sales is selected explicitly because pandas >= 2.0
# no longer drops the text column Person and would concatenate its strings
# into an extra column.
by_comp[['Sales']].sum()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,593
GOOG,320
MSFT,464


In [62]:
# Same result as the by_comp mean, written as a single chained expression.
# The numeric Sales column is selected before aggregating because pandas >= 2.0
# raises TypeError when asked to average the text column Person.
df.groupby('Company')[['Sales']].mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


More examples of aggregate methods:

In [38]:
# Sample standard deviation of sales per company. Restricted to the numeric
# Sales column: pandas >= 2.0 raises an error for the text column Person
# instead of silently dropping it.
by_comp[['Sales']].std()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,75.660426
GOOG,56.568542
MSFT,152.735065


In [39]:
# Minimum of each column within each company. Columns are reduced
# independently (Person alphabetically, Sales numerically), so a result row
# need not describe one real record: MSFT shows Amy next to 124, which is
# Vanessa's sale.
by_comp.min()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Carl,243
GOOG,Charlie,120
MSFT,Amy,124


In [40]:
# Maximum of each column within each company. As with min(), every column is
# reduced on its own: MSFT shows Vanessa next to 340, which is Amy's sale.
by_comp.max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Sarah,350
GOOG,Sam,200
MSFT,Vanessa,340


In [41]:
# Number of non-null entries per column in each company group
by_comp.count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [68]:
by_comp.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [69]:
by_comp.describe().transpose()

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


In [44]:
by_comp.describe().transpose()['GOOG']

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sales,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0


In [None]:
# read_table splits fields on tabs by default, which is what a .tsv file needs.
# 'test.tsv' is resolved relative to the notebook's working directory.
draft2 = pd.read_table('test.tsv')  # Read a tsv into a DataFrame

draft2.head(6)                           # Check the first 6 rows

In [None]:
# Read one worksheet of an Excel workbook into a DataFrame.
# The keyword is `sheet_name`: the old spelling `sheetname` was deprecated in
# pandas 0.21 and later removed, so it fails on current pandas.
draft3 = pd.read_excel('test.xlsx',           # Path to Excel file
                       sheet_name='sheet1')   # Name of sheet to read from

draft3.head(6)                            # Check the first 6 rows

In [2]:
# Go to http://www.basketball-reference.com/leagues/NBA_2015_totals.html
# click the CSV button and then copy some data to the clipboard
#
# NOTE: this cell reads whatever is on the system clipboard at the moment it
# runs, so it cannot be reproduced by "Restart & Run All" without repeating
# the manual copy step above.

BB_reference_data = pd.read_clipboard(sep=",")  # Parse the clipboard text as comma-separated data

BB_reference_data.iloc[:, 0:10].head(5)   # Check 5 rows (10 columns only)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA
0,1,Quincy Acy\acyqu01,PF,24,NYK,68,22,1287,152,331
1,2,Jordan Adams\adamsjo01,SG,20,MEM,30,0,248,35,86
2,3,Steven Adams\adamsst01,C,21,OKC,70,67,1771,217,399
3,4,Jeff Adrien\adrieje01,PF,28,MIN,17,0,215,19,44
4,5,Arron Afflalo\afflaar01,SG,29,TOT,78,72,2502,375,884


In [75]:
url = "http://www.basketball-reference.com/leagues/NBA_2015_totals.html"

# read_html returns a list with one DataFrame per table found at the url;
# keep only the first one
BB_data = pd.read_html(url)[0]

In [76]:
BB_data.iloc[:,:10].head(5)   # Check 5 rows (10 columns only)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA
0,1,Quincy Acy,PF,24,NYK,68,22,1287,152,331
1,2,Jordan Adams,SG,20,MEM,30,0,248,35,86
2,3,Steven Adams,C,21,OKC,70,67,1771,217,399
3,4,Jeff Adrien,PF,28,MIN,17,0,215,19,44
4,5,Arron Afflalo,SG,29,TOT,78,72,2502,375,884


In [133]:
# Write the DataFrame to a file in CSV format.
# index=False leaves out the row index, which to_csv would otherwise write as the first column.
BB_data.to_csv("bb_data.csv",index=False)

In [77]:
BB_data

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Quincy Acy,PF,24,NYK,68,22,1287,152,331,...,.784,79,222,301,68,27,22,60,147,398
1,2,Jordan Adams,SG,20,MEM,30,0,248,35,86,...,.609,9,19,28,16,16,7,14,24,94
2,3,Steven Adams,C,21,OKC,70,67,1771,217,399,...,.502,199,324,523,66,38,86,99,222,537
3,4,Jeff Adrien,PF,28,MIN,17,0,215,19,44,...,.579,23,54,77,15,4,9,9,30,60
4,5,Arron Afflalo,SG,29,TOT,78,72,2502,375,884,...,.843,27,220,247,129,41,7,116,167,1035
5,5,Arron Afflalo,SG,29,DEN,53,53,1750,281,657,...,.841,21,159,180,101,32,5,83,108,771
6,5,Arron Afflalo,SG,29,POR,25,19,752,94,227,...,.851,6,61,67,28,9,2,33,59,264
7,6,Alexis Ajinca,C,26,NOP,68,8,957,181,329,...,.818,104,211,315,47,21,51,69,151,443
8,7,Furkan Aldemir,PF,23,PHI,41,9,540,40,78,...,.481,78,98,176,28,17,16,17,96,93
9,8,Cole Aldrich,C,26,NYK,61,16,976,144,301,...,.781,101,237,338,75,37,65,59,122,338


### Data analysis 

In [134]:
# NOTE(review): absolute Windows path that only resolves on the original
# author's machine — change it to wherever your copy of titanic_train.csv
# lives before running.
titanic_train = pd.read_csv('D:\\Data\\titanic_train.csv')

In [137]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
3,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
4,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S


In [138]:
titanic_train.shape 

(891, 12)

In [139]:
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [140]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [164]:
# Names of the columns stored with the generic "object" dtype (text columns
# in this dataset), obtained by masking the column index with a dtype test
is_object_column = (titanic_train.dtypes == "object").values
categorical = titanic_train.columns[is_object_column]

# Header line followed by a 45-character dashed rule, then the column names
print("\n Below are Index of Categorical Columns\n" + '-' * 45)
print(categorical)


 Below are Index of Categorical Columns
---------------------------------------------
Index(['Name', 'Sex', 'Cabin', 'Embarked'], dtype='object')


In [166]:
###Description of Categorical columns
titanic_train[categorical].describe()

Unnamed: 0,Name,Sex,Cabin,Embarked
count,891,891,204,889
unique,891,2,147,3
top,"Lam, Mr. Ali",male,C23 C25 C27,S
freq,1,577,4,644


In [143]:
sorted(titanic_train["Name"])[0:15]   # Check the first 15 sorted names

['Abbing, Mr. Anthony',
 'Abbott, Mr. Rossmore Edward',
 'Abbott, Mrs. Stanton (Rosa Hunt)',
 'Abelson, Mr. Samuel',
 'Abelson, Mrs. Samuel (Hannah Wizosky)',
 'Adahl, Mr. Mauritz Nils Martin',
 'Adams, Mr. John',
 'Ahlin, Mrs. Johan (Johanna Persdotter Larsson)',
 'Aks, Mrs. Sam (Leah Rosen)',
 'Albimona, Mr. Nassef Cassem',
 'Alexander, Mr. William',
 'Alhomaki, Mr. Ilmari Rudolf',
 'Ali, Mr. Ahmed',
 'Ali, Mr. William',
 'Allen, Miss. Elisabeth Walton']

In [167]:
###Describe on a single column
titanic_train["Name"].describe()

count              891
unique             891
top       Lam, Mr. Ali
freq                 1
Name: Name, dtype: object

In [145]:
titanic_train["Ticket"][0:15]       # Check the first 15 tickets

0     PC 17599
1       113803
2        17463
3       113783
4       113788
5        19950
6     PC 17601
7     PC 17569
8     PC 17604
9       113789
10    PC 17572
11      113509
12       19947
13      113572
14       36973
Name: Ticket, dtype: object

In [146]:
titanic_train["Ticket"].describe()

count      891
unique     681
top       1601
freq         7
Name: Ticket, dtype: object

In [147]:
# Remove the Ticket column from titanic_train in place (describe() above
# reports 681 distinct values in 891 rows). Not re-runnable: a second run
# raises KeyError because the column is already gone.
del titanic_train["Ticket"]

In [148]:
titanic_train["Cabin"][0:15]       # Check the first 15 cabin values

0             C85
1            C123
2             E46
3            C103
4              A6
5     C23 C25 C27
6             NaN
7             B78
8             NaN
9             NaN
10            D33
11            B30
12            C52
13            B28
14            C83
Name: Cabin, dtype: object

In [149]:
titanic_train["Cabin"].describe()  # Check number of unique cabins

count             204
unique            147
top       C23 C25 C27
freq                4
Name: Cabin, dtype: object

In [150]:
# Wrap the 0/1 Survived flags in a Categorical and relabel its categories.
# Labels are assigned positionally to the sorted categories:
# first category -> "Died", second -> "Survived".
new_survived = pd.Categorical(titanic_train["Survived"]).rename_categories(
    ["Died", "Survived"]
)

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,549,0.616162
Survived,342,0.383838


In [154]:
titanic_train["new_survived"] = new_survived

In [176]:
# Ordered Categorical for passenger class. Labels are assigned positionally
# to the sorted categories: first -> "Class1", second -> "Class2", third -> "Class3".
new_Pclass = pd.Categorical(
    titanic_train["Pclass"], ordered=True
).rename_categories(["Class1", "Class2", "Class3"])

new_Pclass.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Class1,216,0.242424
Class2,184,0.20651
Class3,491,0.551066


In [111]:
titanic_train["Pclass"] = new_Pclass

In [112]:
titanic_train["Cabin"].unique()   # Check unique cabins

array(['C85', 'C123', 'E46', 'C103', 'A6', 'C23 C25 C27', nan, 'B78',
       'D33', 'B30', 'C52', 'B28', 'C83', 'E31', 'A5', 'D10 D12', 'D26',
       'C110', 'B58 B60', 'D47', 'B86', 'C2', 'E33', 'B19', 'A7', 'C49',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66',
       'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128',
       'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E44', 'A34', 'C104',
       'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30',
       'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22',
       'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20',
       'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126',
       'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'C62 C64', 'E24', 'C90',
       'C45', 'E8', 'B101', 'D45', 'C46', 'D30', 'D11', 'B3', 'D6',
       'B8

In [None]:
### Let's create a new Cabin column holding the cabin class, i.e. the first letter of Cabin

In [168]:
# Stringify first so that missing cabins (NaN) become the text 'nan' and can
# be indexed like every other value
char_cabin = titanic_train["Cabin"].astype(str)

# First character of every cabin string, wrapped in a Categorical.
# Rows with a missing cabin therefore land in the category 'n' (from 'nan').
new_Cabin = pd.Categorical(np.array([text[0] for text in char_cabin]))

new_Cabin.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
A,15,0.016835
B,47,0.05275
C,59,0.066218
D,33,0.037037
E,32,0.035915
F,13,0.01459
G,4,0.004489
T,1,0.001122
n,687,0.771044


In [114]:
titanic_train["Cabin"] = new_Cabin

In [115]:
#Check how many null values are there for Age column
titanic_train["Age"].isnull().sum()

177

In [116]:
# Positions of the rows that paid the highest fare. Series.max() is used
# instead of the built-in max() because it skips NaN values.
# np.where returns a 1-tuple holding an array of *positional* row numbers.
index = np.where(titanic_train["Fare"] == titanic_train["Fare"].max())

# Positions must be looked up with .iloc; .loc would interpret the numbers as
# index labels, which is only right while the frame has its default 0..n-1 index.
titanic_train.iloc[index[0]]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
49,259,1,Class1,"Ward, Miss. Anna",female,35.0,0,0,512.3292,n,C
163,680,1,Class1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,512.3292,B,C
179,738,1,Class1,"Lesurer, Mr. Gustave J",male,35.0,0,0,512.3292,B,C


In [117]:
# Family size = siblings/spouses + parents/children + the passenger themself.
# This adds a new "Family" column to titanic_train in place.
titanic_train["Family"] = titanic_train["SibSp"] + titanic_train["Parch"] + 1

# Positions of the rows with the largest family. Series.max() skips NaN,
# unlike the built-in max(); np.where returns a 1-tuple holding an array of
# *positional* row numbers.
most_family = np.where(titanic_train["Family"] == titanic_train["Family"].max())

# Look the positions up with .iloc; .loc would treat them as index labels,
# which is only right while the frame has its default 0..n-1 index.
titanic_train.iloc[most_family[0]]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
499,160,0,Class3,"Sage, Master. Thomas Henry",male,,8,2,69.55,n,S,11
513,181,0,Class3,"Sage, Miss. Constance Gladys",female,,8,2,69.55,n,S,11
524,202,0,Class3,"Sage, Mr. Frederick",male,,8,2,69.55,n,S,11
584,325,0,Class3,"Sage, Mr. George John Jr",male,,8,2,69.55,n,S,11
834,793,0,Class3,"Sage, Miss. Stella Anna",female,,8,2,69.55,n,S,11
867,847,0,Class3,"Sage, Mr. Douglas Bullen",male,,8,2,69.55,n,S,11
876,864,0,Class3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,69.55,n,S,11


In [118]:
# Passengers per family size: count the non-null Survived entries in each Family group
titanic_train.groupby('Family')[['Survived']].count()

Unnamed: 0_level_0,Survived
Family,Unnamed: 1_level_1
1,537
2,161
3,102
4,29
5,15
6,22
7,12
8,6
11,7


In [162]:
titanic_train.groupby(['Pclass'])['new_survived'].value_counts()

Pclass  new_survived
1       Survived        136
        Died             80
2       Died             97
        Survived         87
3       Died            372
        Survived        119
Name: new_survived, dtype: int64

In [120]:
# Average fare per passenger class. Fare is selected *before* aggregating:
# a frame-wide mean() raises TypeError on pandas >= 2.0 because of the text
# columns (Name, Sex, ...), and it also avoids averaging columns that are
# thrown away immediately afterwards.
titanic_train.groupby(['Pclass'])[['Fare']].mean()

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
Class1,84.154687
Class2,20.662183
Class3,13.67555


In [177]:
# Number of passengers for every (Pclass, new_survived) combination,
# counted through the PassengerId column
titanic_train.groupby(['Pclass', 'new_survived'])[['PassengerId']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId
Pclass,new_survived,Unnamed: 2_level_1
1,Died,80
1,Survived,136
2,Died,97
2,Survived,87
3,Died,372
3,Survived,119


In [182]:
# Passenger count for every (Pclass, new_survived) combination
class_outcome_counts = titanic_train.groupby(['Pclass', 'new_survived'])[['PassengerId']].count()

# Share of each outcome within its class, in percent.
# transform('sum') broadcasts every class total back onto the rows of that
# class, so the division keeps the original (Pclass, new_survived) index.
# This replaces apply(lambda x: 100 * x / float(x.sum())): calling float() on
# a one-element Series is deprecated, and on pandas >= 2.0 groupby.apply
# prepends the group key as an extra index level.
100 * class_outcome_counts / class_outcome_counts.groupby(level=0).transform('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId
Pclass,new_survived,Unnamed: 2_level_1
1,Died,37.037037
1,Survived,62.962963
2,Died,52.717391
2,Survived,47.282609
3,Died,75.763747
3,Survived,24.236253


In [172]:
# Embarked codes: C = Cherbourg, Q = Queenstown, S = Southampton
# Number of passengers for every (Embarked, new_survived) combination
titanic_train.groupby(['Embarked', 'new_survived'])[['PassengerId']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId
Embarked,new_survived,Unnamed: 2_level_1
C,Died,75
C,Survived,93
Q,Died,47
Q,Survived,30
S,Died,427
S,Survived,217
