In [142]:
#Pandas Series is just like a python dictionary

#Refersher on dictionaries:
#Dictionaries have keys and values
emp1 = {'ID': 234,
        'Name': 'Bob Dogooder',
        'Gender': 'Male',
        'Title': 'Analyst'}
print(emp1)

{'ID': 234, 'Name': 'Bob Dogooder', 'Gender': 'Male', 'Title': 'Analyst'}


In [143]:
#Dictionaries use Keys to access their associated data Values

emp1['Name']

'Bob Dogooder'

In [169]:
#Import Pandas and Create Series

import pandas as pd
basic = pd.Series(['Bob', 'Jules', 'Anne'])
print(basic)

0      Bob
1    Jules
2     Anne
dtype: object


In [146]:
#Automatic Index with a Series, you can change the index to whatever you want, but the Series will default to numbered index

In [147]:
basic[0:2]

0      Bob
1    Jules
dtype: object

In [148]:
#Change the Index to desired naming convention

index = ['emp1', 'emp2', 'emp3']
basic.index = index
print(basic)

emp1      Bob
emp2    Jules
emp3     Anne
dtype: object


In [149]:
#Pandas is built ontop of Numpy so all of the functionality we covered in previous videos in regard to Numpy apply to Pandas
#Still call items based on the behind the scenes indexed values of Python

basic[1] 

'Jules'

In [150]:
#SLICING WORKS THE SAME WAY TOO

basic[0:2]

emp1      Bob
emp2    Jules
dtype: object

In [151]:
#loc and iloc are more conventional ways in Pandas to call data, also more robust.
#loc returns explicit index

basic.loc['emp1']

'Bob'

In [152]:
#iloc returns implicit index

basic.iloc[0]

'Bob'

In [153]:
#Create a random DataFrame using Numpy

import numpy as np
df = pd.DataFrame(np.random.randint(0,100,size=(10, 4)), index=list([0,10,20,30,40,50,60,70,80,90]), columns=list('ABCD'))
print(df)

     A   B   C   D
0   13  24  43  71
10  11  65  47  45
20  75  77  35  38
30  64   6   5  51
40  38  79  92  70
50  96  57   5  12
60   4  18  47  10
70  12  62  16   9
80  10  20  69  67
90  96   9  73  17


In [154]:
#Call an entrie row with .loc

df.loc[50]

A    96
B    57
C     5
D    12
Name: 50, dtype: int32

In [155]:
#Call entire row with implicit .iloc

df.iloc[2]

A    75
B    77
C    35
D    38
Name: 20, dtype: int32

In [156]:
print(df)

Unnamed: 0,A,B,C,D
0,13,24,43,71
10,11,65,47,45
20,75,77,35,38
30,64,6,5,51
40,38,79,92,70
50,96,57,5,12
60,4,18,47,10
70,12,62,16,9
80,10,20,69,67
90,96,9,73,17


In [157]:
#Check for Null Values in the dataset

df.notnull()

Unnamed: 0,A,B,C,D
0,True,True,True,True
10,True,True,True,True
20,True,True,True,True
30,True,True,True,True
40,True,True,True,True
50,True,True,True,True
60,True,True,True,True
70,True,True,True,True
80,True,True,True,True
90,True,True,True,True


In [170]:
#DEALING WITH NULL VALUES
#impute a null value for demonstration

df.loc[30, 'C'] = np.nan

In [167]:
#Row 30, Column C now has a Null Value

print(df)

Unnamed: 0,A,B,C,D
0,13,24,43.0,71
10,11,65,47.0,45
20,75,77,35.0,38
30,64,6,,51
40,38,79,92.0,70
50,96,57,5.0,12
60,4,18,47.0,10
70,12,62,16.0,9
80,10,20,69.0,67
90,96,9,73.0,17


In [168]:
#Check % of columns that have null values

df.isnull().mean()*100

A     0.0
B     0.0
C    10.0
D     0.0
dtype: float64

In [160]:
#Drop all rows from dataset where there are any null values

new_df = df.dropna(axis = 0, how ='any')
print(new_df)

     A   B     C   D
0   13  24  43.0  71
10  11  65  47.0  45
20  75  77  35.0  38
40  38  79  92.0  70
50  96  57   5.0  12
60   4  18  47.0  10
70  12  62  16.0   9
80  10  20  69.0  67
90  96   9  73.0  17


In [161]:
#Altrnative way of dropping null values (slightly more sophisticated way)
#Set a threshold of rows that need to have data

df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,13,24,43.0,71
10,11,65,47.0,45
20,75,77,35.0,38
40,38,79,92.0,70
50,96,57,5.0,12
60,4,18,47.0,10
70,12,62,16.0,9
80,10,20,69.0,67
90,96,9,73.0,17


In [162]:
#Begin to gain insights from your data

new_df.mean(axis=0)

A    39.444444
B    45.666667
C    47.444444
D    37.666667
dtype: float64

In [171]:
new_df.mean(axis=1)

0      75.5
10     84.0
20    112.5
40    139.5
50     85.0
60     39.5
70     49.5
80     83.0
90     97.5
dtype: float64

In [164]:
#Create a new calculated column in your dataset

new_df['Total'] = df.sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [165]:
#New Dataset with calculated column

print(new_df)

Unnamed: 0,A,B,C,D,Total
0,13,24,43.0,71,151.0
10,11,65,47.0,45,168.0
20,75,77,35.0,38,225.0
40,38,79,92.0,70,279.0
50,96,57,5.0,12,170.0
60,4,18,47.0,10,79.0
70,12,62,16.0,9,99.0
80,10,20,69.0,67,166.0
90,96,9,73.0,17,195.0
