In [1]:
# The DataFrame data structure is the heart of the pandas library.
# The DataFrame is conceptually a two-dimensional series object, where there's
# an index and multiple columns of content, with each column having a label.
# A DataFrame can be thought of as a two-axis labeled array.

In [2]:
import pandas as pd

In [3]:
# Three student records, each stored as a Series whose index labels
# (Name, Class, Score) will later serve as the DataFrame's column names.
record1 = pd.Series(dict(Name='Alice', Class='Physics', Score=85))
record2 = pd.Series(dict(Name='Jack', Class='Chemistry', Score=90))
record3 = pd.Series(dict(Name='Helen', Class='Biology', Score=82))


In [4]:
# Like a Series, a DataFrame is indexed. Here a group of Series is used, with
# each Series representing one row of data. As with the Series constructor,
# the individual items are passed in as a list and the index labels are
# supplied as a second argument.

df = pd.DataFrame(
    data=[record1, record2, record3],
    index=['school1', 'school2', 'school1'],
)

# As with a Series, head() shows the first several rows of the DataFrame,
# including the labels on both axes, so we can verify the rows and columns.
df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,90
school1,Helen,Biology,82


In [5]:
# Alternatively, we can start from a list of dictionaries, where each
# dictionary represents one row of data.

students = [
    dict(Name='Alice', Class='Physics', Score=85),
    dict(Name='Jack', Class='Chemistry', Score=90),
    dict(Name='Helen', Class='Biology', Score=82),
]
# Hand the list of dictionaries to the DataFrame constructor.
df = pd.DataFrame(data=students, index=['school1', 'school2', 'school1'])
# Display the head again.
df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,90
school1,Helen,Biology,82


In [6]:
# Similar to a Series, we can extract data using the .iloc and .loc
# attributes. Because the DataFrame is two-dimensional, passing a single value
# to the .loc indexing operator returns a Series if there's only one row to
# return.

# For instance, to select the data associated with school2, we query the
# .loc attribute with one parameter.
df.loc['school2']

Name          Jack
Class    Chemistry
Score           90
Name: school2, dtype: object

In [7]:
# We can check the data type of the result using Python's type function

type(df.loc['school2'])

pandas.core.series.Series

In [8]:
# It's important to remember that the index and column labels along either
# axis, horizontal or vertical, can be non-unique. In this example, there are
# two records for school1 in different rows. If we pass a single value to the
# DataFrame .loc attribute and it matches several rows, those rows are
# returned not as a new Series, but as a new DataFrame.

# So let's query for the school1 records.
df.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school1,Helen,Biology,82


In [9]:
# The type of this result is different too: a DataFrame rather than a Series
type(df.loc['school1'])

pandas.core.frame.DataFrame

In [10]:
# One of the strengths of the pandas DataFrame is that we can quickly select
# data along multiple axes.
# For instance, to list just the student names for school1, we supply two
# parameters to .loc: the row index label and the column name.
df.loc['school1', 'Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [11]:
# If we want to select just a single column, there are a few mechanisms.
# First, we could transpose the matrix. This pivots all of the rows into
# columns and all of the columns into rows, and is done with the T attribute.

df.T

Unnamed: 0,school1,school2,school1.1
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,90,82


In [12]:
# Then we can call .loc on the transpose to get only the student names
df.T.loc['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

In [14]:
# Since iloc and loc are used for row selection, pandas reserves the indexing
# operator directly on the DataFrame for column selection. In a pandas
# DataFrame, columns always have a name, so this selection is always label based.
df['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

In [15]:
# We get a KeyError if we try to use .loc with a column name, e.g. df.loc['Name']

In [16]:
# The result of a single-column projection is a Series object
type(df['Name'])

pandas.core.series.Series

In [17]:
# Since the result of using the indexing operator is either a DataFrame or a
# Series, we can chain operations together. Here we select all of the rows
# which relate to school1 using .loc, then project the Name column from just
# those rows.

df.loc['school1']['Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [18]:
# type() confirms what each step of the chained selection returns.
school1_rows = df.loc['school1']
print(type(school1_rows))          # expected: DataFrame
print(type(school1_rows['Name']))  # expected: Series

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [19]:
# Here is an example where we ask for the names and scores of all schools
# using the .loc operator: ':' selects every row, the list names the columns
df.loc[:, ['Name', 'Score']]

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,90
school1,Helen,82


In [20]:
# We can use the drop function to delete data from a DataFrame or Series.
# Here it is given a single argument: the index (row) label to drop.
# By default, drop does not change the DataFrame it is called on.
df.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistry,90


In [21]:
# Looking at the original DataFrame, we see the data is still intact
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,90
school1,Helen,Biology,82


In [24]:
# Work on an independent copy of the DataFrame, created with .copy()
copy_df = df.copy()
# Remove the Name column from that copy, modifying the copy in place
copy_df.drop(columns='Name', inplace=True)
copy_df

Unnamed: 0,Class,Score
school1,Physics,85
school2,Chemistry,90
school1,Biology,82


In [25]:
# There is another way to drop a column: directly through the indexing
# operator, using the del keyword. This way of dropping data takes immediate
# effect on the DataFrame and does not return a new DataFrame.
    
del copy_df['Class'] 
copy_df

Unnamed: 0,Score
school1,85
school2,90
school1,82


In [26]:
# Add a new column by assigning to a column label that does not exist yet.
df['ClassRanking']= None # the scalar None is broadcast to every row of the new column
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,90,
school1,Helen,Biology,82,
