In [19]:
import pandas as pd
import numpy as np

In [5]:
# a dataframe contains an ordered collection of columns, each with different 
# value type. DataFrames has both row and columns index.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

# dataframe construction
frame = pd.DataFrame(data)

# head() method selects the first 5 rows
print('frame.head(): \n',frame.head())

frame.head(): 
     state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9


In [15]:
# passing new columns it will contain Nan values

frame2 = pd.DataFrame(data, columns=['pop','year','state','debt'], index=['one', 'two', 'three', 'four','five', 'six'])
print('frame2: \n',frame2)
print('columns: \n',frame2.columns)

frame2: 
        pop  year   state debt
one    1.5  2000    Ohio  NaN
two    1.7  2001    Ohio  NaN
three  3.6  2002    Ohio  NaN
four   2.4  2001  Nevada  NaN
five   2.9  2002  Nevada  NaN
six    3.2  2003  Nevada  NaN
columns: 
 Index(['pop', 'year', 'state', 'debt'], dtype='object')


In [16]:
# Columns can be retrieved as Series and can be sliced and indexed

print('Dict notation => frame2[state]: \n',frame2['state'])
print('\n\nAttribute notation => frame2.pop: \n',frame2.pop)

Dict notation => frame2[state]: 
 one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object


Attribute notation => frame2.pop: 
 <bound method DataFrame.pop of        pop  year   state debt
one    1.5  2000    Ohio  NaN
two    1.7  2001    Ohio  NaN
three  3.6  2002    Ohio  NaN
four   2.4  2001  Nevada  NaN
five   2.9  2002  Nevada  NaN
six    3.2  2003  Nevada  NaN>


In [31]:
# columns can be modified ay assignment
frame2.debt = 16.5
print('frame2: \n',frame2)

# column modification by assignment an array
frame2['debt'] = np.arange(6)
print('frame2: \n',frame2)

# selective modification by index assignment
values = pd.Series([0.5,1.8,3.4], index=['four','one','six'])
frame2.debt = values
print('frame2: \n',frame2)

# adding new column of booleans
frame2['eastern'] = frame2.state == 'Ohio'
print('frame2: \n',frame2)

frame2: 
        pop  year   state  debt
one    1.5  2000    Ohio  16.5
two    1.7  2001    Ohio  16.5
three  3.6  2002    Ohio  16.5
four   2.4  2001  Nevada  16.5
five   2.9  2002  Nevada  16.5
six    3.2  2003  Nevada  16.5
frame2: 
        pop  year   state  debt
one    1.5  2000    Ohio     0
two    1.7  2001    Ohio     1
three  3.6  2002    Ohio     2
four   2.4  2001  Nevada     3
five   2.9  2002  Nevada     4
six    3.2  2003  Nevada     5
frame2: 
        pop  year   state  debt
one    1.5  2000    Ohio   1.8
two    1.7  2001    Ohio   NaN
three  3.6  2002    Ohio   NaN
four   2.4  2001  Nevada   0.5
five   2.9  2002  Nevada   NaN
six    3.2  2003  Nevada   3.4
frame2: 
        pop  year   state  debt  eastern
one    1.5  2000    Ohio   1.8     True
two    1.7  2001    Ohio   NaN     True
three  3.6  2002    Ohio   NaN     True
four   2.4  2001  Nevada   0.5    False
five   2.9  2002  Nevada   NaN    False
six    3.2  2003  Nevada   3.4    False


In [34]:
# del method is used to remove a column
del frame2['eastern']
print('frame2: \n',frame2)
print('columns: \n',frame2.columns)


frame2: 
        pop  year   state  debt
one    1.5  2000    Ohio   1.8
two    1.7  2001    Ohio   NaN
three  3.6  2002    Ohio   NaN
four   2.4  2001  Nevada   0.5
five   2.9  2002  Nevada   NaN
six    3.2  2003  Nevada   3.4
columns: 
 Index(['pop', 'year', 'state', 'debt'], dtype='object')


In [39]:
# nested dict can be passed for DataFrame
population_data = {
    'Nevada': {'2020': 155000, '2021': 160000},
    'Ohio': {'2019': 300000, '2020': 310000, '2021': 320000},
}
df3 = pd.DataFrame(population_data)
print('Dataframe: \n', df3)

# transpose can be used as well
print('\n\nDataFrame Transpose: \n',df3.T)

Dataframe: 
         Nevada    Ohio
2020  155000.0  310000
2021  160000.0  320000
2019       NaN  300000


DataFrame Transpose: 
             2020      2021      2019
Nevada  155000.0  160000.0       NaN
Ohio    310000.0  320000.0  300000.0


In [46]:
# by specifying the index, it will take the values from the dict
# and will create the missing values
df2 = pd.DataFrame(population_data, index=['2020','2021','2022'])
print('df2: \n',df2)

# we can name the index and the columns
df2.index.name = 'year'
df2.columns.name = 'state'

# df.values return the data contained in the dataframe
print('\nindex name: ',df2.index.name)
print('\nDataFrame values: \n',df2.values)

df2: 
         Nevada      Ohio
2020  155000.0  310000.0
2021  160000.0  320000.0
2022       NaN       NaN

index name:  year

DataFrame values: 
 [[155000. 310000.]
 [160000. 320000.]
 [    nan     nan]]
