# DataFrame: a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes
Arithmetic operations align on both row and column labels. 

Can be thought of as a dict-like container for Series objects. 

The primary pandas data structure.

In [13]:
import pandas as pd
import numpy as np

In [25]:
# Column-oriented sample data: each keyword becomes a column name and each list
# holds that column's values, one entry per row.
dict1 = dict(
    Parent=['Pepsi', 'Coke', 'Nike', 'Pepsi'],
    Child=[
        'Diet Pepsi 12oz.',
        'Coke lemon Flavor 16oz.',
        'Nike cool running shoes',
        'Pepsi 16oz',
    ],
)

In [26]:
# Turn the dictionary into a DataFrame (keys -> column labels, list items -> rows)
# and end the cell with the frame so the notebook renders it.
dataframe1 = pd.DataFrame(data=dict1)
dataframe1

Unnamed: 0,Child,Parent
0,Diet Pepsi 12oz.,Pepsi
1,Coke lemon Flavor 16oz.,Coke
2,Nike cool running shoes,Nike
3,Pepsi 16oz,Pepsi


In [10]:
# Inspect the dtype pandas assigned to each column of dataframe1.
dataframe1.dtypes

Child     object
Parent    object
dtype: object

In [16]:
# Build the DataFrame from the dictionary again, this time requesting a string
# dtype for every column.
# The builtin `str` is used here: the `np.str` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where accessing it raises AttributeError.
# Note: in the recorded output of the dtypes cell that follows, both columns are
# still reported as `object`.
dataframe1 = pd.DataFrame(dict1, dtype=str)
dataframe1

Unnamed: 0,Child,Parent
0,Diet Pepsi 12oz.,Pepsi
1,Coke lemon Flavor 16oz.,Coke
2,Nike cool ruuning shoes,Nike
3,Pepsi 16oz,Pepsi


In [17]:
# Re-check the column dtypes after rebuilding dataframe1 with an explicit string
# dtype; the recorded output still lists both columns as object.
dataframe1.dtypes


Child     object
Parent    object
dtype: object

In [30]:
# Group the rows by their 'Parent' value and count the entries of every
# remaining column within each group.
parent = dataframe1.groupby(by='Parent').count()

In [29]:
# Display the per-Parent counts computed by the groupby cell.
parent

Unnamed: 0_level_0,Child
Parent,Unnamed: 1_level_1
Coke,1
Nike,1
Pepsi,2


In [31]:
# Select a single column by label: every row of the 'Child' column, returned as a Series.
dataframe1.loc[:, 'Child']

0           Diet Pepsi 12oz.
1    Coke lemon Flavor 16oz.
2    Nike cool running shoes
3                 Pepsi 16oz
Name: Child, dtype: object

In [32]:
# Select a single column by label: every row of the 'Parent' column, returned as a Series.
dataframe1.loc[:, 'Parent']

0    Pepsi
1     Coke
2     Nike
3    Pepsi
Name: Parent, dtype: object

In [35]:
# Row slicing by position: keep only the first two rows (positions 0 and 1).
dataframe1.iloc[:2]

Unnamed: 0,Child,Parent
0,Diet Pepsi 12oz.,Pepsi
1,Coke lemon Flavor 16oz.,Coke


In [37]:
# Load flights.csv (resolved relative to the notebook's working directory) into a
# DataFrame, then preview its first three rows.
flights = pd.read_csv(filepath_or_buffer='flights.csv')
flights.head(n=3)

Unnamed: 0,year,month,passengers,Unnamed: 3
0,1949,January,112,
1,1949,February,118,
2,1949,March,132,


In [43]:
# Remove the trailing 'Unnamed: 3' column that came in with the CSV.
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result has to be assigned back to `flights` for the column to actually go away.
# errors='ignore' keeps this cell idempotent: because it overwrites `flights`,
# running it a second time would otherwise raise KeyError for the already-removed column.
flights = flights.drop(columns=['Unnamed: 3'], errors='ignore')


In [44]:
# Preview the first three rows again to confirm the extra column is gone.
flights.head(n=3)

Unnamed: 0,year,month,passengers
0,1949,January,112
1,1949,February,118
2,1949,March,132


In [45]:
# Total passengers per year.
# numeric_only=True restricts the aggregation to numeric columns, so the text
# 'month' column is left out of the result. Since pandas 2.0 the default is
# numeric_only=False, under which sum() would attempt to aggregate 'month' as well
# instead of silently dropping it as older versions did.
flights.groupby('year').sum(numeric_only=True)

Unnamed: 0_level_0,passengers
year,Unnamed: 1_level_1
1949,1520
1950,1676
1951,2042
1952,2364
1953,2700
1954,2867
1955,475


In [49]:
# Column first, then rows: take the 'passengers' Series and keep its first five entries.
flights['passengers'].iloc[:5]

0    112
1    118
2    132
3    129
4    121
Name: passengers, dtype: int64

In [50]:
 # Rows first, then column: slice the first five rows and pick 'passengers' from them.
 # Gives the same Series as selecting the column before slicing the rows.
flights.iloc[:5]['passengers']

0    112
1    118
2    132
3    129
4    121
Name: passengers, dtype: int64

In [53]:
# Slice the first five rows, then pass a list of labels to keep several columns;
# a list selection returns a DataFrame rather than a Series.
flights.iloc[:5][['year', 'passengers']]

Unnamed: 0,year,passengers
0,1949,112
1,1949,118
2,1949,132
3,1949,129
4,1949,121


In [85]:
# Total passengers per month, kept as a one-column DataFrame indexed by month.
# 'passengers' is selected before aggregating so that no other column is summed
# only to be discarded afterwards; this also replaces the no-op `[:]` slice.
month_count = flights.groupby('month')[['passengers']].sum()
month_count

Unnamed: 0_level_0,passengers
month,Unnamed: 1_level_1
April,1070
August,1324
December,1048
February,1191
January,1185
July,1313
June,1187
March,1115
May,1064
November,919


In [86]:
# Order the monthly totals by passenger count, smallest first (ascending is the default).
month_count.sort_values(by='passengers', ascending=True)

Unnamed: 0_level_0,passengers
month,Unnamed: 1_level_1
November,919
October,1045
December,1048
May,1064
April,1070
March,1115
September,1183
January,1185
June,1187
February,1191


In [87]:
# Order the monthly totals by passenger count, largest first.
month_count.sort_values(by='passengers', ascending=False)

Unnamed: 0_level_0,passengers
month,Unnamed: 1_level_1
August,1324
July,1313
February,1191
June,1187
January,1185
September,1183
March,1115
April,1070
May,1064
December,1048


In [98]:
# Boolean-mask filter: keep only the rows whose passenger count is strictly above 150,
# then preview the first three matches.
over_150_mask = flights['passengers'] > 150
pass_150 = flights[over_150_mask]
pass_150.head(n=3)

Unnamed: 0,year,month,passengers
18,1950,July,170
19,1950,August,170
20,1950,September,158


In [101]:
# Keep only the August rows, then show them ordered from the highest passenger
# count to the lowest. The sorted frame is only displayed; pass_aug itself keeps
# its original row order.
is_august = flights['month'] == 'August'
pass_aug = flights[is_august]
pass_aug.sort_values(by='passengers', ascending=False)

Unnamed: 0,year,month,passengers
67,1954,August,293
55,1953,August,272
43,1952,August,242
31,1951,August,199
19,1950,August,170
7,1949,August,148


In [100]:
# Display pass_aug itself; the recorded output shows the rows in their original
# (unsorted) order.
pass_aug

Unnamed: 0,year,month,passengers
7,1949,August,148
19,1950,August,170
31,1951,August,199
43,1952,August,242
55,1953,August,272
67,1954,August,293
