# Introduction to Pandas (Python and Data Analysis)

We start with understanding the difference between "Series" and "DataFrame"

- A series is a single column of data
- A dataframe is the entire sheet of data

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
x = Series([30,40,50]) # This gives the index (first column) and the values (subsequent columns)
x

0    30
1    40
2    50
dtype: int64

In [4]:
x.index

RangeIndex(start=0, stop=3, step=1)

In [5]:
x.values

array([30, 40, 50], dtype=int64)

## Creating Series with Index 

By default the index is a range starting from 0. If necessary, one can specify the index.

In [6]:
sales = Series([45000,65000,87000], index=["Don", "Mike", "Edward"])
sales

Don       45000
Mike      65000
Edward    87000
dtype: int64

In [7]:
type(sales)

pandas.core.series.Series

### Checking for a specific value

In [9]:
sales["Don"]

45000

### Checking for conditions

In [10]:
sales[sales>50000]

Mike      65000
Edward    87000
dtype: int64

### Checking for existence of a value by key

In [11]:
"Don" in sales

True

In [12]:
"John" in sales

False

### Converting series to dictionaries

In [13]:
sales_dict = sales.to_dict()
sales_dict

{'Don': 45000, 'Mike': 65000, 'Edward': 87000}

### Converting dictionaries to series

In [16]:
sales_series = Series(sales_dict)
sales_series

Don       45000
Mike      65000
Edward    87000
dtype: int64

### Calling dictionaries into series

In [17]:
new = ["Don", "Mike", "Edward", "John"]

In [18]:
ssales = Series(sales_dict, index=new)
ssales

Don       45000.0
Mike      65000.0
Edward    87000.0
John          NaN
dtype: float64

### Finding Null values

In [19]:
pd.isnull(ssales)

Don       False
Mike      False
Edward    False
John       True
dtype: bool

### Adding values in series

In [22]:
ssales = sales + ssales

### Naming a series

In [23]:
ssales.name = "Total Sales"

In [24]:
ssales

Don       44999.0
Edward    86999.0
John          NaN
Mike      64999.0
Name: Total Sales, dtype: float64

### Naming an index

In [25]:
ssales.index.name = "Sales Person"
ssales

Sales Person
Don       44999.0
Edward    86999.0
John          NaN
Mike      64999.0
Name: Total Sales, dtype: float64

## Creating a DataFrame

### Creating a DataFrame from a list

In [26]:
import pandas as pd

In [27]:
data = [["Adrian", 20], ["Beatrice", 32], ["Chloe", 41]]
df = pd.DataFrame(data,columns = ["Name", "Age"], dtype = int)
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Beatrice,32
2,Chloe,41


We specify the data type as an integer. We could give it as a float, but that would not make much sense for an age value. Each age value will then be given as 20.0 etc. 

### Creating a DataFrame from dictionaries - using the default index

In [28]:
new = {"Name": ["Tom", "Jack", "Steve", "Ricky"], "Sales": [25000,30000,35000,40000]}
df2 = pd.DataFrame(new)
df2

Unnamed: 0,Name,Sales
0,Tom,25000
1,Jack,30000
2,Steve,35000
3,Ricky,40000


### - Now using a specified index 

In [29]:
new2 = {"Name": ["Tom", "Jack", "Steve", "Ricky"], "Sales": [25000,30000,35000,40000]}
df3 = pd.DataFrame(new2, index=["rank1", "rank2", "rank3", "rank4"])
df3

Unnamed: 0,Name,Sales
rank1,Tom,25000
rank2,Jack,30000
rank3,Steve,35000
rank4,Ricky,40000


### Creating a DataFrame from a list of dictionaries (without passing an index value)

In [30]:
data = [{"a":1, "b":2}, {"a":5,"b":10, "c":15}]
df4 = pd.DataFrame(data)
df4

Unnamed: 0,a,b,c
0,1,2,
1,5,10,15.0


### Same as above but passing index values

In [35]:
data2 = [{"East":15000, "West":20000}, {"East":5000,"West":10500, "South":20000}]
df5 = pd.DataFrame(data2, index=["Sales1", "Sales2"], columns=["East", "West"])
df5

Unnamed: 0,East,West
Sales1,15000,20000
Sales2,5000,10500


In [36]:
df6 = pd.DataFrame(data2, index=["Sales1", "Sales2"], columns=["East", "South"])
df6

Unnamed: 0,East,South
Sales1,15000,
Sales2,5000,20000.0


In [37]:
df7 = pd.DataFrame(data2, index=["Sales1", "Sales2"], columns=["East", "West", "South"])
df7

Unnamed: 0,East,West,South
Sales1,15000,20000,
Sales2,5000,10500,20000.0


### Creating DataFrame from dictionary of series

In [38]:
values = {"East": pd.Series([10000,20000, 30000],index = ["Q1", "Q2", "Q3"]),
            "West": pd.Series([15000,25000,35000,45000], index = ["Q1", "Q2", "Q3", "Q4"])}

salesdf = pd.DataFrame(values)
salesdf

Unnamed: 0,East,West
Q1,10000.0,15000
Q2,20000.0,25000
Q3,30000.0,35000
Q4,,45000


### Adding columns to DataFrame

In [42]:
salesdf["South"]=pd.Series([17000,27000,37000],index=["Q1", "Q2", "Q3"])
salesdf

Unnamed: 0,East,West,South
Q1,10000.0,15000,17000.0
Q2,20000.0,25000,27000.0
Q3,30000.0,35000,37000.0
Q4,,45000,


### Adding values into the DataFrame

In [44]:
salesdf["North"] = salesdf["East"]+salesdf["West"]
salesdf # Note that adding any valid value to an NaN value will always result in a NaN value

Unnamed: 0,East,West,South,North
Q1,10000.0,15000,17000.0,25000.0
Q2,20000.0,25000,27000.0,45000.0
Q3,30000.0,35000,37000.0,65000.0
Q4,,45000,,


### Indexing and reindexing objects

In [45]:
sales = Series([45000,65000,87000], index=["Don", "Mike", "Edwin"])
sales

Don      45000
Mike     65000
Edwin    87000
dtype: int64

In [47]:
salesreindexed = sales.reindex(["Don", "Luke", "Edwin"])
salesreindexed # Will change the index but the value will not be carried across

Don      45000.0
Luke         NaN
Edwin    87000.0
dtype: float64

### Replacing Null values with zeros

In [48]:
salesreindexed = sales.reindex(["Don", "Luke", "Edwin"],fill_value=0)
salesreindexed

Don      45000
Luke         0
Edwin    87000
dtype: int64

### Indexing and reindexing DataFrames

In [49]:
data = {"county": ["croydon", "cornwall", "hampshire"],
       "year":[2011,2013,2014],
       "sales":[20000,35000,45000]}
sales_df = pd.DataFrame(data)
sales_df

Unnamed: 0,county,year,sales
0,croydon,2011,20000
1,cornwall,2013,35000
2,hampshire,2014,45000


In [53]:
sales_df.reindex([2,1,0]) #This is not a permenant change so if I call sales_df it will be the same as before

Unnamed: 0,county,year,sales
2,hampshire,2014,45000
1,cornwall,2013,35000
0,croydon,2011,20000


In [54]:
sales_df

Unnamed: 0,county,year,sales
0,croydon,2011,20000
1,cornwall,2013,35000
2,hampshire,2014,45000


In [56]:
sales_df.reindex([20,30,40]) # This returns no number values since we have not assigned a value to each of these new indices

Unnamed: 0,county,year,sales
20,,,
30,,,
40,,,


### Adding Column Titles

In [58]:
columntitles= ["year", "sales", "county"]
sales_df.reindex(columns=columntitles) # Calling existing titles in the reindex function shall rearrange the columns
# to match the given order

Unnamed: 0,year,sales,county
0,2011,20000,croydon
1,2013,35000,cornwall
2,2014,45000,hampshire


### Dropping index in series

In [59]:
ser1 = Series(np.arange(3), index=("aa", "bb", "cc"))
ser1

aa    0
bb    1
cc    2
dtype: int32

In [60]:
ser1.drop("cc")

aa    0
bb    1
dtype: int32

### Dropping index in DataFrame

In [63]:
sales_df = DataFrame(np.arange(9).reshape(3,3),index=["SF", "NYC", "BO"],
                   columns=["country","region", "sale"])
sales_df

Unnamed: 0,country,region,sale
SF,0,1,2
NYC,3,4,5
BO,6,7,8


In [64]:
sales_df.drop("SF")

Unnamed: 0,country,region,sale
NYC,3,4,5
BO,6,7,8


### Selecting entries

In [68]:
ser1 = Series(np.arange(3), index=["AA", "BB", "CC"])
ser1

AA    0
BB    1
CC    2
dtype: int32

In [69]:
ser1 = 2*ser1
ser1

AA    0
BB    2
CC    4
dtype: int32

In [70]:
ser1["BB"]

2

In [71]:
ser1[1]

2

In [73]:
ser1[0:2]

AA    0
BB    2
dtype: int32

In [74]:
ser1[["AA", "BB"]]

AA    0
BB    2
dtype: int32

In [75]:
ser1[ser1>2]

CC    4
dtype: int32

In [76]:
ser1[ser1>2] =20
ser1

AA     0
BB     2
CC    20
dtype: int32

### Selecting entries in DataFrame

In [80]:
data = {"county": ["croydon", "cornwall", "cumbria", "durham", "hampshire"],
       "year": ["2012", "2012", "2013", "2014", "2014"],
       "sales": [45000,24000,31000,20000,30000]}
sales_df = pd.DataFrame(data)
sales_df

Unnamed: 0,county,year,sales
0,croydon,2012,45000
1,cornwall,2012,24000
2,cumbria,2013,31000
3,durham,2014,20000
4,hampshire,2014,30000


### Extracting from a specific location

In [83]:
sales_df.iloc[4] # "index location" returns all data associated with the given index

county    hampshire
year           2014
sales         30000
Name: 4, dtype: object

### Extracting a specific column

In [84]:
sales_df["year"]

0    2012
1    2012
2    2013
3    2014
4    2014
Name: year, dtype: object

In [88]:
sales_df[["year","sales"]]

Unnamed: 0,year,sales
0,2012,45000
1,2012,24000
2,2013,31000
3,2014,20000
4,2014,30000


In [89]:
sales_df[sales_df["sales"]>24000]

Unnamed: 0,county,year,sales
0,croydon,2012,45000
2,cumbria,2013,31000
4,hampshire,2014,30000


In [90]:
sales_df["sales"]>24000

0     True
1    False
2     True
3    False
4     True
Name: sales, dtype: bool

Ensure you understand the difference between the two previous operations, because they may look similar but return very different results

In [91]:
sales_df.loc[2]

county    cumbria
year         2013
sales       31000
Name: 2, dtype: object

## Data Alignment
### Data Alignment in Series

In [92]:
sales = Series([150000,250000,340000], index=["East","West","South"])
sales

East     150000
West     250000
South    340000
dtype: int64

In [93]:
sales1= Series([200000,300000,450000,340000], index=["East","West","South","North"])
sales1

East     200000
West     300000
South    450000
North    340000
dtype: int64

In [94]:
sales+sales1

East     350000.0
North         NaN
South    790000.0
West     550000.0
dtype: float64

In [96]:
sales.add(sales1,fill_value=0) # This method avoids any issues caused by the NaN value of the North value

East     350000.0
North    340000.0
South    790000.0
West     550000.0
dtype: float64

### Data Alignment in DataFrames

In [99]:
sales_df = pd.DataFrame(data)
sales_df

Unnamed: 0,county,year,sales
0,croydon,2012,45000
1,cornwall,2012,24000
2,cumbria,2013,31000
3,durham,2014,20000
4,hampshire,2014,30000


In [103]:
data2 = {"county":["croydon","cornwall","cumbria"],
        "year":[2012,2012,2013],
        "sales":[45000,24000,31000]}
sales1_df = pd.DataFrame(data2)
sales1_df

Unnamed: 0,county,year,sales
0,croydon,2012,45000
1,cornwall,2012,24000
2,cumbria,2013,31000


In [104]:
sales_df["sales"] + sales1_df["sales"]

0    90000.0
1    48000.0
2    62000.0
3        NaN
4        NaN
Name: sales, dtype: float64

In [105]:
sales_df["sales"].add(sales1_df["sales"],fill_value=0)

0    90000.0
1    48000.0
2    62000.0
3    20000.0
4    30000.0
Name: sales, dtype: float64

### Sorting and Ranking

In [106]:
sales = Series(("35000", "40000", "22400", "45000"),index=["East","West","North","South"])
sales

East     35000
West     40000
North    22400
South    45000
dtype: object

In [107]:
sales.sort_index(ascending=1) # Index is being sorted in alphabetical order

East     35000
North    22400
South    45000
West     40000
dtype: object

In [110]:
sales.sort_index(ascending=0) # Index is being sorted in reverse alphabetical order

West     40000
South    45000
North    22400
East     35000
dtype: object

In [111]:
sales.rank(ascending=0) # 1 is the largest value

East     3.0
West     2.0
North    4.0
South    1.0
dtype: float64

In [113]:
sales.rank(ascending=1) # 1 is the smallest value

East     2.0
West     3.0
North    1.0
South    4.0
dtype: float64

### Sorting and Ranking in DataFrames

In [115]:
sales_df.sort_index(ascending=1)

Unnamed: 0,county,year,sales
0,croydon,2012,45000
1,cornwall,2012,24000
2,cumbria,2013,31000
3,durham,2014,20000
4,hampshire,2014,30000


In [116]:
sales_df.sort_index(ascending=0)

Unnamed: 0,county,year,sales
4,hampshire,2014,30000
3,durham,2014,20000
2,cumbria,2013,31000
1,cornwall,2012,24000
0,croydon,2012,45000


In [117]:
sales_df.sort_values(by=["sales"],ascending=1)

Unnamed: 0,county,year,sales
3,durham,2014,20000
1,cornwall,2012,24000
4,hampshire,2014,30000
2,cumbria,2013,31000
0,croydon,2012,45000


In [118]:
sales_df.sort_values(by=["county"],ascending=0)

Unnamed: 0,county,year,sales
4,hampshire,2014,30000
3,durham,2014,20000
2,cumbria,2013,31000
0,croydon,2012,45000
1,cornwall,2012,24000


In [120]:
sales_df["salesranked"] = sales_df["sales"].rank(ascending=0)
sales_df

Unnamed: 0,county,year,sales,salesranked
0,croydon,2012,45000,1.0
1,cornwall,2012,24000,4.0
2,cumbria,2013,31000,2.0
3,durham,2014,20000,5.0
4,hampshire,2014,30000,3.0


## Summary Statistics

In [122]:
arr = np.array([[1,2,np.NaN],[np.NaN,3,4]])
dframe = DataFrame(arr,index=["A","B"],columns=["one","two","three"]) # "pd." is optional when calling a DataFrame
dframe

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,3.0,4.0


In [123]:
dframe.sum()

one      1.0
two      5.0
three    4.0
dtype: float64

In [124]:
dframe.sum(axis=1)

A    3.0
B    7.0
dtype: float64

In [125]:
dframe.min() # minimum values in each column

one      1.0
two      2.0
three    4.0
dtype: float64

In [127]:
dframe.min(axis=1)

A    1.0
B    3.0
dtype: float64

In [129]:
dframe.cumsum() # adds the values as you go down each row of the DataFrame

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,5.0,4.0


In [133]:
dframe.describe()

Unnamed: 0,one,two,three
count,1.0,2.0,1.0
mean,1.0,2.5,4.0
std,,0.707107,
min,1.0,2.0,4.0
25%,1.0,2.25,4.0
50%,1.0,2.5,4.0
75%,1.0,2.75,4.0
max,1.0,3.0,4.0


### Finding unique values in a series

In [134]:
sales = Series(["25000","20000","25000","25000","24000"])
sales

0    25000
1    20000
2    25000
3    25000
4    24000
dtype: object

In [135]:
sales.unique()

array(['25000', '20000', '24000'], dtype=object)

### Finding occurances of a particular value

In [138]:
sales.value_counts()["25000"]

3

In [139]:
data = Series(["one","two",np.NaN,"four"])
data

0     one
1     two
2     NaN
3    four
dtype: object

In [140]:
clean_dframe = dframe.dropna()

In [141]:
clean_dframe

Unnamed: 0,one,two,three


In [142]:
dframe

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,3.0,4.0


In [144]:
dframe.dropna(axis=1,how="all") # shall drop any rows where all three entries are NaN

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,3.0,4.0
