# Intro to Pandas

In [3]:
import numpy as np
import pandas as pd

In [10]:
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one;
# the multi-output cells in this notebook rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

# Seed NumPy's legacy global generator so that every np.random.* example in the
# notebook yields the same numbers on each Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Series
___

In [12]:
# A Series built from a plain Python list, with explicit string labels
s1 = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=list("abcde"),
)
s1         # labels and values together
s1.index   # just the labels
s1.values  # just the data, as a NumPy array

a    1
b    2
c    3
d    4
e    5
dtype: int64

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

array([1, 2, 3, 4, 5])

In [11]:
# A Series wrapped around a NumPy array of five standard-normal draws.
# No index is supplied, so pandas falls back to a RangeIndex (0..4).
s2 = pd.Series(data=np.random.randn(5))
s2
s2.index
s2.values

0   -1.691969
1    0.788278
2   -0.604017
3   -0.623228
4    0.240437
dtype: float64

RangeIndex(start=0, stop=5, step=1)

array([-1.69196863,  0.78827848, -0.60401696, -0.62322802,  0.24043686])

In [13]:
# A Series built from a dict: the keys become the index labels
dict1 = dict(a=1, b=2, c=3)
s3 = pd.Series(data=dict1)
s3
type(s3)

a    1
b    2
c    3
dtype: int64

pandas.core.series.Series

In [14]:
# A Series built from a single scalar.
# The index argument is mandatory in this case: it decides how many
# times the scalar is repeated (once per label).
s4 = pd.Series(data=5, index=list("abc"))
s4

a    5
b    5
c    5
dtype: int64

> A pandas Series behaves much like a NumPy ndarray.
>
> In almost all cases a Series can be passed directly to NumPy functions.

### Series allows for slicing and filtering with boolean arrays

In [17]:
s1
# Positional slice: the entries at positions 1, 2 and 3 (position 4 is excluded)
s1.iloc[1:4]

a    1
b    2
c    3
d    4
e    5
dtype: int64

b    2
c    3
d    4
dtype: int64

In [18]:
# Pick the entries at positions 1, 3 and 4 with .iloc.
# Plain s1[[1, 3, 4]] on a label index only works through a deprecated
# positional fallback, so the positional intent is spelled out explicitly.
s1.iloc[[1, 3, 4]]

b    2
d    4
e    5
dtype: int64

In [19]:
# Boolean mask: keep only the entries whose value is greater than 3
s1.loc[s1 > 3]

d    4
e    5
dtype: int64

In [21]:
# Passing a Series to a NumPy function.
# np.exp calculates e^x (e ≈ 2.718) for each value x in the Series;
# the result is again a Series carrying the same index labels.
np.exp(s1)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
dtype: float64

---

In [22]:
# dtype of the values stored in the Series
s1.dtype

dtype('int64')

In [29]:
# Round trip: Series -> NumPy array -> Series
s1
type(s1)

s1 = s1.to_numpy()  # s1 now names a bare ndarray; the index labels are gone
s1
type(s1)

# Rebuilt without an index argument, so s1 ends up with a default integer index
# rather than the labels it was originally created with.
s1 = pd.Series(s1)

0    1
1    2
2    3
3    4
4    5
dtype: int64

pandas.core.series.Series

array([1, 2, 3, 4, 5])

numpy.ndarray

In [33]:
# A Series is mutable: values can be read and written through their labels
s4 = pd.Series(data=[1, 2, 3])
s4

s4.loc[0]          # read the value stored under label 0

s4.loc[0] = 0      # overwrite a single value
s4

s4[0:] = range(3)  # overwrite a whole slice in one assignment
s4

s4.loc[3] = 4      # assigning to a label that does not exist yet appends it
s4

0    1
1    2
2    3
dtype: int64

1

0    0
1    2
2    3
dtype: int64

0    0
1    1
2    2
dtype: int64

0    0
1    1
2    2
3    4
dtype: int64

### Series automatically align data based on label

> In the following example, the value at index "a" in the first Series is added to the value at index "a" in the second, regardless of where "a" is positioned in each Series

In [38]:
# Two Series with identical values whose labels run in opposite order
s5 = pd.Series(data=[1, 2, 3, 4, 5], index=list("abcde"))
s6 = pd.Series(data=[1, 2, 3, 4, 5], index=list("edcba"))
s5
s6

a    1
b    2
c    3
d    4
e    5
dtype: int64

e    1
d    2
c    3
b    4
a    5
dtype: int64

In [39]:
# Addition pairs values by label rather than by position, so every sum is 6
s5.add(s6)

a    6
b    6
c    6
d    6
e    6
dtype: int64

In [43]:
# A Series can carry a name, shown in its repr and available as .name
s7 = pd.Series(
    data=[1, 2, 3],
    index=list("abc"),
    name="example",
)
s7

s7.name

a    1
b    2
c    3
Name: example, dtype: int64

'example'

# Dataframe
___

In [49]:
# creating from dictionary of Series
# s1, s2 and s4 are the Series left behind by earlier cells; each dict key becomes
# a column and the rows are aligned on the Series' index labels
d = {"one":s1, "two":s2, "three":s4}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two,three
0,1,1.0,0.0
1,2,2.0,1.0
2,3,3.0,2.0
3,4,4.0,4.0
4,5,,


**Note that above the index of the dataframe is the union of the indices of the Series**

> If you also pass the `index` or `columns` parameters, they determine exactly which index labels and columns the resulting dataframe has,
> and any data whose labels do not match them is discarded

In [52]:
# A DataFrame built from a dict of lists (or ndarrays): each key becomes a column
d = {
    "one": [1, 2, 3],
    "two": [4, 5, 6],
    "three": [7, 8, 9],
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two,three
0,1,4,7
1,2,5,8
2,3,6,9


**Note that the lists above must all be the same length, and if you specify an index, it must have that length as well**

### Column addition & deletion

In [53]:
# The frame as it stands before any columns are added or removed
df

Unnamed: 0,one,two,three
0,1,4,7
1,2,5,8
2,3,6,9


In [54]:
# Adding a column: assign a list with one value per row to a new column label
df["four"] = [10,11,12]
df

Unnamed: 0,one,two,three,four
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12


In [55]:
# A boolean column: True on the rows where "four" equals 11
df["yo"] = df["four"] == 11
df

Unnamed: 0,one,two,three,four,yo
0,1,4,7,10,False
1,2,5,8,11,True
2,3,6,9,12,False


In [56]:
# deleting a column
# `del` removes the column from df in place
del df["yo"]
df

Unnamed: 0,one,two,three,four
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12


In [57]:
# Adding a column from a scalar value: the scalar is repeated for every row
df["five"] = 5
df

Unnamed: 0,one,two,three,four,five
0,1,4,7,10,5
1,2,5,8,11,5
2,3,6,9,12,5


# Dtypes
___

Pandas supports all numpy dtypes as well as some non-numerical types

![image](pandasdtypes.png)

In [60]:
# One column per dtype. Scalar entries are repeated for every row.
dft = pd.DataFrame(
    {
        "A": np.random.rand(3),
        "B": 1,
        "C": "foo",
        "D": pd.Timestamp("20010102"),
        "E": pd.Series([1.0] * 3).astype("float32"),
        "F": False,
        "G": pd.Series([1] * 3, dtype="int8"),
    }
)
dft

dft.dtypes

Unnamed: 0,A,B,C,D,E,F,G
0,0.907861,1,foo,2001-01-02,1.0,False,1
1,0.134266,1,foo,2001-01-02,1.0,False,1
2,0.509748,1,foo,2001-01-02,1.0,False,1


A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

> Pandas supports string dtype
>
> But by default string passed to a dataframe or series will take "object" dtype
>
> It is recommended to change to string dtype

In [61]:
# Converting object dtype to string dtype
# astype returns a new Series, so the result is assigned back to column "C"
dft["C"] = dft["C"].astype("string")
dft

# column C now reports the string dtype instead of object
dft.dtypes

Unnamed: 0,A,B,C,D,E,F,G
0,0.907861,1,foo,2001-01-02,1.0,False,1
1,0.134266,1,foo,2001-01-02,1.0,False,1
2,0.509748,1,foo,2001-01-02,1.0,False,1


A           float64
B             int64
C            string
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

> Note: You can pass a dictionary to the .astype method to convert the data types of multiple columns at once

# Pandas Basics 1
___

In [66]:
# An 8x3 frame of standard-normal draws with rows labelled row1..row8
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=[f"row{i}" for i in range(1, 9)],
    columns=list("ABC"),
)
df

Unnamed: 0,A,B,C
row1,-0.059903,-0.916987,0.044637
row2,-1.562155,0.690908,0.668948
row3,-0.559378,0.661831,-0.512476
row4,0.885075,-0.924272,-0.678083
row5,0.31327,-2.290299,-0.194556
row6,1.856932,1.081799,-0.322448
row7,1.306888,0.742093,0.350887
row8,-0.085958,0.082599,0.581337


### Attributes

In [82]:
# Basic structural attributes: (rows, columns), the row labels, the column labels.
# NOTE(review): in a top-to-bottom run the columns are still 'A', 'B', 'C' here;
# they are only lower-cased by a later cell.
df.shape 
df.index
df.columns

(8, 3)

Index(['row1', 'row2', 'row3', 'row4', 'row5', 'row6', 'row7', 'row8'], dtype='object')

Index(['a', 'b', 'c'], dtype='object')

In [67]:
# Lower-case every column label using the vectorised .str accessor on the Index
df.columns = df.columns.str.lower()
df

Unnamed: 0,a,b,c
row1,-0.059903,-0.916987,0.044637
row2,-1.562155,0.690908,0.668948
row3,-0.559378,0.661831,-0.512476
row4,0.885075,-0.924272,-0.678083
row5,0.31327,-2.290299,-0.194556
row6,1.856932,1.081799,-0.322448
row7,1.306888,0.742093,0.350887
row8,-0.085958,0.082599,0.581337


In [83]:
# Getting the raw data out of a column
# .array exposes the array that backs a Series ...
df["a"].array

# ... and is available in the same way on the row and column Index objects
df.index.array

df.columns.array

<PandasArray>
[-0.05990330062299127,  -1.5621548429546828,  -0.5593775892766457,
    0.885074507598106,   0.3132697807421941,   1.8569319190102094,
   1.3068878697929138, -0.08595764415314203]
Length: 8, dtype: float64

<PandasArray>
['row1', 'row2', 'row3', 'row4', 'row5', 'row6', 'row7', 'row8']
Length: 8, dtype: object

<PandasArray>
['a', 'b', 'c']
Length: 3, dtype: object

### Counting

In [84]:
# Simulate 50 rolls of a six-sided die (the upper bound 7 is exclusive)
data = np.random.randint(low=1, high=7, size=50)
data

array([4, 2, 4, 6, 6, 3, 6, 2, 1, 2, 1, 3, 6, 5, 3, 5, 6, 1, 6, 1, 2, 6,
       5, 5, 5, 3, 2, 2, 2, 1, 5, 4, 5, 4, 1, 4, 6, 4, 1, 4, 6, 2, 4, 1,
       1, 2, 2, 4, 5, 1])

In [88]:
# value_counts() tallies how often each face was rolled, most frequent first
dice_rolls = pd.Series(data)
dice_rolls.value_counts()
dice_rolls.mode()  # mode() always returns a Series, since several values can tie (1 and 2 did in the run shown)

2    10
1    10
6     9
4     9
5     8
3     4
dtype: int64

0    1
1    2
dtype: int64

> `.value_counts()` is most often called on a Series; since pandas 1.1 it is also available on a DataFrame, where it counts unique rows
>
> `.mode()` can be called on Series or DF

### Altering Labels

`.reindex()`

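- changes the order of rows or columns to match the labels you pass; labels that do not exist in the original are added and filled with NaN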

In [107]:
# Six die rolls labelled roll1..roll6
dice_rolls2 = pd.Series(
    data=np.random.randint(1, 7, size=6),
    index=[f"roll{i}" for i in range(1, 7)],
)
dice_rolls2

roll1    5
roll2    2
roll3    1
roll4    3
roll5    5
roll6    4
dtype: int64

In [108]:
# Reverse the row order by reindexing against the existing labels read backwards
dice_rolls2.reindex(index=dice_rolls2.index[::-1])

roll6    4
roll5    5
roll4    3
roll3    1
roll2    2
roll1    5
dtype: int64

### Changing the order of indices and columns in a dataframe

In [109]:
# Three Series with partially overlapping labels; the frame's index is their
# union and the gaps are filled with NaN
df = pd.DataFrame(
    {
        "one": pd.Series(np.random.randn(3), index=list("abc")),
        "two": pd.Series(np.random.randn(4), index=list("abcd")),
        "three": pd.Series(np.random.randn(3), index=list("bcd")),
    }
)
df

Unnamed: 0,one,two,three
a,-0.280607,-0.200081,
b,0.299217,0.737414,-0.622432
c,1.236053,-0.039672,-0.540458
d,,-0.116184,0.217293


In [112]:
# Reorder rows and columns in one call; the unknown row label 'f' yields a row of NaN
df = df.reindex(
    index=list("cfb"),
    columns=["three", "two", "one"],
)
df

Unnamed: 0,three,two,one
c,-0.540458,-0.039672,1.236053
f,,,
b,-0.622432,0.737414,0.299217


### Dropping rows and columns

Using `df.drop()`

In [121]:
# The frame as it stands before anything is dropped
df

Unnamed: 0,three,two,one
c,-0.540458,-0.039672,1.236053
f,,,
b,-0.622432,0.737414,0.299217


In [122]:
# Drop the column labelled "one"; drop() returns a new frame, so rebind df
df = df.drop(columns="one")
df

Unnamed: 0,three,two
c,-0.540458,-0.039672
f,,
b,-0.622432,0.737414


In [123]:
# Drop the row labelled "b"
df = df.drop(index="b")
df

Unnamed: 0,three,two
c,-0.540458,-0.039672
f,,


`rename()`

In [124]:
# rename() accepts a function: capitalise every column label
df = df.rename(columns=str.capitalize)
df

Unnamed: 0,Three,Two
c,-0.540458,-0.039672
f,,


In [129]:
# Relabel both axes in one call using mappings. Only labels that actually change
# need an entry; anything not listed (here the column "Two") is kept as it is.
df = df.rename(columns={"Three": "One"}, index={"c": "A", "f": "B"})
df

Unnamed: 0,One,Two
A,-0.540458,-0.039672
B,,


### .dt Accessor

In [145]:
# Four consecutive days, all at the same time of day
s = pd.Series(
    data=pd.date_range(start='20210424 09:10:12', periods=4),
)
s

0   2021-04-24 09:10:12
1   2021-04-25 09:10:12
2   2021-04-26 09:10:12
3   2021-04-27 09:10:12
dtype: datetime64[ns]

In [146]:
# Take the first timestamp and print its individual components
time = s.iloc[0]
print(
    time.year, time.month, time.day,
    time.hour, time.minute, time.second,
    time.dayofweek,  # Monday is 0, so the 5 printed for this date is a Saturday
)

2021 4 24 9 10 12 5


Using timezone

In [147]:
# Attach a timezone to the naive timestamps; the wall-clock times stay the same
stz = s.dt.tz_localize(tz='US/Eastern')
stz

0   2021-04-24 09:10:12-04:00
1   2021-04-25 09:10:12-04:00
2   2021-04-26 09:10:12-04:00
3   2021-04-27 09:10:12-04:00
dtype: datetime64[ns, US/Eastern]

In [148]:
# Convert to another timezone: the same instants, shown as Pacific wall-clock time
stz.dt.tz_convert('US/Pacific')

0   2021-04-24 06:10:12-07:00
1   2021-04-25 06:10:12-07:00
2   2021-04-26 06:10:12-07:00
3   2021-04-27 06:10:12-07:00
dtype: datetime64[ns, US/Pacific]

### .str Accessor

In [149]:
# A string-dtype Series that includes one missing value
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    dtype="string",
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [150]:
# Vectorised string method through the .str accessor; the missing entry stays <NA>
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

## Sorting

By index

In [151]:
# Three Series with partially overlapping labels, combined on the union of
# their indexes (missing combinations become NaN)
df = pd.DataFrame(
    {
        "one": pd.Series(np.random.randn(3), index=list("abc")),
        "two": pd.Series(np.random.randn(4), index=list("abcd")),
        "three": pd.Series(np.random.randn(3), index=list("bcd")),
    }
)
df

Unnamed: 0,one,two,three
a,-0.528033,-0.426476,
b,-1.585943,0.15773,-0.469036
c,1.551581,0.743612,-2.036175
d,,-2.1273,-1.683829


In [154]:
 unsorted_df = df.reindex(index=['a', 'c', 'd', 'b'],
                          columns=['three', 'two', 'one'])
unsorted_df

Unnamed: 0,three,two,one
a,,-0.426476,-0.528033
c,-2.036175,0.743612,1.551581
d,-1.683829,-2.1273,
b,-0.469036,0.15773,-1.585943


In [155]:
# Sort the rows by index label; a new frame is returned and unsorted_df is left as it was
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,-0.426476,-0.528033
b,-0.469036,0.15773,-1.585943
c,-2.036175,0.743612,1.551581
d,-1.683829,-2.1273,


In [156]:
# Sort the columns alphabetically by label; the row order is untouched
unsorted_df.sort_index(axis="columns")

Unnamed: 0,one,three,two
a,-0.528033,,-0.426476
c,1.551581,-2.036175,0.743612
d,,-1.683829,-2.1273
b,-1.585943,-0.469036,0.15773


By values

In [158]:
 df1 = pd.DataFrame({'one': [2, 1, 1, 1],
                        'two': [1, 3, 2, 4],
                        'three': [5, 4, 3, 2]})
df1

Unnamed: 0,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [161]:
# Sort the rows by the values in column "two", largest first
df1.sort_values(by="two", ascending=False)

Unnamed: 0,one,two,three
3,1,4,2
1,1,3,4
2,1,2,3
0,2,1,5


In [162]:
# Sort by "one" first; rows that tie on "one" are then ordered by "two"
df1.sort_values(by=["one", "two"])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


> NA values will appear at the bottom
>
> `na_position="first"` will put them at the top