# Pandas

In [14]:
import pandas as pd

## Series Data Structure

In [15]:
# think of a series as a cross between a list and a dictionary (indexed data structure)
# one of the easiest ways to create a string is to use an array like object
students = ['Yash', 'Aryan', 'Rohan']
pd.Series(students)
# the values are automatically indexed using integers and the datatype is automatically identified

0     Yash
1    Aryan
2    Rohan
dtype: object

In [16]:
# underneath, pandas stores series values in a typed array using the Numpy library
# eg
numbers = [1, 2, 3]
pd.Series(numbers) # dtype = int64

0    1
1    2
2    3
dtype: int64

In [17]:
# none type data - strings
students = ['Yash', 'Aryan', None]
print(pd.Series(students))

# none type data - numbers
numbers = [1, 2, None]
print(pd.Series(numbers)) # NaN - Not a Number
# dtype of this series is set to float64 because NaN is considered a floating point

# Nan is not equivalent to None, it is a numeric value

0     Yash
1    Aryan
2     None
dtype: object
0    1.0
1    2.0
2    NaN
dtype: float64


In [18]:
# creating series using dictionaries
subjects = {'Yash' : 'Physics',
            'Hrisheka' : 'Chemistry',
            'Rohan' : 'Mathematics'}

s = pd.Series(subjects)
print(s)

# retrieve a list of the key values
print(s.index)

Yash            Physics
Hrisheka      Chemistry
Rohan       Mathematics
dtype: object
Index(['Yash', 'Hrisheka', 'Rohan'], dtype='object')


In [19]:
# arbitrary data type
lst = [(1, 'apple'), (2, 'banana'), (3, 'cherry')]
print(pd.Series(lst)) 
# this dtype is also set to object

0     (1, apple)
1    (2, banana)
2    (3, cherry)
dtype: object


In [20]:
# we can also set the index of a list explicitly
x = pd.Series(['Physics', 'Chemisty', 'Math'], index = ['Alice', 'Jack', 'Molly'])
print(x)

Alice     Physics
Jack     Chemisty
Molly        Math
dtype: object


In [21]:
# eg - list of values in the index are not in line with keys in dictionary
subjects = {'Yash' : 'Physics',
            'Hrisheka' : 'Chemistry',
            'Rohan' : 'Mathematics'}

s = pd.Series(subjects, index = ['Yash', 'Hrisheka', 'Rajesh'])
print(s)
# pandas will override the automatic creation, and provide values for only the specified indices

Yash          Physics
Hrisheka    Chemistry
Rajesh            NaN
dtype: object


we have seen how series are created from lists and dictionaries. we have also seen how indices work and how data types works.

## Querying a Series

In [22]:
classes = {'Alice' : 'Physics',
           'Jack' : 'Chemistry',
           'Molly' : 'English',
           'Sam' : 'History'}
s = pd.Series(classes)
print(s)

# retrieve data using index number
print(s.iloc[3])

#retrieve data using dictionary key
print(s.loc['Molly'])

# iloc and loc are not methods, they are attributes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object
History
English


In [23]:
# we can use the indexing operator directly on the series itself
print(s[3]) # assumes s.iloc
print(s['Molly']) # assumes s.loc

History
English


  print(s[3]) # assumes s.iloc


In [24]:
# however, using iloc or loc are safer options, eg:
class_code = {99 : 'English',
              100 : 'Math',
              101 : 'Physics'}

s1 = pd.Series(class_code)
print(s1)
print(s1.iloc[0]) # here, the value inside [] will look for the keys, not the position, so we have to use ilo



99     English
100       Math
101    Physics
dtype: object
English


In [25]:
s1.iloc[0]

'English'

### Performing Operations on Series'

In [26]:
# we can perform operations on items in a series by iterating through all items in the series
grades = pd.Series([90, 60, 70, 80])

# find the average score
total = 0
for i in grades:
    total += i 
print(total / len(grades))

75.0


In [48]:
# we can also do this in a faster way, using the numpy library
import numpy as np
print(np.sum(grades) / len(grades))

75.0


In [28]:
# in order to test wich method is faster
numbers = pd.Series(np.random.randint(0, 1000, 10000)) # 10000 random integers between 0 nd 1000
print(numbers.head()) # prints first 5 elements
print(len(numbers))


0    174
1    477
2    407
3    186
4    978
dtype: int64
10000


In [29]:
%%timeit -n 100
# method 1
total = 0
for i in numbers:
    total += i
total / len(numbers)

649 μs ± 25.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
%%timeit -n 100
# method 2
np.sum(numbers) / len(numbers)

11 μs ± 4.66 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [31]:
# we can perform arithmetic operations on pandas series
numbers += 2
print(numbers.head())

0    176
1    479
2    409
3    188
4    980
dtype: int64


In [32]:
# we can iterate though a pandas series much like a dictionary, using iteritems()
for label, value in numbers.items():
    numbers[label] = value + 2
numbers


0       178
1       481
2       411
3       190
4       982
       ... 
9995     74
9996    551
9997     33
9998    515
9999    120
Length: 10000, dtype: int64

In [33]:
%%timeit -n 10
# method 1
srs = pd.Series(np.random.randint(1, 1000, 10000))
for label, value in srs.items():
    srs[label] = value + 2

30.1 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [34]:
%%timeit -n 10
# method 2
srs = pd.Series(np.random.randint(1, 1000, 10000))
srs += 2
srs

# this method is more concise and easier to read

88.7 μs ± 39.7 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
# .loc attribute can be used to change data as well as create new data
s2 = pd.Series([1, 2, 3])
print(s2)
s2.loc[2] = 5
print(s2)
s2.loc['History'] = 'Yash'
print(s2)

0    1
1    2
2    3
dtype: int64
0    1
1    2
2    5
dtype: int64
0             1
1             2
2             5
History    Yash
dtype: object


In [47]:
# we can also have a case where the index values are not unique, eg:
classes = pd.Series({'Yash' : 'English',
                     'Hrisheka' : 'French',
                     'Rahul' : 'Geography'})
print(classes)

# new student joins
dev_classes = pd.Series(['Philosophy', 'History', 'Art'], index = ['Dev', 'Dev', 'Dev'])
print(dev_classes)

# we can combine the two dictionaries
all_classes = pd.concat([classes, dev_classes])
print(all_classes)

# now we use the loc attribute on Dev
print(all_classes.loc['Dev']) # we get a series of values as our output


Yash          English
Hrisheka       French
Rahul       Geography
dtype: object
Dev    Philosophy
Dev       History
Dev           Art
dtype: object
Yash           English
Hrisheka        French
Rahul        Geography
Dev         Philosophy
Dev            History
Dev                Art
dtype: object


Dev    Philosophy
Dev       History
Dev           Art
dtype: object