## Pandas Data Structures
<ul>
<li> Series </li>
<li> DataFrames </li>
</ul>

### The Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a list; a tuple such as (12, 9, 7, 10, -13) works the same way.
s = pd.Series(data=[12, 9, 7, 10, -13])
# No index was given, so the labels shown are the default positions 0..4,
# matching the positions of the elements in the input list.
s

0    12
1     9
2     7
3    10
4   -13
dtype: int64

In [3]:
# Relabel the elements: replace the default 0..4 index with the letters a..e.
s.index = list("abcde")
print(s)

a    12
b     9
c     7
d    10
e   -13
dtype: int64


Series Attributes
<ul>
<li> values </li>
<li> index </li>
</ul>

In [4]:
s.values

array([ 12,   9,   7,  10, -13], dtype=int64)

In [5]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

<b><i>Methods of Extracting elements from Series.</i></b>

In [6]:
# Positional access (third element), just like indexing an array.
# .iloc is used because the index now holds string labels; a bare integer key
# in s[...] relies on a positional fallback that recent pandas deprecates.
s.iloc[2]

7

In [7]:
s["c"] #extracting element using index name.

7

In [8]:
# Positional slice: positions 0, 1 and 2 (the stop position 3 is excluded).
s.iloc[0:3]

a    12
b     9
c     7
dtype: int64

In [9]:
# Same three elements, selected by label; the output shows the end label "c" is included.
s.loc["a":"c"]

a    12
b     9
c     7
dtype: int64

In [10]:
# Pick out the elements labelled 'a' and 'e' by passing a list of labels.
s.loc[["a", "e"]]

a    12
e   -13
dtype: int64

<i><b>Editing Values</b></i>

In [11]:
print("Originally s:")
print(s)

print() #leave a line

# Overwrite the first element (12 -> 23) by position. .iloc makes the positional
# intent explicit: the index holds string labels, so 0 is not one of its labels.
s.iloc[0] = 23

# Overwrite several elements at once through a label slice ('c', 'd' and 'e').
s.loc['c':'e'] = [15, -90, -21]

print("After editing, s: ")
print(s)

Originally s:
a    12
b     9
c     7
d    10
e   -13
dtype: int64

After editing, s: 
a    23
b     9
c    15
d   -90
e   -21
dtype: int64


<i><b>Other ways of defining series </b></i>

In [12]:
# Start from a NumPy array of 64-bit integers holding 1..5.
arr = np.array(range(1, 6), dtype=np.int64)

In [13]:
# Wrap the array in a Series. copy=False asks pandas to reuse the array's
# memory rather than copying it, so that a later in-place edit of arr is
# visible through s1 regardless of the constructor's default copy behaviour.
s1 = pd.Series(arr, copy=False)
print(s1)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [14]:
# An existing Series can itself be passed to the constructor to build another Series.
s2 = pd.Series(data=s)

In [15]:
# Show the Series that was just built from s (it carries the same labels and values).
print(s2)

a    23
b     9
c    15
d   -90
e   -21
dtype: int64


In [16]:
# A tuple of values plus a tuple of labels also makes a Series.
tup = tuple("ABCD")
s3 = pd.Series(data=tup, index=tuple("abcd"))
print(s3)

a    A
b    B
c    C
d    D
dtype: object


In [17]:
# Changing the source array in place also changes the Series built from it,
# as the output below shows (element 1 of s1 becomes 10).
# NOTE(review): this only holds while s1 shares memory with arr; pandas versions
# that copy NumPy input by default would leave s1 unchanged — confirm.
arr[1] = 10
s1

0     1
1    10
2     3
3     4
4     5
dtype: int64

<b><i> Filtering Values </i></b>

In [18]:
s1 < 5 #returns true for values, less than 5, and false otherwise 

0     True
1    False
2     True
3     True
4    False
dtype: bool

In [19]:
s1[s1<5] #returns the values, which are less than 5

0    1
2    3
3    4
dtype: int64

In [20]:
s1[s1>3]

1    10
3     4
4     5
dtype: int64

<i><b> Operations and Mathematical functions on Series </b></i>

In [21]:
print(s)

a    23
b     9
c    15
d   -90
e   -21
dtype: int64


In [22]:
s%10

a    3
b    9
c    5
d    0
e    9
dtype: int64

In [23]:
s/4

a     5.75
b     2.25
c     3.75
d   -22.50
e    -5.25
dtype: float64

In [24]:
np.log(np.abs(s)) #log and abs functions are in the numpy library, hence, np.log, np.abs is used.

a    3.135494
b    2.197225
c    2.708050
d    4.499810
e    3.044522
dtype: float64

<b><i>Checking a Series for duplicate values</i></b>

In [25]:
# A small Series in which the value 4 appears twice.
label = "first second third fourth".split()
dup_ser = pd.Series(data=[1, 4, 4, -3], index=label)

In [26]:
print(dup_ser)

first     1
second    4
third     4
fourth   -3
dtype: int64


In [27]:
dup_ser.unique()
#Unique function Lists the unique values of the series in an array.

array([ 1,  4, -3], dtype=int64)

In [28]:
type(dup_ser.unique())

numpy.ndarray

In [29]:
dup_ser.value_counts() #counts the no. of times an entry has appeared in the Series.

 4    2
-3    1
 1    1
dtype: int64

In [30]:
s1.isin([1,2,11])
# isin is a Series method: for each element of s1 it reports whether that
# value is one of 1, 2 or 11, giving a boolean Series.

0     True
1    False
2    False
3    False
4    False
dtype: bool

In [31]:
s1[s1.isin([1,2,11])] #returns the values of s1 that are equal to 1, 2 or 11

0    1
dtype: int64

In [32]:
# A Series containing a missing value. NaN (Not a Number) is written as np.nan;
# the capitalised alias np.NaN is not available in NumPy 2.x.
a1 = pd.Series([12, 10, np.nan, 43, 25], index=['a', 'b', 'c', 'd', 'e'])

In [33]:
print(a1)

a    12.0
b    10.0
c     NaN
d    43.0
e    25.0
dtype: float64


In [34]:
a1.isnull()
#the isnull function, returns true, if the value is NaN

a    False
b    False
c     True
d    False
e    False
dtype: bool

In [35]:
a1[a1.isnull()]

c   NaN
dtype: float64

In [36]:
a1.notnull()
#The notnull function, returns true, if the value is not an NaN.

a     True
b     True
c    False
d     True
e     True
dtype: bool

Series as <b>Dictionaries</b>.

In [37]:
# A plain dictionary mapping colour names to numbers.
mydict = dict(red=100, blue=200, yellow=1000, orange=400)
print(mydict)

{'red': 100, 'blue': 200, 'yellow': 1000, 'orange': 400}


In [38]:
a2 = pd.Series(mydict) #casting this dictionary into Series

In [39]:
print(a2)

red        100
blue       200
yellow    1000
orange     400
dtype: int64


In [40]:
# None of the requested labels ("r", "b", "y", "o") is a key of mydict,
# so every entry of the resulting Series is NaN.
a3 = pd.Series(data=mydict, index=list("rbyo"))
print(a3)

r   NaN
b   NaN
y   NaN
o   NaN
dtype: float64


In [41]:
# Labels that are keys of mydict pick up their values; the labels that are
# not keys ("green" and "black") come out as NaN.
colors = "red blue green yellow black orange".split()
a4 = pd.Series(data=mydict, index=colors)

In [42]:
print(a4)

red        100.0
blue       200.0
green        NaN
yellow    1000.0
black        NaN
orange     400.0
dtype: float64


In [43]:
a4+a2

black        NaN
blue       400.0
green        NaN
orange     800.0
red        200.0
yellow    2000.0
dtype: float64

Now defining a dataset with the help of a <b>DataFrame</b>.

In [44]:
# Describe the dataset column by column: each key is a column name and
# each list holds that column's values.
data = dict(
    colors=['red', 'black', 'green', 'yellow'],
    items=['Pen', 'Ball', 'Leaf', 'Cream'],
    prices=[10, 25, 0.19, 45],
)

In [45]:
#Now casting the dictionary into a DataFrame.
df0 = pd.DataFrame(data,index = ['a','b','c','d'])

In [46]:
df0

Unnamed: 0,colors,items,prices
a,red,Pen,10.0
b,black,Ball,25.0
c,green,Leaf,0.19
d,yellow,Cream,45.0


In [47]:
#Defining this new DataFrame, using previous data dictionary, but with only two columns.
frame2 = pd.DataFrame(data, columns = ['items','prices'])
frame2

Unnamed: 0,items,prices
0,Pen,10.0
1,Ball,25.0
2,Leaf,0.19
3,Cream,45.0


<i><b>DataFrame attributes</b></i>

In [48]:
df0.columns

Index(['colors', 'items', 'prices'], dtype='object')

In [49]:
df0.values

array([['red', 'Pen', 10.0],
       ['black', 'Ball', 25.0],
       ['green', 'Leaf', 0.19],
       ['yellow', 'Cream', 45.0]], dtype=object)

In [50]:
df0.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [51]:
frame2.index #since, index is not explicitly defined

RangeIndex(start=0, stop=4, step=1)

In [52]:
# A 4x4 matrix holding the integers 0..15, filled row by row.
A = np.arange(16).reshape(4, 4)

In [53]:
# Turn the matrix into a DataFrame labelled row1..row4 and col1..col4.
df1 = pd.DataFrame(
    A,
    index=["row" + str(k) for k in range(1, 5)],
    columns=["col" + str(k) for k in range(1, 5)],
)

In [54]:
df1

Unnamed: 0,col1,col2,col3,col4
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15


In [55]:
df1.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [56]:
# A 4x4 matrix of random floats scaled to [0, 10) and rounded to whole numbers.
# NOTE(review): no seed is set, so the values differ on every run.
B = np.round(10 * np.random.random(size=(4, 4)))

In [57]:
print(B)

[[4. 0. 4. 5.]
 [8. 5. 7. 4.]
 [4. 2. 8. 8.]
 [6. 5. 6. 6.]]


In [58]:
B.dtype

dtype('float64')

In [59]:
# Labels for the next DataFrame: colours and items.
colors = "red black orange yellow".split()
items = "pen pencil charts balls".split()

In [60]:
df2 = pd.DataFrame(B, index = items, columns = colors)

In [61]:
df2

Unnamed: 0,red,black,orange,yellow
pen,4.0,0.0,4.0,5.0
pencil,8.0,5.0,7.0,4.0
charts,4.0,2.0,8.0,8.0
balls,6.0,5.0,6.0,6.0


In [62]:
# giving a name to the set of columns, and to the set of rows(index)
df2.index.name = 'Items'
df2.columns.name = 'colors'

In [63]:
df2

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pen,4.0,0.0,4.0,5.0
pencil,8.0,5.0,7.0,4.0
charts,4.0,2.0,8.0,8.0
balls,6.0,5.0,6.0,6.0


In [64]:
#ways of selecting columns
df2[["black","yellow"]]

colors,black,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1
pen,0.0,5.0
pencil,5.0,4.0
charts,2.0,8.0
balls,5.0,6.0


In [65]:
#Another way of selecting a column, notice here that, inverted commas are not required.
df2.black

Items
pen       0.0
pencil    5.0
charts    2.0
balls     5.0
Name: black, dtype: float64

In [66]:
#Way of selecting the elements of a row.
df2.loc['balls']

colors
red       6.0
black     5.0
orange    6.0
yellow    6.0
Name: balls, dtype: float64

In [67]:
df2.loc[:]

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pen,4.0,0.0,4.0,5.0
pencil,8.0,5.0,7.0,4.0
charts,4.0,2.0,8.0,8.0
balls,6.0,5.0,6.0,6.0


In [68]:
df2.loc[['pen','charts']]

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pen,4.0,0.0,4.0,5.0
charts,4.0,2.0,8.0,8.0


In [69]:
df2.loc['pen':'charts']

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pen,4.0,0.0,4.0,5.0
pencil,8.0,5.0,7.0,4.0
charts,4.0,2.0,8.0,8.0


In [70]:
df2.head()

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pen,4.0,0.0,4.0,5.0
pencil,8.0,5.0,7.0,4.0
charts,4.0,2.0,8.0,8.0
balls,6.0,5.0,6.0,6.0


In [71]:
df2.tail()

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pen,4.0,0.0,4.0,5.0
pencil,8.0,5.0,7.0,4.0
charts,4.0,2.0,8.0,8.0
balls,6.0,5.0,6.0,6.0


In [72]:
df2.head(2)

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pen,4.0,0.0,4.0,5.0
pencil,8.0,5.0,7.0,4.0


In [73]:
df2.tail(2)

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
charts,4.0,2.0,8.0,8.0
balls,6.0,5.0,6.0,6.0


In [74]:
df2.columns

Index(['red', 'black', 'orange', 'yellow'], dtype='object', name='colors')

In [75]:
df2.index

Index(['pen', 'pencil', 'charts', 'balls'], dtype='object', name='Items')

In [76]:
# How many orange pencils are there ?
# A single .loc lookup addresses one cell directly: the row label comes first,
# then the column label (the reverse of the chained df2[column][row] form).
df2.loc['pencil', 'orange']

7.0

In [77]:
# How many black pens are there ?
# Row label first, then column label, in one .loc lookup.
df2.loc['pen', 'black']

0.0

In [78]:
# Rows at positions 1 and 2 (the stop position 3 is excluded). .iloc makes it
# explicit that this slice selects rows by position, not columns.
df2.iloc[1:3]

colors,red,black,orange,yellow
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pencil,8.0,5.0,7.0,4.0
charts,4.0,2.0,8.0,8.0


In [79]:
# Add an extra column: one random integer in [0, 10) for each row.
# size=len(df2) ties the number of draws to the number of rows instead of
# hard-coding a four-element list of bounds.
# NOTE(review): no seed is set, so the column differs on every run.
df2['green'] = np.random.randint(10, size=len(df2))
df2

colors,red,black,orange,yellow,green
Items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pen,4.0,0.0,4.0,5.0,0
pencil,8.0,5.0,7.0,4.0,5
charts,4.0,2.0,8.0,8.0,0
balls,6.0,5.0,6.0,6.0,9


In [80]:
type(df2)

pandas.core.frame.DataFrame