In [1]:
# Pandas is a tool for data processing which helps in data analysis.
# It provides functions and methods to efficiently manipulate large datasets.
# The datasets may be the Large Excel sheets that we can access through rows and columns
# Data Structures in Pandas:
# 1. Series (One Dimensional array): Series is one dimensional array with labels.
# It can contain any data type including int, string, float, python objects
# Index:1 2 3 4 5
# Data: A B C D E
# 2. DataFrame(Two Dimensional array): It is a two dimensional data structure with labels.
# We can use labels to locate data. (Row and Column index.)

In [3]:
import pandas as pd
print(pd.__version__)

0.23.4


In [4]:
# Series create, manipulate, querry, delete
# creating a series from a list
arr = [0,1,2,3,4]
s1 = pd.Series(arr)
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [5]:
order = [1,2,3,4,5] # Changing the order indexing from 1 rather than 0 manually.
s2 = pd.Series(arr, index=order)
s2

1    0
2    1
3    2
4    3
5    4
dtype: int64

In [6]:
order = ['a','b','c','d','e'] # Changing the order indexing to abcde
s2 = pd.Series(arr, index=order)
s2

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [7]:
# Create series from dictionary
d = {'a':1,'b':2,'c':3,'d':4,'e':5}
s3 = pd.Series(d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [9]:
# You can modify the index of series
print(s1)
s1.index = ['A','B','C','D','E']
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64


A    0
B    1
C    2
D    3
E    4
dtype: int64

In [12]:
# Slicing
s1[:3] # First 3

A    0
B    1
C    2
dtype: int64

In [13]:
s4 = s1.append(s3)
s4

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64

In [14]:
s4.drop('e') # Drop the element by index

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
dtype: int64

In [15]:
s4 # Chnages are temporary

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64

In [17]:
# Series Operations
arr1 = [0,1,2,3,4,5,7]
arr2 = [6,7,8,9,5]
s5 = pd.Series(arr2)
s5

0    6
1    7
2    8
3    9
4    5
dtype: int64

In [18]:
s6 = pd.Series(arr1)
s6

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

In [19]:
s5.add(s6) # It will directly add two arrays.

0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64

In [20]:
s5.sub(s6) # It will directly subtract two arrays.

0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64

In [21]:
s5.mul(s6)

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

In [22]:
s5.div(s6)

0         inf
1    7.000000
2    4.000000
3    3.000000
4    1.250000
5         NaN
6         NaN
dtype: float64

In [25]:
print('Median: ',s6.median())
print('Max: ' ,s6.max())
print('Min: ' ,s6.min())
# It will drop values with 'NaN' and continue with other numbers

Median:  3.0
Max:  7
Min:  0


In [29]:
# Create Dataframe
import numpy as np
dates = pd.date_range('today', periods=6) # Define time sequence as index
num_arr=np.random.randn(6,4)
columns = ['A','B','C','D'] # Use the table as the column name

df1 = pd.DataFrame(num_arr,index = dates, columns=columns)
df1

Unnamed: 0,A,B,C,D
2019-06-04 20:27:24.307259,-0.211384,-0.564417,-0.340311,-0.192919
2019-06-05 20:27:24.307259,-1.841954,0.026925,0.225975,2.889779
2019-06-06 20:27:24.307259,0.253659,0.861772,-1.276899,-1.564184
2019-06-07 20:27:24.307259,0.2803,0.50819,0.240079,-1.157485
2019-06-08 20:27:24.307259,1.702729,1.470402,-0.348444,0.166493
2019-06-09 20:27:24.307259,0.984349,0.903009,-1.407656,-0.330107


In [31]:
data = {'animal': ['cat','cat','snake','dog','dog','cat','snake','cat','dog','dog'],
       'age': [2.5,3,0.5,np.nan, 5,2,4.5,np.nan,7,3],
       'visits': [1,3,2,3,2,3,1,1,2,1],
       'priority': ['yes','yes','no','yes','no','no','no','yes','no','no'],}

labels = ['a','b','c','d','e','f','g','h','i','j']

df2 = pd.DataFrame(data, index=labels)
df2


Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [32]:
df2.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

In [35]:
df2.head() # Default it will show first 5 we can pass parameter in that head(2) etc.

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no


In [36]:
df2.tail() # Default it will show last 5 we can pass parameter in that tail(2) etc.

Unnamed: 0,animal,age,visits,priority
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [39]:
print(df2.index)
df2.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')


Index(['animal', 'age', 'visits', 'priority'], dtype='object')

In [40]:
df2.values

array([['cat', 2.5, 1, 'yes'],
       ['cat', 3.0, 3, 'yes'],
       ['snake', 0.5, 2, 'no'],
       ['dog', nan, 3, 'yes'],
       ['dog', 5.0, 2, 'no'],
       ['cat', 2.0, 3, 'no'],
       ['snake', 4.5, 1, 'no'],
       ['cat', nan, 1, 'yes'],
       ['dog', 7.0, 2, 'no'],
       ['dog', 3.0, 1, 'no']], dtype=object)

In [41]:
df2.describe() # See the statistical data of dataframe

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [42]:
df2.T # Transpose

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
animal,cat,cat,snake,dog,dog,cat,snake,cat,dog,dog
age,2.5,3,0.5,,5,2,4.5,,7,3
visits,1,3,2,3,2,3,1,1,2,1
priority,yes,yes,no,yes,no,no,no,yes,no,no


In [43]:
df2.sort_values(by='age')

Unnamed: 0,animal,age,visits,priority
c,snake,0.5,2,no
f,cat,2.0,3,no
a,cat,2.5,1,yes
b,cat,3.0,3,yes
j,dog,3.0,1,no
g,snake,4.5,1,no
e,dog,5.0,2,no
i,dog,7.0,2,no
d,dog,,3,yes
h,cat,,1,yes


In [44]:
# Slicing the dataframe
df2[1:3]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [45]:
df2.sort_values(by='age')[1:3]

Unnamed: 0,animal,age,visits,priority
f,cat,2.0,3,no
a,cat,2.5,1,yes


In [46]:
# Query dataframes by tag
df2[['age', 'visits']]

Unnamed: 0,age,visits
a,2.5,1
b,3.0,3
c,0.5,2
d,,3
e,5.0,2
f,2.0,3
g,4.5,1
h,,1
i,7.0,2
j,3.0,1


In [47]:
df2.iloc[1:3] # Integer Location Query rows 2,3

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [48]:
df3 = df2.copy()
df3

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [49]:
df3.isnull()

Unnamed: 0,animal,age,visits,priority
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,False,True,False,False
e,False,False,False,False
f,False,False,False,False
g,False,False,False,False
h,False,True,False,False
i,False,False,False,False
j,False,False,False,False


In [51]:
df3.loc['f', 'age']=1.5
df3

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [52]:
df3[['age']].mean() # We can take mean of all df3

age    3.375
dtype: float64

In [54]:
df3['visits'].sum()

19

In [55]:
df3['visits'].max()

3

In [56]:
df3.sum()

animal      catcatsnakedogdogcatsnakecatdogdog
age                                         27
visits                                      19
priority              yesyesnoyesnononoyesnono
dtype: object

In [3]:
import pandas as pd
import numpy as np
string = pd.Series(['A','C','D','Aaa','BaCa', np.nan, 'CBA', 'cow', 'owl'])
string.str.lower()

0       a
1       c
2       d
3     aaa
4    baca
5     NaN
6     cba
7     cow
8     owl
dtype: object

In [4]:
string.str.upper()

0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object

In [6]:
# Operation for DataFrame missing value
df4 = df3.copy()
df4

NameError: name 'df3' is not defined