In [2]:
# Pandas - dataframes
#class pandas.DataFrame(data=None, 
#                         index=None, 
#                         columns=None, 
#                         dtype=None, 
#                         copy=False)[source]
import pandas as pd
import numpy as np

In [3]:
data = {'Country': ['Belgium', 'India', 'Brazil'],
        'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
        'Population': [122020, 7774744, 3664838]}

df = pd.DataFrame(data)
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,122020
1,India,New Delhi,7774744
2,Brazil,Brasilia,3664838


In [4]:
type(df)

pandas.core.frame.DataFrame

In [5]:
# Constructing DataFrame from a dictionary.
# Example - 1 
data_dict = {'col1': [1, 2], 'col2': [3, 4], 'col3': ['a','b']}

In [6]:
# create the dataframe
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,col1,col2,col3
0,1,3,a
1,2,4,b


In [7]:
# major dataype
df.dtypes


col1     int64
col2     int64
col3    object
dtype: object

In [9]:
# shape of the dataframe
df.shape

(2, 3)

In [8]:
# Example - 2 
data_dict = {'one' : pd.Series([1., 2., 3.],     index=['a', 'b', 'c']),
             'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

In [9]:
df = pd.DataFrame(data_dict)

In [10]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [11]:
df = pd.DataFrame(data_dict, index=['d', 'b', 'a'])
df

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [14]:
df= pd.DataFrame(data_dict, index=['d', 'b', 'a'], columns=['PYTHON', 'SPARK'])
df

Unnamed: 0,PYTHON,SPARK
d,,
b,,
a,,


In [15]:
df= pd.DataFrame(data_dict, index=['d', 'b', 'a'], columns=['two', 'one'])
df

Unnamed: 0,two,one
d,4.0,
b,2.0,2.0
a,1.0,1.0


In [12]:
df.index

Index(['d', 'b', 'a'], dtype='object')

In [13]:
df.columns

Index(['one', 'two'], dtype='object')

In [17]:
# Constructing DataFrame from numpy ndarray/Lists
# Example - 1
data = {'one' : [1., 2., 3., 4.],
        'two' : [4., 3., 2., 1.]}

In [18]:
pd.DataFrame(data)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [19]:
pd.DataFrame(data, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [14]:
# From a list of dicts
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

In [15]:
type(data)

list

In [21]:
pd.DataFrame(data)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [22]:
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [24]:
pd.DataFrame(data, columns=['a', 'b'], index=['first', 'second'])

Unnamed: 0,a,b
first,1,2
second,5,10


In [16]:
# Column selection, addition, deletion
data_dict = {'one' : pd.Series([1., 2., 3.],     index=['a', 'b', 'c']),
             'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df= pd.DataFrame(data_dict)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [20]:
df1=df['one']

In [21]:
df1

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [24]:
# Column selection, addition, deletion

df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [25]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [22]:
df['three'] = df['one'] * df['two']

In [23]:
df

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


In [24]:
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [29]:
del df['three']
df

Unnamed: 0,one,two,flag
a,1.0,1.0,False
b,2.0,2.0,False
c,3.0,3.0,True
d,,4.0,False


In [30]:
# When inserting a scalar value, it will naturally be propagated to fill 
# the column:
df['foo'] = 'bar'
df

Unnamed: 0,one,two,flag,foo
a,1.0,1.0,False,bar
b,2.0,2.0,False,bar
c,3.0,3.0,True,bar
d,,4.0,False,bar


In [33]:
# handling missing values


In [25]:
series1 = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd', 'e'])
series2 = pd.Series([1,2,3,4,5], index=['c', 'e', 'f', 'g', 'h'])

In [26]:
sum_series = series1 + series2
sum_series

a    NaN
b    NaN
c    4.0
d    NaN
e    7.0
f    NaN
g    NaN
h    NaN
dtype: float64

In [None]:
len(df['a'].isnull()

In [29]:
sum_series.isnull()

SyntaxError: unexpected EOF while parsing (<ipython-input-29-893173eb5a00>, line 1)

In [35]:
sum_series.dropna()

c    4.0
e    7.0
dtype: float64

In [36]:
sum_series

a    NaN
b    NaN
c    4.0
d    NaN
e    7.0
f    NaN
g    NaN
h    NaN
dtype: float64

In [37]:
dropped_na = sum_series.dropna()
dropped_na

c    4.0
e    7.0
dtype: float64

In [38]:
sum_series.fillna(100)

a    100.0
b    100.0
c      4.0
d    100.0
e      7.0
f    100.0
g    100.0
h    100.0
dtype: float64

In [31]:
# reading from external file
df = pd.read_csv('E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\iris.csv', sep=',')


In [33]:
df.head(7)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa


In [34]:
# Assigning New Columns 
df1=df.assign(sepal_ratio = df['sepal_width'] / df['petal_length'])

In [35]:
df['sepal_ratio1'] = df['sepal_width'] / df['sepal_length']

In [36]:
df1.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class,sepal_ratio
0,5.1,3.5,1.4,0.2,Iris-setosa,2.5
1,4.9,3.0,1.4,0.2,Iris-setosa,2.142857
2,4.7,3.2,1.3,0.2,Iris-setosa,2.461538
3,4.6,3.1,1.5,0.2,Iris-setosa,2.066667
4,5.0,3.6,1.4,0.2,Iris-setosa,2.571429


In [98]:
# Indexing / Selection 

In [37]:
df.loc[140]

sepal_length               6.7
sepal_width                3.1
petal_length               5.6
petal_width                2.4
class           Iris-virginica
sepal_ratio1          0.462687
Name: 140, dtype: object

In [102]:
# Data alignment and arithmetic

In [38]:
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])

In [39]:
df1

Unnamed: 0,A,B,C,D
0,-1.335286,1.107863,0.809134,-1.150537
1,1.154141,0.071517,-1.05595,0.956442
2,-1.595736,0.367472,-0.216715,-0.403777
3,-0.570134,-2.141959,0.625648,0.832861
4,-1.360187,0.053981,-0.797957,0.581436
5,1.264176,-0.389289,1.590848,-0.328245
6,2.526211,0.049261,-0.725603,-0.884483
7,-1.465417,-0.787886,1.660989,-0.335721
8,-0.469233,0.825582,-1.431205,-1.12474
9,0.013209,0.264547,0.512835,-0.83001


In [40]:
df2

Unnamed: 0,A,B,C
0,-0.143725,-0.536106,-0.457485
1,-0.618142,1.294114,1.416328
2,-0.438727,-0.949875,0.451918
3,-0.644482,-0.194543,-0.153764
4,0.201983,0.431096,0.130574
5,2.854452,-0.133305,-0.645145
6,0.612714,1.084934,1.25268


In [41]:
df1 + df2

Unnamed: 0,A,B,C,D
0,-1.479011,0.571757,0.351649,
1,0.535999,1.365631,0.360378,
2,-2.034463,-0.582403,0.235203,
3,-1.214616,-2.336502,0.471884,
4,-1.158204,0.485076,-0.667383,
5,4.118628,-0.522593,0.945703,
6,3.138925,1.134196,0.527078,
7,,,,
8,,,,
9,,,,


In [43]:
df1[['A', 'B']]

Unnamed: 0,A,B
0,-1.335286,1.107863
1,1.154141,0.071517
2,-1.595736,0.367472
3,-0.570134,-2.141959
4,-1.360187,0.053981
5,1.264176,-0.389289
6,2.526211,0.049261
7,-1.465417,-0.787886
8,-0.469233,0.825582
9,0.013209,0.264547


In [50]:
# boolean operators
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)

df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool)

In [51]:
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [52]:
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [53]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [111]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [112]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [113]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [114]:
# Transposing

In [115]:
df = pd.DataFrame(np.random.randn(8, 3), columns=list('ABC'))

In [116]:
df

Unnamed: 0,A,B,C
0,-0.4346,-0.090879,0.465793
1,0.06812,-0.583987,-1.836404
2,0.741085,0.327871,0.93059
3,0.702963,-0.689086,-0.267745
4,0.463092,0.76231,0.075687
5,0.601473,0.239792,0.454828
6,1.715211,0.660913,0.916108
7,0.134098,-0.050159,0.152794


In [117]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7
A,-0.4346,0.06812,0.741085,0.702963,0.463092,0.601473,1.715211,0.134098
B,-0.090879,-0.583987,0.327871,-0.689086,0.76231,0.239792,0.660913,-0.050159
C,0.465793,-1.836404,0.93059,-0.267745,0.075687,0.454828,0.916108,0.152794
