In [1]:
# Two libraries to be imported:
import pandas as pd
import numpy as np # When we do data analysis using pandas we also require numpy to operate it.

# Series:
A Series is a one-dimensional labelled (indexed) array that can hold any type of data, such as integers, floats, strings, Python objects, etc.

The basic method to create a series is:
> pandas.Series(data,index)

> where, data can be:-
1. ndarray
2. python dictionary
3. scalar value (e.g: 4)

>and 'index' is the list of axis labels

## Data as ndarray:-

In [2]:
# Build a Series from a NumPy ndarray, giving every element an explicit label.
# The index must contain exactly as many labels as there are data values.
s = pd.Series(data=np.array([1, 2, 3, 4, 5]), index=list("abcde"))
s

a    1
b    2
c    3
d    4
e    5
dtype: int32

In [3]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# Passing `index` is optional: when it is omitted, pandas generates a default
# integer index running from 0 to len(data)-1.
# No random seed is set in this cell, so the values differ on every run.
pd.Series(np.random.randn(5))

0    0.029643
1   -0.670353
2   -0.156167
3   -1.534679
4   -0.444513
dtype: float64

## Data as Dictionary:-

In [5]:
# Build a Series from a dictionary: the keys become the index labels
# and the values become the data.
dic = dict(a=1, b=2, c=3, d=4, e=5)
s1 = pd.Series(data=dic)
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [6]:
# When an explicit index is supplied together with a dict, the result follows
# the order of that index. The label 'f' has no matching key in the dict, so
# its value is NaN ("not a number"), the standard missing-data marker.
pd.Series(data=dic, index=list("cdafbe"))

c    3.0
d    4.0
a    1.0
f    NaN
b    2.0
e    5.0
dtype: float64

## Data as Scalar Value:-

In [7]:
# A scalar value is repeated once for every label of the supplied index.
s2 = pd.Series(data=4.0, index=list("abc"))
s2

a    4.0
b    4.0
c    4.0
dtype: float64

## Series is like ndarray:-

This means you can access, index and slice the data in the same way as you would with an ndarray.

In [8]:
# Consider the above series
s

a    1
b    2
c    3
d    4
e    5
dtype: int32

In [9]:
# Indexing by position: .iloc fetches the element at integer position 0.
# Plain s[0] on this string-labelled Series relies on a deprecated fallback of
# Series.__getitem__ that treats an integer key as a position.
s.iloc[0]

1

In [10]:
# Slicing by position: the index labels are sliced together with the data.
s.iloc[:2]

a    1
b    2
dtype: int32

In [11]:
# Keep only the values strictly greater than the median;
# the median of [1,2,3,4,5] is 3.
s.loc[s.gt(s.median())]

d    4
e    5
dtype: int32

In [12]:
# Select several elements by integer position. .iloc is used because passing
# a list of integers to s[...] on a string-labelled Series relies on the
# deprecated positional fallback of Series.__getitem__.
s.iloc[[0, 1, 2, 3]]

a    1
b    2
c    3
d    4
dtype: int32

In [13]:
np.exp(s)

a      2.718282
b      7.389056
c     20.085537
d     54.598150
e    148.413159
dtype: float64

In [14]:
s

a    1
b    2
c    3
d    4
e    5
dtype: int32

In [15]:
s.array

<PandasArray>
[1, 2, 3, 4, 5]
Length: 5, dtype: int32

In [16]:
# To get the underlying data of a Series as an actual NumPy ndarray,
# use Series.to_numpy().
s.to_numpy()

array([1, 2, 3, 4, 5])

## Series is like dictionary:-

Series is like a fixed size dictionary in which you can get and set values using index label.

In [17]:
s['a']

1

In [18]:
s['d']

4

In [19]:
s['e']=10

In [20]:
s['e']

10

In [21]:
s['d']=9

In [22]:
s['d']

9

In [23]:
s

a     1
b     2
c     3
d     9
e    10
dtype: int32

In [24]:
'b' in s

True

In [25]:
'f' in s

False

## Vectorized Operations and Label Alignment with Series:-

In [26]:
s

a     1
b     2
c     3
d     9
e    10
dtype: int32

In [27]:
s+2

a     3
b     4
c     5
d    11
e    12
dtype: int32

In [28]:
s*2

a     2
b     4
c     6
d    18
e    20
dtype: int32

In [29]:
s/2

a    0.5
b    1.0
c    1.5
d    4.5
e    5.0
dtype: float64

In [30]:
np.cos(s)

a    0.540302
b   -0.416147
c   -0.989992
d   -0.911130
e   -0.839072
dtype: float64

In [31]:
# Arithmetic between two Series aligns them on their labels. The result is
# indexed by the union of both indexes; a label present in only one of the
# operands ('a' and 'e' here) gets the missing-data marker NaN.
s.iloc[1:] + s.iloc[:-1]

a     NaN
b     4.0
c     6.0
d    18.0
e     NaN
dtype: float64

## name attribute:-

In [32]:
# The name attribute is optional. It gives a name to the Series object, i.e.
# to the one-dimensional dataset (the column).
# The name can also be passed when the Series is constructed, e.g.:
# s=pd.Series(np.array([1,2,3,4,5]),index=['a','b','c','d','e'],name='Data')
s.name="Data"

In [33]:
s.name

'Data'

## rename() method:-

In [34]:
s3=s.rename('Data1')

In [35]:
s3.name
# Note: here s and s3 are two different objects

'Data1'

# DataFrame:

DataFrame is a 2D labelled data structure. You can think of it as a spreadsheet or an SQL table or a dictionary of Series objects.

The DataFrame also accepts different kinds of input:-
1. Dict of 1D ndarrays, lists, dicts, or Series
2. 2-D numpy.ndarray
3. Structured or record ndarray
4. A Series
5. Another DataFrame

DataFrame can be created as:
> pandas.DataFrame(data,index,columns)

> where, 
>>data can be of different types as mentioned above

>>index (the row labels; in simple words, it acts as the serial number of each row) and columns (the column labels or column names) are optional arguments

## Data as Dictionary of Series:-

In [36]:
import pandas as pd
import numpy as np

In [37]:
# Dictionary of Series: every key becomes a column, and the row index of the
# resulting DataFrame is the union of the indexes of the individual Series.
d = {
    'one': pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde')),
    'two': pd.Series(data=['first', 'second', 'third', 'fourth'], index=list('abcd')),
}
data = pd.DataFrame(data=d)
data

Unnamed: 0,one,two
a,1,first
b,2,second
c,3,third
d,4,fourth
e,5,


In [38]:
data.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [39]:
data.columns

Index(['one', 'two'], dtype='object')

In [40]:
# None of the requested column labels ('A', 'B') matches a key of d
# ('one', 'two'), so every cell is filled with the missing-data marker NaN.
data=pd.DataFrame(d,index=['a','b','c'],columns=['A','B'])
data

Unnamed: 0,A,B
a,,
b,,
c,,


In [41]:
data=pd.DataFrame(d,index=['a','b','e'],columns=['one','two'])
data 

Unnamed: 0,one,two
a,1,first
b,2,second
e,5,


In [42]:
# None of the requested row labels (0 to 4) exists in the indexes of the
# Series stored in d ('a' to 'e'), so every cell is filled with NaN.
data=pd.DataFrame(d,index=[0,1,2,3,4],columns=['one','two'])
data

Unnamed: 0,one,two
0,,
1,,
2,,
3,,
4,,


## Data as Dictionary of ndarray or list:-

In [43]:
# Dictionary of lists: each key becomes a column, and all the lists
# must have the same length.
dic = dict(one=[1, 2, 3, 4, 5], two=list('abcde'))
data = pd.DataFrame(data=dic)
data

Unnamed: 0,one,two
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e


In [44]:
# Custom row labels: the index must have the same length as the lists.
data = pd.DataFrame(data=dic, index=list('ABCDE'))
data

Unnamed: 0,one,two
A,1,a
B,2,b
C,3,c
D,4,d
E,5,e


## Data as Series:-

In [45]:
# A default integer index is generated for the rows. The column takes the name
# of the Series (unless other column names are provided); this Series has no
# name, so the column is labelled 0.
data=pd.DataFrame(pd.Series([1,2,3,4,5]))
data

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [46]:
# A named Series becomes a one-column DataFrame whose column carries that name.
# The column name could also be supplied through the DataFrame constructor:
# pd.DataFrame(pd.Series([1,2,3,4,5]), columns=['Numbers'])
data = pd.Series([1, 2, 3, 4, 5], name="Numbers").to_frame()
data

Unnamed: 0,Numbers
0,1
1,2
2,3
3,4
4,5


## Data as Structured or record array:-

In [47]:
# Structured array with three fields: a 4-byte integer (A), a 4-byte float (B)
# and a fixed-width byte string of at most 10 bytes (C).
# 'S10' replaces the legacy 'a10' spelling: the 'a' type code is an old alias
# of 'S' that NumPy 2.0 and later no longer accept.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
data[:] = [(1, 2., 'Hello'), (2, 3., 'Hey')]
# Each field becomes a column; field C holds bytes values (b'Hello', b'Hey').
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'Hey'


## Data as List of dictionaries:-

In [48]:
# List of dictionaries: each dict is one row and its keys are the column names.
# The explicit index labels the two rows 1 and 2.
l = [dict(one=1, two=2), dict(one=3, two=4)]
data = pd.DataFrame(data=l, index=[1, 2])
data

Unnamed: 0,one,two
1,1,2
2,3,4


## Column selection, addition and deletion:-

In [49]:
# The dataframe is somewhat like a dictionary. The selection, addition and deletion in dataframe works with the same syntax as for the dictionary.
# consider the above data table
data

Unnamed: 0,one,two
1,1,2
2,3,4


In [50]:
# selecting column
data['one']

1    1
2    3
Name: one, dtype: int64

In [51]:
# adding column
data['three']=[10,30]
data

Unnamed: 0,one,two,three
1,1,2,10
2,3,4,30


In [52]:
# deleting a column
del data['two']
data

Unnamed: 0,one,three
1,1,10
2,3,30


In [53]:
# adding one more column
data['four']=4
data

Unnamed: 0,one,three,four
1,1,10,4
2,3,30,4


In [54]:
# assign() derives a new column from existing ones. It does not modify the
# original DataFrame: it returns a new DataFrame that is a copy of the old one
# plus the new column.
data1 = data.assign(five=lambda frame: frame['one'] * frame['four'])
data1

Unnamed: 0,one,three,four,five
1,1,10,4,4
2,3,30,4,12


## Basics of indexing / selection in dataframe:-

In [55]:
data

Unnamed: 0,one,three,four
1,1,10,4
2,3,30,4


In [56]:
# to select column
data['one']

1    1
2    3
Name: one, dtype: int64

In [57]:
# to select row by label or index
data.loc[2]

one       3
three    30
four      4
Name: 2, dtype: int64

In [58]:
# to select a row by integer location
# .iloc counts rows by position starting at 0, regardless of the index labels,
# so position 0 is the row labelled 1
data.iloc[0]

one       1
three    10
four      4
Name: 1, dtype: int64

In [59]:
data.iloc[1]

one       3
three    30
four      4
Name: 2, dtype: int64

## Transposing Attribute:-

In [60]:
# T attribute is used to transpose the dataframe
data.T

Unnamed: 0,1,2
one,1,3
three,10,30
four,4,4


In [62]:
# info() method is used to get the information of the dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 1 to 2
Data columns (total 3 columns):
one      2 non-null int64
three    2 non-null int64
four     2 non-null int64
dtypes: int64(3)
memory usage: 144.0 bytes


In [64]:
# the dtypes attribute gives the data type of each column of the dataframe
data.dtypes

one      int64
three    int64
four     int64
dtype: object