# Pandas

Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

Agenda
* What is a DataFrame?
* What is a Series?
* Different operations in Pandas

In [1]:
## Importing pandas and NumPy
import pandas as pd
import numpy as np

In [2]:
## Creating DataFrame
# 20 consecutive integers (0..19) laid out as 5 rows x 4 columns, with explicit
# string labels for both axes.
df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=['Row1', 'Row2', 'Row3', 'Row4', 'Row5'],
    columns=["Column", "Column2", "Column3", "Column4"],
)

In [3]:
# Preview the first rows; head() shows 5 rows by default, which here is the whole frame.
df.head()

Unnamed: 0,Column,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [4]:
## Creating csv from dataframe
# Writes 'test1.csv' relative to the current working directory. The row labels
# (the index) are written as well, as a first column with an empty header.
df.to_csv('test1.csv')

In [5]:
## Accessing elements: 2 ways
## 1 = .loc  (select by row/column label)
## 2 = .iloc (select by integer position)
df.loc['Row1']  # the row labelled 'Row1'; a single row is returned as a Series

Column     0
Column2    1
Column3    2
Column4    3
Name: Row1, dtype: int32

In [6]:
# A Series is one-dimensional: selecting a single row or a single column of a
# DataFrame gives a Series.
type(df.loc['Row1']) 

pandas.core.series.Series

In [7]:
# First two rows and first two columns by position (the slice end is exclusive).
df.iloc[0:2, 0:2]

Unnamed: 0,Column,Column2
Row1,0,1
Row2,4,5


In [8]:
# Slicing on both axes keeps two dimensions, so the result is a DataFrame.
type(df.iloc[0:2, 0:2])

pandas.core.frame.DataFrame

In [9]:
# First two rows, and a column *slice* that happens to be one column wide.
df.iloc[0:2, 0:1]

Unnamed: 0,Column
Row1,0
Row2,4


In [10]:
# A slice on the column axis, even one column wide, still yields a DataFrame.
type(df.iloc[0:2, 0:1])

pandas.core.frame.DataFrame

In [11]:
# First two rows of the column at position 0, chosen with a plain integer (not a slice).
df.iloc[0:2, 0]

Row1    0
Row2    4
Name: Column, dtype: int32

In [12]:
# An integer on the column axis drops that dimension, so the result is a Series.
type(df.iloc[0:2, 0])

pandas.core.series.Series

In [13]:
# One-row, one-column selection using slices on both axes.
df.iloc[0:1, 0:1]

Unnamed: 0,Column
Row1,0


In [14]:
# Even a 1x1 selection stays a DataFrame when both axes are sliced.
type(df.iloc[0:1, 0:1])

pandas.core.frame.DataFrame

In [15]:
# Row at position 0 (a plain integer), restricted to the first two columns.
df.iloc[0, 0:2]

Column     0
Column2    1
Name: Row1, dtype: int32

In [16]:
# An integer on the row axis drops that dimension, so the result is a Series.
type(df.iloc[0, 0:2])

pandas.core.series.Series

### Converting a DataFrame into arrays

In [17]:
# Select every row and column, then take the underlying NumPy array.
# (The pandas documentation recommends .to_numpy() over .values in new code.)
df.iloc[:, :].values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [18]:
# Skip the first row and the first two columns, then convert what is left to an array.
df.iloc[1:, 2:].values

array([[ 6,  7],
       [10, 11],
       [14, 15],
       [18, 19]])

In [19]:
df.iloc[:,:].values.shape # (rows, columns): 5 rows and 4 columns

(5, 4)

In [20]:
# Display the whole frame (only 5 rows, so no need for head()).
df

Unnamed: 0,Column,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [21]:
# Number of missing values in each column (all zero for this frame).
df.isnull().sum()

Column     0
Column2    0
Column3    0
Column4    0
dtype: int64

In [22]:
df['Column'].value_counts()  # how many times each distinct value occurs in 'Column' (each occurs once here)

0     1
4     1
8     1
12    1
16    1
Name: Column, dtype: int64

In [23]:
df['Column'].unique()  # the distinct values of 'Column', returned as a NumPy array

array([ 0,  4,  8, 12, 16])

In [24]:
# A single label inside [] selects one column, returned as a Series.
df['Column3']

Row1     2
Row2     6
Row3    10
Row4    14
Row5    18
Name: Column3, dtype: int32

In [25]:
# Two labels separated by a comma inside [] form the tuple ('Colum2', 'Column3'),
# which pandas looks up as ONE column key -> KeyError. The first label is also
# misspelled, but the lookup fails the same way with correct spelling; to select
# several columns, pass a list of labels instead of a tuple.
# The exception is the point of this cell, so it is caught and printed: the message
# is still shown, and "Restart Kernel -> Run All" is not stopped by it.
try:
    df['Colum2','Column3'] # Error
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: ('Colum2', 'Column3')

In [26]:
# A list of labels inside [] selects several columns and returns a DataFrame.
df[['Column2','Column3']]

Unnamed: 0,Column2,Column3
Row1,1,2
Row2,5,6
Row3,9,10
Row4,13,14
Row5,17,18


In [30]:
# Structural summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Row1 to Row5
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Column   5 non-null      int32
 1   Column2  5 non-null      int32
 2   Column3  5 non-null      int32
 3   Column4  5 non-null      int32
dtypes: int32(4)
memory usage: 292.0+ bytes


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,Column,Column2,Column3,Column4
count,5.0,5.0,5.0,5.0
mean,8.0,9.0,10.0,11.0
std,6.324555,6.324555,6.324555,6.324555
min,0.0,1.0,2.0,3.0
25%,4.0,5.0,6.0,7.0
50%,8.0,9.0,10.0,11.0
75%,12.0,13.0,14.0,15.0
max,16.0,17.0,18.0,19.0


In [32]:
# Read back the file written earlier with df.to_csv('test1.csv'). No index_col is
# given, so the saved row labels come back as an ordinary column ('Unnamed: 0')
# instead of the index; pd.read_csv('test1.csv', index_col=0) would restore them.
test_df=pd.read_csv('test1.csv')

In [33]:
# The old row labels now sit in a regular 'Unnamed: 0' column, under a fresh 0..4 integer index.
test_df.head()

Unnamed: 0.1,Unnamed: 0,Column,Column2,Column3,Column4
0,Row1,0,1,2,3
1,Row2,4,5,6,7
2,Row3,8,9,10,11
3,Row4,12,13,14,15
4,Row5,16,17,18,19


In [34]:
# The original in-memory frame, for comparison: here Row1..Row5 are the index, not a data column.
df.head()

Unnamed: 0,Column,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19
