### Pandas
<p> Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. </p>

In [2]:
# Initial step is to import pandas
import pandas as pd
import numpy as np

In [3]:
# Build a 5x5 demo DataFrame holding the integers 0..24, filled row by row.
# The number of index labels must match the number of rows, and the number of
# column labels must match the number of columns of the data.
df = pd.DataFrame(
    data=np.arange(25).reshape(5, 5),
    index=['Row-01', 'Row-02', 'Row-03', 'Row-04', 'Row-05'],
    columns=['Column-01', 'Column-02', 'Column-03', 'Column-04', 'Column-05'],
)



In [28]:
# Select a single column by its label
df["Column-04"]

Row-01     3
Row-02     8
Row-03    13
Row-04    18
Row-05    23
Name: Column-04, dtype: int32

In [29]:
# Check the type of a single selected column
type(df["Column-04"])

pandas.core.series.Series

In [31]:
# Select several columns at once by passing a list of column labels
# (the inner pair of brackets is the Python list of labels)
df[["Column-02","Column-05"]]

Unnamed: 0,Column-02,Column-05
Row-01,1,4
Row-02,6,9
Row-03,11,14
Row-04,16,19
Row-05,21,24


In [4]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,Column-01,Column-02,Column-03,Column-04,Column-05
Row-01,0,1,2,3,4
Row-02,5,6,7,8,9
Row-03,10,11,12,13,14
Row-04,15,16,17,18,19
Row-05,20,21,22,23,24


In [5]:
# Create a CSV (Comma-Separated Values) file from the existing DataFrame
df.to_csv("TestFile.csv")

In [6]:
# Accessing the elements. There are two main indexers:
# (i) .loc selects by label (row / column names) and (ii) .iloc selects by integer position.
# Here .loc is given a single row label and returns that row.
df.loc['Row-01']


Column-01    0
Column-02    1
Column-03    2
Column-04    3
Column-05    4
Name: Row-01, dtype: int32

In [7]:
# Check the type of a single row selected with .loc
type(df.loc['Row-01'])

pandas.core.series.Series

#### Dataframe & Data Series
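<p>A DataFrame is a two-dimensional labeled table made up of rows and columns, whereas a Series is a one-dimensional labeled array &mdash; for example, a single row or a single column taken from a DataFrame.</p>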


In [8]:
df.iloc[:,:]  # Full slices on both axes: shows all the rows and all the columns

Unnamed: 0,Column-01,Column-02,Column-03,Column-04,Column-05
Row-01,0,1,2,3,4
Row-02,5,6,7,8,9
Row-03,10,11,12,13,14
Row-04,15,16,17,18,19
Row-05,20,21,22,23,24


In [12]:
df.iloc[0:2, 0:3]   # First two rows and first three columns (the slice end is exclusive)

Unnamed: 0,Column-01,Column-02,Column-03
Row-01,0,1,2
Row-02,5,6,7


In [16]:
# Check the type of a selection that slices both rows and columns with .iloc
type(df.iloc[0:2,0:3])

pandas.core.frame.DataFrame

In [13]:
df.iloc[:, 0:2]    # All rows, first two columns only

Unnamed: 0,Column-01,Column-02
Row-01,0,1
Row-02,5,6
Row-03,10,11
Row-04,15,16
Row-05,20,21


In [15]:
df.iloc[0:3,:]    # First three rows, all columns

Unnamed: 0,Column-01,Column-02,Column-03,Column-04,Column-05
Row-01,0,1,2,3,4
Row-02,5,6,7,8,9
Row-03,10,11,12,13,14


In [17]:
# Take every column from the 3rd column (integer position 2) onward, for all rows
df.iloc[:, 2:]

Unnamed: 0,Column-03,Column-04,Column-05
Row-01,2,3,4
Row-02,7,8,9
Row-03,12,13,14
Row-04,17,18,19
Row-05,22,23,24


In [18]:
# Convert the selected part of the DataFrame into a NumPy array
df.iloc[:, 2:].values

array([[ 2,  3,  4],
       [ 7,  8,  9],
       [12, 13, 14],
       [17, 18, 19],
       [22, 23, 24]])

In [19]:
# Check the shape of the array, given as (rows, columns)
df.iloc[:, 2:].values.shape

(5, 3)

In [20]:
# Count the null (missing) values in each column of the DataFrame
df.isnull().sum()

Column-01    0
Column-02    0
Column-03    0
Column-04    0
Column-05    0
dtype: int64

In [21]:
# Display the whole DataFrame
df

Unnamed: 0,Column-01,Column-02,Column-03,Column-04,Column-05
Row-01,0,1,2,3,4
Row-02,5,6,7,8,9
Row-03,10,11,12,13,14
Row-04,15,16,17,18,19
Row-05,20,21,22,23,24


In [25]:
# Count how many times each distinct value occurs in the column
df['Column-03'].value_counts()

7     1
22    1
12    1
2     1
17    1
Name: Column-03, dtype: int64

In [26]:
# List the distinct values of the column (without their counts)
df['Column-03'].unique()

array([ 2,  7, 12, 17, 22], dtype=int64)

### Reading CSV files with various Parameters

In [37]:
# Read back the CSV file written earlier with df.to_csv("TestFile.csv").
# The file is comma-separated, so the separator must be ',' (sep='.' would
# leave every line unsplit in a single column).
# to_csv() stored the row labels as an unnamed first column, so they come back
# as a regular 'Unnamed: 0' column next to a fresh integer index; pass
# index_col=0 to read_csv() if the row labels should become the index again.
df_test = pd.read_csv('TestFile.csv', sep=',')
df_test.head()

Unnamed: 0.1,Unnamed: 0,Column-01,Column-02,Column-03,Column-04,Column-05
0,Row-01,0,1,2,3,4
1,Row-02,5,6,7,8,9
2,Row-03,10,11,12,13,14
3,Row-04,15,16,17,18,19
4,Row-05,20,21,22,23,24


In [32]:
# Read the CSV file into a DataFrame.
# NOTE: this rebinds `df`, so the 5x5 demo DataFrame built earlier is no longer
# reachable under that name after this cell runs.
df = pd.read_csv('mercedesbenz.csv')   # Data set has been collected from Kaggle

In [33]:
# Show the first rows of the data set
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Print a summary of the data set: index range, number of columns, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [35]:
# describe() computes summary statistics (count, mean, std, min, quartiles, max)
# for the numeric (integer and float) columns only; the categorical (object)
# columns such as X0 are left out of the result.
df.describe()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [38]:
# Count how many rows fall into each category of X0, most frequent first
df['X0'].value_counts()

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
m      34
ai     34
e      32
ba     27
at     25
a      21
ax     19
am     18
aq     18
i      18
u      17
aw     16
l      16
ad     14
k      11
au     11
b      11
r      10
as     10
bc      6
ao      4
c       3
q       2
aa      2
g       1
ac      1
ab      1
Name: X0, dtype: int64