# Getting Started with Pandas

In [166]:
import pandas as pd
import numpy as np

<h2> We will stat with basic object creation in Pandas. There are 2-types of Objects</h2>
    <ul>
        <li>Series</li>
        <li>DataFrames</li>
     </ul>
     
<h3> Series <h3>

Series are labelled 1-Dimensional array that can hold any data type ( integer, float, string, bool, objects)


In [115]:
#From numpy array
ds = pd.Series(np.random.randn(5),index=[1,2,3,4,5])
print(ds)

#From dict
ds1 = pd.Series({1:'A',2:'B'})
print(ds1)

#From dict - If index is passed, correspoding values from data is used.
ds2 = pd.Series({1:'A',2:'B'}, index=[2,3,4])
print(ds2)

1    0.350975
2   -1.747198
3    0.735294
4   -0.422302
5    1.637754
dtype: float64
1    A
2    B
dtype: object
2      B
3    NaN
4    NaN
dtype: object


In [124]:
# Series can have name of themselves as well
indexes = pd.date_range('2018-01-01',periods=7)
ds3 = pd.Series(indexes,name="CalendarEntries")
print(ds3)

# Series name can be renamed and a new copy is created
ds4 = ds3.rename('DateWiseEntries')
print(ds4)

# Changing ds4 values doesnt impact ds3, as it is a seperate copy
ds4[2]=np.datetime64('2010-01-01')



0   2018-01-01
1   2018-01-02
2   2018-01-03
3   2018-01-04
4   2018-01-05
5   2018-01-06
6   2018-01-07
Name: CalendarEntries, dtype: datetime64[ns]
0   2018-01-01
1   2018-01-02
2   2018-01-03
3   2018-01-04
4   2018-01-05
5   2018-01-06
6   2018-01-07
Name: DateWiseEntries, dtype: datetime64[ns]


0   2018-01-01
1   2018-01-02
2   2018-01-03
3   2018-01-04
4   2018-01-05
5   2018-01-06
6   2018-01-07
Name: CalendarEntries, dtype: datetime64[ns]

<h2> DataFrames </h2>

DataFrames is a 2-dimensional labeled data structure with columns of potentially different types. A DataFrame can hold any number of columns of varied datatypes

DataFrames can be created
<ul>
    <li>Dictionary of ndarrays,lists,dict,Series</li>
    <li>Series</li>
    <li>2-D numpy ndarrays</li>
</ul>
    

In [10]:
# Create a DataFrame using Dictionary
df = pd.DataFrame({'Country':['INDIA','PAK','AUS'],"Rank":[1,2,3]})
df

Unnamed: 0,Country,Rank
0,INDIA,1
1,PAK,2
2,AUS,3


In [12]:
#Create a DataFrame using List of rows
# If no column is provided, it gets default column indexing starting with 0, just like rows
df = pd.DataFrame([['INDIA',1],['PAK',2],['AUS',3]], columns=['Country','Rank'])
df

Unnamed: 0,Country,Rank
0,INDIA,1
1,PAK,2
2,AUS,3


In [137]:
# dict of Series

d = {'First':pd.Series(np.random.randn(4),index=['A','B','C','D']),
     'two':pd.Series(np.random.randn(4),index=['C','D','E','F'])}

# index of series objects will be merged to form index of dataframe
df = pd.DataFrame(d)
print(df)
#however, if dataframe provides its own index, index labels will be matched and 
#corresponding values from Series will be retained. Remember unlike list/arrays , Series are labelled

df = pd.DataFrame(d,index=['A','B','Y','Z'])
print(df)

      First       two
A -0.278737       NaN
B  0.771435       NaN
C -1.585859  0.232661
D  1.821817  1.121534
E       NaN -0.430927
F       NaN -0.057222
      First  two
A -0.278737  NaN
B  0.771435  NaN
Y       NaN  NaN
Z       NaN  NaN


In [147]:
#From dict of lists
d = {'A':[1,2,3,4],'B':[11,22,33,44]}
df = pd.DataFrame(d)
df

df = pd.DataFrame(d,index=np.random.randn(4), columns=('A','B','C'))
df

Unnamed: 0,A,B,C
0.661168,1,11,
2.377579,2,22,
-1.07889,3,33,
1.224664,4,44,


In [29]:
#If a sample has missing column values, It is assigned None (or NaN if numeric) by default
df = pd.DataFrame([['INDIA',1,True],['PAK',2],['AUS',3]],columns=['Country','Rank','Independent'])
df

Unnamed: 0,Country,Rank,Independent
0,INDIA,1,True
1,PAK,2,
2,AUS,3,


## Sorting a DataFrame

- DataFrames can be sorted by index or values. 

When sorting by index, We need to specify axis. axis = 0 (by row index) and axis = 1 (by column index)

When Sorting by value, We need to specify 'by = column name'

In [165]:
#If a sample has missing column values, It is assigned None (or NaN if numeric) by default
df = pd.DataFrame([['INDIA',1,True],['PAK',2],['AUS',3]],columns=['Country','Rank','Independent'])
df1 = df.sort_index(axis=1, ascending=False)
df2 = df.sort_values(by='Rank', ascending=True)
print(df1,"\n",df2)

   Rank Independent Country
0     1        True   INDIA
1     2        None     PAK
2     3        None     AUS 
   Country  Rank Independent
0   INDIA     1        True
1     PAK     2        None
2     AUS     3        None


## Selecting Values from dataframes
