# Understanding Pandas #

### Importing Data from CSV ###

In [None]:
import pandas as pd
import os

# Both CSV files are looked up in the current working directory.
dir_workspace = os.getcwd()
file_emp, file_dept = (
    os.path.join(dir_workspace, csv_name) for csv_name in ('emp.csv', 'dept.csv')
)

# Load each file into its own DataFrame (employees first, then departments)
# and preview the first employee rows.
df_emp, df_dept = pd.read_csv(file_emp), pd.read_csv(file_dept)
df_emp.head()

In [None]:
# Preview the department frame; 5 rows is head()'s default, spelled out here.
df_dept.head(n=5)

In [None]:
# Convert DOJ to datetime64[ns] so it can be compared against datetime values,
# then print the per-column dtype summary to confirm the conversion.
doj_converted = df_emp['DOJ'].astype('datetime64[ns]')
df_emp['DOJ'] = doj_converted
df_emp.info()

### Column Selection ###

In [None]:
# Keep every row but only the four listed columns.
df_out = df_emp.loc[:, ['EMPNO', 'ENAME', 'DESIG', 'SALARY']]
df_out.head()

### Column Expression ###

In [None]:
# Work on a copy: a plain `df_out = df_emp` only binds a second name to the
# same DataFrame, so the derived columns below would also be written into
# df_emp and show up in every later selection made from it.
df_out = df_emp.copy()

# Derived numeric column: 20% of SALARY.
df_out['BONUS'] = df_out['SALARY'] * 0.2

# Derived text column built column-wise. astype(str) converts the whole SALARY
# column at once instead of calling str() once per row via apply(axis=1).
df_out['STMT'] = (
    df_out['ENAME']
    + ' working as '
    + df_out['DESIG']
    + ' earning salary of Rs.'
    + df_out['SALARY'].astype(str)
)
df_out[['ENAME', 'BONUS', 'STMT']].head()


### Selection with iloc (integer position) ###

In [None]:
# Scalar reads by integer position: [row position, column position].
print('Selecting value from dataframe as a variable')
value = df_emp.iat[3, 1]
print('Name : ' + value)
value = df_emp.iat[3, 5]
print('Salary : ' + str(value))
print('===================================================')

print('Selecting value from dataframe as a column')
# Positional slices are half-open (the end position is excluded).
# Other slices worth trying:
#   df_emp.iloc[1:5]     -> row positions 1-4, all columns
#   df_emp.iloc[:, 1:5]  -> all rows, column positions 1-4
df_out = df_emp.iloc[1:7, 1:5]
df_out.head(20)

### Selection with loc (label-based) ###

In [None]:
# Label-based selection with .loc. Unlike .iloc, a label slice includes its
# end label, so 'ENAME':'DESIG' returns both boundary columns.
# The banner previously repeated the scalar-selection text from the iloc cell,
# although this cell selects a slice of columns.
print('Selecting columns from dataframe by label')
# Other label slices worth trying:
#   df_emp.loc[1:5]                   -> index labels 1 through 5, all columns
#   df_emp.loc[1:7, 'ENAME':'DESIG']  -> index labels 1 through 7, ENAME..DESIG
df_out = df_emp.loc[:, 'ENAME':'DESIG']  # every row, columns ENAME through DESIG
df_out.head(20)

### Selection with condition ###

In [None]:
# Boolean mask: True on the rows whose DESIG equals 'Officer'.
is_officer = df_emp['DESIG'].eq('Officer')
df_out = df_emp.loc[is_officer]
df_out.head(20)

#### Selection with condition - AND ####

In [None]:
# A row is kept only when both masks are True (element-wise AND).
is_officer = df_emp['DESIG'].eq('Officer')
earns_15k_plus = df_emp['SALARY'].ge(15000)
df_out = df_emp.loc[is_officer & earns_15k_plus]
df_out.head(20)

#### Selection with condition - OR ####

In [None]:
# A row is kept when at least one of the masks is True (element-wise OR).
is_officer = df_emp['DESIG'].eq('Officer')
earns_15k_plus = df_emp['SALARY'].ge(15000)
df_out = df_emp.loc[is_officer | earns_15k_plus]
df_out.head(20)

#### Selection with condition - IN Function ####

In [None]:
# Membership test: keep rows whose DESIG is any of the listed values.
wanted_desig = ['Manager', 'Officer']
df_out = df_emp.loc[df_emp['DESIG'].isin(wanted_desig)]
df_out.head(20)

In [None]:
#### Selection with condition - Date Value ####

In [None]:
import datetime

# Keep rows whose DOJ is strictly later than 1 Jan 2007 (midnight).
doj_cutoff = datetime.datetime(2007, 1, 1)
df_out = df_emp.loc[df_emp['DOJ'].gt(doj_cutoff)]
df_out.head(20)


### Group By & Having ###

In [None]:
# Alternative worth trying: df_emp.groupby('DESIG').count()[['ENAME']]

# GROUP BY step: total SALARY per DESIG, returned as a flat two-column frame
# (DESIG, Total).
df_out = df_emp.groupby('DESIG')['SALARY'].sum().reset_index(name='Total')
print(df_out)

# HAVING step: keep only the groups whose total exceeds 40000.
df_out = df_out.loc[df_out['Total'].gt(40000)]
df_out.head()

### Group By with Multiple Aggregate Functions ###

In [None]:
# Several aggregates of SALARY per DESIG. Named aggregation sets each output
# column's name directly, so no separate column-renaming step is needed.
salary_by_desig = df_emp.groupby('DESIG')['SALARY']
df_out = salary_by_desig.agg(
    Count='count',
    Average='mean',
    Minimum='min',
    Maximum='max',
    Total='sum',
    Std_Dev='std',
).reset_index()
df_out.head()