# A quick overview of `Pandas`: Part $I$

We use public U.S. mortgage data available from [HMDA](https://www.consumerfinance.gov/data-research/hmda/).

(Later we will use the same data for visualization)

In [1]:
import pandas as pd

Reading in the data

In [2]:
# Location of the loan subset, relative to the notebook's working directory.
loan_csv_path = '../data/loan_data_subset.csv'
# Load the whole CSV into a DataFrame used by every later cell.
data = pd.read_csv(loan_csv_path)

Set the maximum number of columns displayed, then view the first few rows of the data.

In [3]:
# Raise the column display limit to 80.
pd.options.display.max_columns = 80
# Preview the first three records.
data.head(n=3)

Unnamed: 0,action_taken,action_taken_name,agency_code,agency_abbr,agency_name,applicant_ethnicity,applicant_ethnicity_name,applicant_income_000s,applicant_race_1,applicant_race_name_1,applicant_sex,applicant_sex_name,census_tract_number,co_applicant_ethnicity,co_applicant_ethnicity_name,co_applicant_race_1,co_applicant_race_name_1,co_applicant_sex,co_applicant_sex_name,county_code,county_name,hoepa_status,hoepa_status_name,lien_status,lien_status_name,loan_purpose,loan_purpose_name,loan_type,loan_type_name,owner_occupancy,owner_occupancy_name,preapproval,preapproval_name,property_type,property_type_name,purchaser_type,purchaser_type_name,hud_median_family_income,loan_amount_000s,number_of_1_to_4_family_units,number_of_owner_occupied_units,minority_population,population,tract_to_msamd_income
0,1,Loan originated,2,FRS,Federal Reserve System,3,"Information not provided by applicant in mail,...",36.0,6,"Information not provided by applicant in mail,...",3,"Information not provided by applicant in mail,...",2555.0,5,No co-applicant,8,No co-applicant,5,No co-applicant,99.0,Macomb County,2,Not a HOEPA loan,1,Secured by a first lien,1,Home purchase,2,FHA-insured,1,Owner-occupied as a principal dwelling,2,Preapproval was not requested,1,One-to-four family dwelling (other than manufa...,2,Ginnie Mae (GNMA),79600.0,85,671.0,541.0,17.190001,1664.0,78.669998
1,1,Loan originated,5,NCUA,National Credit Union Administration,2,Not Hispanic or Latino,10.0,5,White,1,Male,4480.0,1,Hispanic or Latino,5,White,2,Female,161.0,Washtenaw County,2,Not a HOEPA loan,2,Secured by a subordinate lien,2,Home improvement,1,Conventional,1,Owner-occupied as a principal dwelling,3,Not applicable,1,One-to-four family dwelling (other than manufa...,0,Loan was not originated or was not sold in cal...,88300.0,22,1402.0,1305.0,6.63,3726.0,105.410004
2,4,Application withdrawn by applicant,9,CFPB,Consumer Financial Protection Bureau,1,Hispanic or Latino,28.0,5,White,1,Male,139.0,5,No co-applicant,8,No co-applicant,5,No co-applicant,81.0,Kent County,2,Not a HOEPA loan,1,Secured by a first lien,1,Home purchase,1,Conventional,1,Owner-occupied as a principal dwelling,3,Not applicable,1,One-to-four family dwelling (other than manufa...,0,Loan was not originated or was not sold in cal...,67000.0,119,1819.0,1542.0,24.24,6979.0,89.099998


Metadata:

- [Explanation of columns](https://www.dropbox.com/s/2tze8fy1cnep1u0/lar_record_format.pdf?dl=0) 
- [Explanation of codes](https://www.dropbox.com/s/3kxqdgwk6p9jqg5/lar_record_codes.pdf?dl=0)


### Determine the number of rows and columns in the dataset

In [4]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

(5000, 44)

### Find the number of rows in the dataset

In [5]:
# Number of rows, measured as the length of the row index.
len(data.index)

5000

### Get the names of the columns

In [6]:
# Index object holding the column labels, in file order.
data.columns

Index(['action_taken', 'action_taken_name', 'agency_code', 'agency_abbr',
       'agency_name', 'applicant_ethnicity', 'applicant_ethnicity_name',
       'applicant_income_000s', 'applicant_race_1', 'applicant_race_name_1',
       'applicant_sex', 'applicant_sex_name', 'census_tract_number',
       'co_applicant_ethnicity', 'co_applicant_ethnicity_name',
       'co_applicant_race_1', 'co_applicant_race_name_1', 'co_applicant_sex',
       'co_applicant_sex_name', 'county_code', 'county_name', 'hoepa_status',
       'hoepa_status_name', 'lien_status', 'lien_status_name', 'loan_purpose',
       'loan_purpose_name', 'loan_type', 'loan_type_name', 'owner_occupancy',
       'owner_occupancy_name', 'preapproval', 'preapproval_name',
       'property_type', 'property_type_name', 'purchaser_type',
       'purchaser_type_name', 'hud_median_family_income', 'loan_amount_000s',
       'number_of_1_to_4_family_units', 'number_of_owner_occupied_units',
       'minority_population', 'population', 'tra

### Get the first five rows of a column by name

In [7]:
# Select one column by label and show its first five entries.
data['action_taken'].head(5)

0    1
1    1
2    4
3    3
4    1
Name: action_taken, dtype: int64

### Create categorical ranges for numerical data

Note that you can specify the number of ranges (bins) you want.

In [8]:
# Bucket applicant income into 14 intervals; each row is labelled with the
# interval its income falls in.
incomeranges = pd.cut(data['applicant_income_000s'], bins=14)
# Show the interval assigned to the first five applicants.
incomeranges.head(5)

0    (-5.945, 497.071]
1    (-5.945, 497.071]
2    (-5.945, 497.071]
3    (-5.945, 497.071]
4    (-5.945, 497.071]
Name: applicant_income_000s, dtype: category
Categories (14, interval[float64, right]): [(-5.945, 497.071] < (497.071, 993.143] < (993.143, 1489.214] < (1489.214, 1985.286] ... (4961.714, 5457.786] < (5457.786, 5953.857] < (5953.857, 6449.929] < (6449.929, 6946.0]]

### Look at the value counts in the ranges created above

In [9]:
# Count how many applicants fall into each income interval, most frequent first.
# Uses the Series method: the top-level pd.value_counts helper is deprecated
# in recent pandas releases and produces the same result.
incomeranges.value_counts()

(-5.945, 497.071]       4446
(497.071, 993.143]        19
(993.143, 1489.214]        4
(2481.357, 2977.429]       2
(1985.286, 2481.357]       1
(6449.929, 6946.0]         1
(1489.214, 1985.286]       0
(2977.429, 3473.5]         0
(3473.5, 3969.571]         0
(3969.571, 4465.643]       0
(4465.643, 4961.714]       0
(4961.714, 5457.786]       0
(5457.786, 5953.857]       0
(5953.857, 6449.929]       0
Name: applicant_income_000s, dtype: int64

### Index into the first six columns of the first row

In [10]:
# Positional indexing: row 0, columns 0 through 5 (the end of the slice is exclusive).
data.iloc[0, :6]

action_taken                                1
action_taken_name             Loan originated
agency_code                                 2
agency_abbr                               FRS
agency_name            Federal Reserve System
applicant_ethnicity                         3
Name: 0, dtype: object

### Order the data by specified column

In [11]:
# Sort one column in ascending order and keep the five smallest entries;
# the original row labels are preserved in the result's index.
data['loan_amount_000s'].sort_values(ascending=True).iloc[:5]

2103    1
482     1
1077    1
4227    1
3858    1
Name: loan_amount_000s, dtype: int64

### Sort by a column and then obtain a cross-section of that data

In [12]:
# Order every row by loan amount (ascending) and keep the sorted frame.
sorteddata = data.sort_values(by='loan_amount_000s')
# Cross-section: first three rows, first six columns.
sorteddata.iloc[:3, :6]

Unnamed: 0,action_taken,action_taken_name,agency_code,agency_abbr,agency_name,applicant_ethnicity
2103,6,Loan purchased by the institution,9,CFPB,Consumer Financial Protection Bureau,4
482,3,Application denied by financial institution,7,HUD,Department of Housing and Urban Development,2
1077,6,Loan purchased by the institution,9,CFPB,Consumer Financial Protection Bureau,4


### Obtain value counts of a specified column

In [13]:
# Frequency of each distinct action description, most common first.
action_names = data['action_taken_name']
action_names.value_counts()

Loan originated                                  2788
Application denied by financial institution       716
Application withdrawn by applicant                585
Loan purchased by the institution                 564
File closed for incompleteness                    223
Application approved but not accepted             123
Preapproval request approved but not accepted       1
Name: action_taken_name, dtype: int64

### A way to obtain the datatype for every column

In [14]:
# Pair each column name with the Python type of its value in the first row.
first_row = data.iloc[0]
[(column, type(value)) for column, value in first_row.items()]

[('action_taken', numpy.int64),
 ('action_taken_name', str),
 ('agency_code', numpy.int64),
 ('agency_abbr', str),
 ('agency_name', str),
 ('applicant_ethnicity', numpy.int64),
 ('applicant_ethnicity_name', str),
 ('applicant_income_000s', numpy.float64),
 ('applicant_race_1', numpy.int64),
 ('applicant_race_name_1', str),
 ('applicant_sex', numpy.int64),
 ('applicant_sex_name', str),
 ('census_tract_number', numpy.float64),
 ('co_applicant_ethnicity', numpy.int64),
 ('co_applicant_ethnicity_name', str),
 ('co_applicant_race_1', numpy.int64),
 ('co_applicant_race_name_1', str),
 ('co_applicant_sex', numpy.int64),
 ('co_applicant_sex_name', str),
 ('county_code', numpy.float64),
 ('county_name', str),
 ('hoepa_status', numpy.int64),
 ('hoepa_status_name', str),
 ('lien_status', numpy.int64),
 ('lien_status_name', str),
 ('loan_purpose', numpy.int64),
 ('loan_purpose_name', str),
 ('loan_type', numpy.int64),
 ('loan_type_name', str),
 ('owner_occupancy', numpy.int64),
 ('owner_occupancy_

### The Pandas way to obtain datatypes for every column

In [15]:
# Series mapping each column name to its pandas dtype.
data.dtypes

action_taken                        int64
action_taken_name                  object
agency_code                         int64
agency_abbr                        object
agency_name                        object
applicant_ethnicity                 int64
applicant_ethnicity_name           object
applicant_income_000s             float64
applicant_race_1                    int64
applicant_race_name_1              object
applicant_sex                       int64
applicant_sex_name                 object
census_tract_number               float64
co_applicant_ethnicity              int64
co_applicant_ethnicity_name        object
co_applicant_race_1                 int64
co_applicant_race_name_1           object
co_applicant_sex                    int64
co_applicant_sex_name              object
county_code                       float64
county_name                        object
hoepa_status                        int64
hoepa_status_name                  object
lien_status                       

### Get the unique values for a column by name.

In [16]:
# Distinct county names in order of first appearance (missing values show up as nan).
county_names = data['county_name']
county_names.unique()

array(['Macomb County', 'Washtenaw County', 'Kent County',
       'Genesee County', 'Mecosta County', 'Oakland County',
       'Ottawa County', 'Wayne County', 'Shiawassee County',
       'Muskegon County', 'Midland County', 'Cass County',
       'Ingham County', 'Grand Traverse County', 'Eaton County',
       'Otsego County', 'Kalamazoo County', 'Van Buren County',
       'Oceana County', 'Leelanau County', 'Benzie County', 'Bay County',
       'Monroe County', 'Saginaw County', nan, 'Hillsdale County',
       'Lapeer County', 'Isabella County', 'Calhoun County',
       'St. Clair County', 'Missaukee County', 'Iosco County',
       'Berrien County', 'Newaygo County', 'Gladwin County',
       'Clinton County', 'Livingston County', 'Barry County',
       'Emmet County', 'Alger County', 'Iron County', 'Antrim County',
       'Wexford County', 'St. Joseph County', 'Lenawee County',
       'Gratiot County', 'Clare County', 'Tuscola County',
       'Allegan County', 'Jackson County', 'Arena

### Get a count of the unique values of a column

In [17]:
# Number of distinct county names; dropna=False keeps NaN counted as its own
# value, matching the length of the array returned by .unique().
data['county_name'].nunique(dropna=False)

81

### Index into a column and get the first four rows

In [18]:
# First four entries of the preapproval description column.
data['preapproval_name'].head(4)

0    Preapproval was not requested
1                   Not applicable
2                   Not applicable
3    Preapproval was not requested
Name: preapproval_name, dtype: object

### Obtain boolean values by comparing a column against a value

In [19]:
# Element-wise comparison: True where the first four entries equal the target label.
not_requested = "Preapproval was not requested"
data['preapproval_name'].iloc[:4] == not_requested

0     True
1    False
2    False
3     True
Name: preapproval_name, dtype: bool

---
All done. We will continue with more tricks in part II.