## Import Dependencies

In [18]:
import csv
from pathlib import Path
import pandas as pd
import glob
from datetime import datetime, timedelta
import numpy as np
# from sklearn import preprocessing
import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, RidgeCV
# from sklearn.metrics import mean_squared_error
import seaborn as sns
import pandas_profiling as pp
from settings import DOWNLOAD_DIR
# from settings import DATA_DIR

In [19]:
DOWNLOAD_DIR

'download'

# Get the Single-Family Loan Performance Data Files from Fannie Mae

From the Fannie Mae [website:](https://loanperformancedata.fanniemae.com/lppub/index.html#Single-Family_Loan_Performance_Data_Files) we downloaded the Acquisition dataset (998MB zip file)

The [Acquisition file layout](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_File_layout.pdf) was as follows:
![Acquisition File Layout](images/AcquisitionFileLayoutMedium.jpg)

In [2]:
# Create mapping for the header of the files based on above file format
headerline = ['loanIdentifier','origChannel','sellerName','origIntRate','origUPB','origLoanTerm',\
         'origDate','firstPmtDate','origLTV','origCLTV','numBorrowers','origDebtIncRatio',\
         'borrCreditScore','firstTHBI','loanPurp','propType','numUnits','occType','propState',\
         'zipCode','pMIperct','prodType','coborrCreditScore','mortInsType','relocMortInd']

In [3]:
# Loop through each file from the unzipped download file - which gave a set of 76 individual files

#Create an empty array to hold dataframes to later concatonate
li = []

#Loop through every file in the directory and append into array
for in_path in Path('data/Acquisition').glob('*.txt'):
    df = pd.read_csv(in_path, sep="|", index_col=None, header=None)
    li.append(df)

#Concatonate all the array elements into one giant dataframe
df = pd.concat(li, axis=0, ignore_index=True)
df.columns = headerline

ValueError: No objects to concatenate

In [None]:
df.shape

In [None]:
#Review the columns for datatypes and how many non-null values 
df.info()

In [None]:
df.head()

### Convert the origDate,firstPmtDate columns from strings to datetime datatypes
Since this will be such a large dataset I tested a number of methods to change datatypes

`%timeit df['origDate'] = pd.to_datetime(df['origDate'], format='%m/%Y')`  
5.43 ms ± 457 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

`%timeit df['origDate'].apply(lambda _: datetime.strptime(_,"%m/%Y"))')`  
5.48 ms ± 91.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Not much difference in speeds so I'll choose the `to_datetime` method

In [None]:
df['origDate'] = pd.to_datetime(df['origDate'], format='%m/%Y')
df['firstPmtDate'] = pd.to_datetime(df['firstPmtDate'], format='%m/%Y')

### Filter out records with values we aren't interested in examining:

Using the [Glossary](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Glossary.pdf) of column names we made the following selections:
![Property Type](images/propertyType.jpg)
* Property type: only want 'SF' - 'Single Family'

In [None]:
propTypeFilter = ['SF']
df = df[df.propType.isin(propTypeFilter)]
df.shape

![Number of Units](images/numUnits.jpg)
* Number of Units: only want '1'

In [None]:
numUnitsFilter = [1]
df = df[df.numUnits.isin(numUnitsFilter)]
df.shape

![Number of Units](images/occType.jpg)
* Number of Units: only want 'P'

In [None]:
occTypeFilter = ['P']
df = df[df.occType.isin(occTypeFilter)]
df.shape

![Number of Units](images/loanPurp.jpg)
* Number of Units: only want 'C,R,U'

In [None]:
loanPurpFilter = ['C','R','U']
df = df[df.loanPurp.isin(loanPurpFilter)]
df.shape

![Number of Units](images/origLoanTerm.jpg)
* Number of Units: only want '360'

In [None]:
origLoanTermFilter = [360]
df = df[df.origLoanTerm.isin(origLoanTermFilter)]
df.shape

### Fill null values with 0 for the following columns
We can't have NaN or Null values for the machine learning part to come

In [None]:
df['mortInsType'] = df['mortInsType'].fillna(0)
df['pMIperct'] = df['pMIperct'].fillna(0)
df['origCLTV'] = df['origCLTV'].fillna(0)

## Calculate the 'Best', 'Worst' and 'Avg' Credit Score
Given that there is a Borrower Credit Score and a Co-Borrower Credit Score and that there are Null values for each we want to create calculated columns where no Null values exist and then drop the original columns

In [None]:
df['bestCreditScore'] = df[['borrCreditScore','coborrCreditScore']].max(axis=1)
df['worstCreditScore'] = df[['borrCreditScore','coborrCreditScore']].min(axis=1)
df['avgCreditScore'] = df[['borrCreditScore','coborrCreditScore']].mean(axis=1)

### Get rid of records where there are NaN or NULL values for:
* 'Borrower Credit Score'
* 'Debt to Income Ratio'
* 'Number of Borrowers'

In [None]:
df = df[df['borrCreditScore'].notna() &\
        df['origDebtIncRatio'].notna() &\
        df['numBorrowers'].notna()]

### Drop all columns where there is only a single value (and coborrower Credit Score)

In [None]:
df = df.drop(['prodType','firstTHBI','relocMortInd','occType','numUnits','origLoanTerm','coborrCreditScore','propType'], axis=1)

### Map text values to integers (Machine Learning will require this later)

![OrigChannel](images/origChannel.jpg)

In [None]:
df['origChannel'] = df['origChannel'].replace(['R','B','C'],[1,2,3])
df['origChannel'] = pd.to_numeric(df['origChannel'])

![Loan Purpose](images/loanPurp.jpg)

In [None]:
df['loanPurp'] = df['loanPurp'].replace(['C','R','U'],[1,2,3])
df['loanPurp'] = pd.to_numeric(df['loanPurp'])

In [None]:
df['loanPurp'].unique()

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Check if updated files from Fannie Mae have created duplicate entries for a given loan Identifier 
#(number should equal the number of records if there are no duplicates)
df.groupby('loanIdentifier').loanIdentifier.nunique().sum()

In [None]:
sellerNames = df['sellerName'].unique()
statesList = df['propState'].unique()
#sellerNames

In [None]:
#sellerNames

In [None]:
df.to_csv('data/FMAcqProcessed.csv')