# **Analysis for Team Assignment 2**

In [None]:
import sys
from pathlib import Path

# Make the project's packages importable from this notebook.
# 'project_root' must be the directory that contains both 'notebooks' and 'utils';
# the default assumes the notebook sits directly inside 'notebooks', so adjust it
# if your layout differs.
project_root = Path.cwd().parent
if sys.path.count(str(project_root)) == 0:
    sys.path.append(str(project_root))

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import utils.utility as utility
import imputers.latest_credit_pull_d as lcpd
import imputers.earliest_cr_line_d as ecld
import imputers.last_pymnt_d as lpd
import imputers.numeric_data as nd

# We first import a number of libraries that we will be using in today's class
import pandas as pd
import numpy as np

# Plotting packages we'll use
import matplotlib.pyplot as plt
import seaborn as sns

# Rather than importing the whole sklearn library, we will import only certain modules
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics, model_selection
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV

from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## *Importing Data*

In [None]:
# Locations of the raw LendingClub CSV exports (relative paths)
path_to_rejected, path_to_accepted = (
    '../data/LendingClub/rejected_2007_to_2018Q4.csv',
    '../data/LendingClub/accepted_2007_to_2018Q4.csv',
)

## *Read rejected and accepted data into two separate dataframes*

In [None]:
# Load each raw CSV into its own DataFrame: rejected applications first, then accepted loans
df_rejected = pd.read_csv(filepath_or_buffer=path_to_rejected)
df_accepted = pd.read_csv(filepath_or_buffer=path_to_accepted)


In [None]:
# Report the (rows, columns) shape of the accepted frame, then of the rejected frame
print(df_accepted.shape, df_rejected.shape, sep='\n')
# Preview the first five accepted loans. Only the last expression of a cell is
# rendered, so swap in df_rejected.head() here to preview the rejected data instead.
df_accepted.head(5)

## *Examine features* 

In [None]:
# Print the full per-column summary (verbose listing with non-null counts),
# first for the accepted loans and then for the rejected applications.
df_accepted.info(verbose=True, show_counts=True)
df_rejected.info(verbose=True, show_counts=True)


## *Dropping columns*

### *Dropping irrelevant columns*

In [None]:
# Integer positions of the columns judged irrelevant in each frame.
# NOTE(review): positions depend on the column order of the loaded CSVs — confirm
# they still point at the intended columns if the source files change.
rejected_positions_to_drop = [2, 8]
accepted_positions_to_drop = [0, 1, 18, 19, 55, 21, 10, 22]

# Translate the positions into column labels
rejected_columns_to_drop = df_rejected.columns[rejected_positions_to_drop]
accepted_columns_to_drop = df_accepted.columns[accepted_positions_to_drop]

# Remove those labels from each frame (column-wise drop)
df_rejected = df_rejected.drop(columns=rejected_columns_to_drop)
df_accepted = df_accepted.drop(columns=accepted_columns_to_drop)


### *Dropping columns with a high null count*

In [None]:
# Drop sparsely populated columns from both frames via the project helper.
# Per the original author's note, columns whose null count is greater than or equal
# to 50% of samples are dropped; the threshold lives inside
# utility.drop_null_columns and is not visible here — verify against the helper.
df_accepted = utility.drop_null_columns(df_accepted)
df_rejected = utility.drop_null_columns(df_rejected)

In [None]:
# Re-check the (rows, columns) shape of each frame after the column drops:
# accepted first, then rejected
print(df_accepted.shape, df_rejected.shape, sep='\n')

In [None]:
# List the column names that remain in the accepted frame
print(list(df_accepted.columns))

## Drop outliers

In [None]:
# Replace the accepted frame with the result of the project's outlier-removal helper.
# NOTE(review): the outlier criterion is defined inside utility.remove_outliers and
# is not visible here; only the accepted data is processed.
df_accepted = utility.remove_outliers(df=df_accepted)

### Examine data's distribution

In [None]:
# Get some summary statistics for both frames.
# A notebook cell only renders the value of its LAST expression, so the rejected
# summary is printed explicitly; as a bare expression it would be computed and
# then silently discarded.
print(df_rejected.describe())
df_accepted.describe()

In [None]:
# Get the columns with high skew values as these need to be fixed later.
# NOTE(review): what counts as "high" skew is decided inside
# utility.get_high_skewed_columns — confirm the threshold there.
skewed_columns = utility.get_high_skewed_columns(df=df_accepted)
print(skewed_columns)

In [None]:
# Visualize the numerical variables of the accepted data using the project's plotting helper
utility.visualize_numerical_variables(df_accepted)

In [None]:
# Visualize the numerical variables of the rejected data using the project's plotting helper
utility.visualize_numerical_variables(df_rejected)

##  Imputing missing values 

### Imputing dates

In [None]:
# Collect the candidate date columns.
# NOTE(review): this calls utility.get_category_columns, so presumably the dates are
# still stored as strings at this point and show up among the categorical columns;
# the list may therefore contain non-date columns too — confirm against the helper.
potential_date_columns = utility.get_category_columns(df_accepted)

In [None]:
# Compute the value counts for each candidate column and render the result
# (the trailing bare expression is what the notebook displays)
value_counts = utility.display_value_counts(df_accepted, potential_date_columns)
value_counts

#### Observations from Value Counts
*Upon looking at the value counts, the following observations were made:*

- There are four dates: 'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'
- These values must be properly dealt with to be used in our analysis
- issue_d: 
    - Cannot be imputed. Drop all rows where issue_d is null
- earliest_cr_line:
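    - Should be imputed; this notebook does so with `EarliestCRLineDateImputer` (the imputation criterion still needs to be documented here)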


In [None]:
# Drop all rows where issue_d is null: the issue date cannot be imputed,
# so such rows are discarded rather than filled
df_accepted = df_accepted.dropna(subset=['issue_d'])

In [None]:
# Names of the four date columns in the accepted data; this list is reused by
# later cells, so keep it in sync with any schema change
date_columns = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d']
# Convert those columns to datetime values via the project helper
# NOTE(review): the parsing format is handled inside utility.to_datetime — confirm there
df_accepted = utility.to_datetime(df=df_accepted, columns=date_columns)

In [None]:
# Impute missing dates with the project's custom imputers.
# Each imputer is fit on the current frame and then applied to it, and the output
# of one step is the input of the next, so the order below matters.
# NOTE(review): the target columns are inferred from the class names (presumably
# earliest_cr_line, last_credit_pull_d and last_pymnt_d) — confirm in the
# imputer modules.

# Step 1: earliest credit line date
ecld_imputer = ecld.EarliestCRLineDateImputer()
ecld_imputer.fit(X=df_accepted)
df_accepted = ecld_imputer.transform(X=df_accepted)

# Step 2: latest credit pull date
lcpd_imputer = lcpd.LatestCreditPullDateImputer()
lcpd_imputer.fit(X=df_accepted)
df_accepted = lcpd_imputer.transform(X=df_accepted)

# Step 3: last payment date
lpd_imputer = lpd.LastPaymentDateImputer()
lpd_imputer.fit(X=df_accepted)
df_accepted = lpd_imputer.transform(X=df_accepted)

### Categorical Data

In [None]:
# Keep only the rows whose emp_length is neither '10+ years' nor '< 1 year'
# (rows with a missing emp_length are kept as well).
# NOTE(review): the original plan also listed the loan_status values
# 'Does not meet the credit policy. Status:Fully Paid' and
# 'Does not meet the credit policy. Status:Charged Off' for removal, but no
# loan_status filter is applied here — confirm whether one is still needed.
df_accepted = df_accepted[~df_accepted['emp_length'].isin(['10+ years', '< 1 year'])]

In [None]:
# Convert every remaining categorical candidate, except the date columns,
# through the project's to_categorical helper
category_columns = utility.get_category_columns(df=df_accepted)
result_list = [
    column_name
    for column_name in category_columns
    if column_name not in date_columns
]
df_accepted = utility.to_categorical(df=df_accepted, columns=result_list)

In [None]:
# Preview the accepted frame after the categorical conversion
df_accepted.head()

### Imputing categorical data
- Going to start off with SimpleImputer.
- Change later to more complex and specific imputers.

In [None]:
# Show which columns are about to be treated as categorical during imputation
print(category_columns)

In [None]:
# Inspect dtypes and non-null counts before imputing, to see which columns have gaps
df_accepted.info(verbose=True, show_counts=True)

In [None]:
# Impute categorical data using the mode (most frequent value of each column).
# NOTE(review): this uses category_columns rather than the date-free result_list
# built earlier — confirm that no date columns end up mode-imputed here.
# NOTE(review): fit_transform presumably returns a plain array, so any categorical
# dtype set earlier may be lost on assignment — verify the dtypes afterwards.
imputer = SimpleImputer(strategy='most_frequent')
df_accepted[category_columns] = imputer.fit_transform(df_accepted[category_columns])

### Imputing numerical data
*Let's break down each numerical column and decide which imputation strategy would be best:*

- `loan_amnt`: drop. Too important not to have and the number of rows without this amount is small.
- `funded_amnt`: 

In [None]:
# Impute numerical data with the project's NumericDataImputer, configured with
# 'loan_status' as its grouping column. Note that this rebinds the name 'imputer',
# which previously held the categorical SimpleImputer.
# NOTE(review): presumably missing values are filled per loan_status group —
# confirm the per-column strategy inside imputers.numeric_data.
imputer = nd.NumericDataImputer(group_column='loan_status')
imputer.fit(df_accepted)
df_accepted = imputer.transform(df_accepted)

In [None]:
# Snapshot the cleaned, imputed data before any engineered features are added.
# DataFrame.copy() is deep by default, so later edits to df_accepted do not
# touch this snapshot.
pre_fe_df_accepted = df_accepted.copy()

## Feature Engineering

### Feature Engineering Dates Data

In [None]:
# Derive duration features from the date columns.
# Reference date used as "now" when measuring time since the last credit pull
collection_date = pd.to_datetime('2018-12-31')

# Years between the earliest credit line and the loan issue date (365.25-day years)
df_accepted['credit_history_length'] = (
    df_accepted['issue_d'].sub(df_accepted['earliest_cr_line']).dt.days.div(365.25)
)

# Approximate months (30-day months) between loan issue and the last payment
df_accepted['loan_age'] = (
    df_accepted['last_pymnt_d'].sub(df_accepted['issue_d']).dt.days.div(30)
)

# Approximate months (30-day months) between the last credit pull and the reference date
df_accepted['months_since_last_credit_pull'] = (
    (collection_date - df_accepted['last_credit_pull_d']).dt.days.div(30)
)

In [None]:
# Intended step (per the original comment): drop dates to reduce complexity.
# NOTE(review): the call is utility.drop_nan(...) with the date column names.
# Whether it removes the date columns themselves or only the rows with missing
# dates depends on the helper — verify that it matches the stated intent.
df_accepted = utility.drop_nan(df_accepted, date_columns)

In [None]:
# Inspect the resulting columns, dtypes and non-null counts after feature engineering
df_accepted.info(verbose=True, show_counts=True)

## *Examine target variable*

In [None]:
# Replace the accepted frame with the output of the project's ROI helper.
# NOTE(review): presumably this adds the return-on-investment target column —
# confirm the column name and formula in utility.calculate_roi.
df_accepted = utility.calculate_roi(df=df_accepted)