In [None]:
import pandas as pd

#Convert accepted_loans data to Panda DataFrame
accepted_df = pd.read_csv("/Users/abubakaral-faki/Documents/Data Project/MPV1/data/raw/accepted_2007_to_2018Q4.csv")

In [None]:
#create a copy of data frame to work with temporarily before applying all

accepted_df_copy = pd.DataFrame(accepted_df)
print(accepted_df_copy.head())

In [None]:
# Get data types of each variable
variable_data_types_df = accepted_df.dtypes.reset_index()

# Rename columns
variable_data_types_df.columns = ['Variables', 'Data Type']
print(variable_data_types_df)

#Save as csv
try:
    file_path = '/Users/abubakaral-faki/Documents/Data Project/MPV1/temp_files/variables_data_type.csv'
    
    variable_data_types_df.to_csv(file_path, index = False)
    print("Successfully saved variable_data_types_df as a csv.")

except Exception as e:
    print(e)

In [None]:
# Convert Date_columns to Date Data types
date_columns = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d',
                'last_credit_pull_d', 'sec_app_earliest_cr_line', 'hardship_start_date',
                'hardship_end_date', 'payment_plan_start_date', 'debt_settlement_flag_date', 'settlement_date']

print(len(date_columns))

In [None]:
# Check format of date before converting to date objects

# We use dropna to drop all missing values in a columns because sometimes the first few rows have missing values

accepted_df.settlement_date.dropna().head() # Date format "%b-$Y"

In [None]:
# Specify the correct format: %b for abbreviated month name, %Y for 4-digit year
date_format = "%b-%Y"

# Convert date columns efficiently using the specified format
accepted_df_copy[date_columns] = accepted_df_copy[date_columns].apply(lambda col: pd.to_datetime(col,format =  date_format, errors = 'coerce'))

print(accepted_df_copy.issue_d.head())

In [None]:
# Check if all date columns are in date format
accepted_df_copy[date_columns].dtypes

In [None]:
# Check format of date columns
print(accepted_df_copy.settlement_date.dropna().head())

In [None]:
# Convert Categorical columns to category

categorical_columns = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
                       'verification_status', 'loan_status', 'pymnt_plan', 'purpose',
                       'title', 'addr_state', 'initial_list_status', 'application_type',
                       'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status',
                       'hardship_loan_status', 'disbursement_method', 'debt_settlement_flag',
                       'settlement_status']

accepted_df_copy[categorical_columns] = accepted_df_copy[categorical_columns].apply(lambda col: col.astype('category'))

In [None]:
# Check if categorical columns are of categorical data type

accepted_df_copy[categorical_columns].dtypes

In [None]:
# the title column will require some cleaning because the categories appear in a messiy format so I am converting 
# back to a string

'''
['\tcredit_card', '\tdebt_consolidation', '\thouse', '\tother', ..., 'zxcvb', '~Life Reorganization~', 
'~Summer Fun~', 'îîMY FIRST CAR îî']

'''

accepted_df_copy['title']

In [None]:
# Convert title column to string
accepted_df_copy['title'].astype(str)

In [None]:
# Replace 'nan' strings with 'unknown'
accepted_df_copy['title'] = accepted_df_copy['title'].replace('nan', 'unknown')

# fill NaN values (missing values) with 'unknown'
accepted_df_copy['title'] = accepted_df_copy['title'].fillna('unknown')

In [None]:
accepted_df_copy['title']

In [None]:
# Check if 'title' column is of type 'object' type because pands represents string columns as object
print(accepted_df_copy['title'].dtype)

In [None]:
#Convert the id column to int

#check if 'id' column has any missing values
print(accepted_df_copy['id'].describe())

#convert id column from objext(text) to int

'''This code below didn't work because there is a row that is a string representing this value;
 'Total amount funded in policy code 1: 6417608175'
'''

accepted_df_copy['id'] = accepted_df_copy['id'].astype(int)

In [None]:
# find problematic row in 'id' column stopping it from being an int column
problematic_row = accepted_df_copy[accepted_df_copy['id'] == 'Total amount funded in policy code 1: 6417608175'].index

# Remove problematics row
accepted_df_copy = accepted_df_copy.drop(index = problematic_row[0])

In [None]:
# identify all rows in 'id' column that are not of type int

non_int_rows = accepted_df_copy[~accepted_df_copy['id'].apply(lambda x: str(x).isdigit())] #32 rows

print(non_int_rows)

In [None]:
# check if all columns associated with the non_int_rows in 'id' column are empty

col_not_empty = non_int_rows.drop(columns=['id']).isna().all(axis=1)

print(col_not_empty)

#print(non_int_empty)

''' 
The reults shows that the are columns that are not empty for rows where the 'id' is not an integer
so we have to identify which columns are not empty for those rows
'''

In [233]:
# Identify non-empty rows in other columns associated with non-int rows in 'id'

# Step 1 - Identify Non-int rows
non_int_rows = accepted_df_copy[~accepted_df_copy['id'].apply(lambda row: str(row).isdigit())] #32 rows

print(non_int_rows)

                                                       id  member_id  \
421096   Total amount funded in policy code 2: 1944088810        NaN   
528961   Total amount funded in policy code 1: 1741781700        NaN   
528962    Total amount funded in policy code 2: 564202131        NaN   
651664   Total amount funded in policy code 1: 1791201400        NaN   
651665    Total amount funded in policy code 2: 651669342        NaN   
749520   Total amount funded in policy code 1: 1443412975        NaN   
749521    Total amount funded in policy code 2: 511988838        NaN   
877716   Total amount funded in policy code 1: 2063142975        NaN   
877717    Total amount funded in policy code 2: 823319310        NaN   
983169   Total amount funded in policy code 1: 1538432075        NaN   
983170    Total amount funded in policy code 2: 608903141        NaN   
1117058  Total amount funded in policy code 1: 2087217200        NaN   
1117059   Total amount funded in policy code 2: 662815446       

In [None]:
# Step 2 - Check if rows in other columns associated with non-int rows are also empty so we don't drop
# important information

empty_columns_check = non_int_rows.drop(columns = 'id').isna().all(axis = 1)

print(empty_columns_check)

In [242]:
# Step 3 - Idetify which columns associated with the non-int rows in 'id' column are not empty

columns_not_empty = non_int_rows.drop(columns = 'id').columns[non_int_rows.drop(columns = 'id').notna().any()]

print(columns_not_empty)

Index(['title'], dtype='object')


In [244]:
non_int_rows[['id', columns_not_empty[0]]]

Unnamed: 0,id,title
421096,Total amount funded in policy code 2: 1944088810,unknown
528961,Total amount funded in policy code 1: 1741781700,unknown
528962,Total amount funded in policy code 2: 564202131,unknown
651664,Total amount funded in policy code 1: 1791201400,unknown
651665,Total amount funded in policy code 2: 651669342,unknown
749520,Total amount funded in policy code 1: 1443412975,unknown
749521,Total amount funded in policy code 2: 511988838,unknown
877716,Total amount funded in policy code 1: 2063142975,unknown
877717,Total amount funded in policy code 2: 823319310,unknown
983169,Total amount funded in policy code 1: 1538432075,unknown


In [259]:
# Drop all non-int rows in id colums

#Get shape of accepted_df Dataframe before dropping rows
print('accepted_df shape before removing rows:', accepted_df_copy.shape)


# Step 1 - Get index of non-int rows in 'id' column to drop
nonint_rows_todrop = non_int_rows.index

print()
print(nonint_rows_todrop)
print(len(nonint_rows_todrop), 'rows to remove\n\n')

# Step 2 - remove non-int rows in 'id' column from accepted_df_copy

accepted_df_copy = accepted_df_copy.drop(index = nonint_rows_todrop)
print(len(nonint_rows_todrop), 'successfully dropped\n\n')

# Step 3 - Check accepted_df shape to confirm if 32 rows were dropped

print('accepted_df shape after removing rows:', accepted_df_copy.shape)


accepted_df shape before removing rows: (2260700, 151)

Index([ 421096,  528961,  528962,  651664,  651665,  749520,  749521,  877716,
        877717,  983169,  983170, 1117058, 1117059, 1352689, 1352690, 1481103,
       1481104, 1611877, 1611878, 1651665, 1654415, 1654416, 1751196, 1751197,
       1939379, 1939380, 2038501, 2038502, 2157151, 2157152, 2260699, 2260700],
      dtype='int64')
32 rows to remove


32 successfully dropped


accepted_df shape after removing rows: (2260668, 151)


In [261]:
# Check if we still have non-int rows in 'id' column

accepted_df_copy[~accepted_df_copy['id'].apply(lambda row: str(row).isdigit())] - #Should return an empty DF

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term


In [262]:
# Soooo now we can finally Convert 'id' column to integer and it should work fingers crossed
accepted_df_copy['id'] = accepted_df_copy['id'].astype(int)

In [264]:
accepted_df_copy['id'].info()

<class 'pandas.core.series.Series'>
Index: 2260668 entries, 0 to 2260698
Series name: id
Non-Null Count    Dtype
--------------    -----
2260668 non-null  int64
dtypes: int64(1)
memory usage: 34.5 MB


In [289]:
#Get data types of all variables and convert it to a DataFrame
new_data_types = pd.DataFrame(accepted_df_copy.dtypes)

# Reset index of Dataframe
new_data_types = new_data_types.reset_index()

#define column names
new_data_types.columns = ['variable', 'type']

#
print(new_data_types.head())

#Get variables that are type 'object'
object_vars = new_data_types[new_data_types['type'] == 'object']

#Get all types
new_data_types['type'] = new_data_types['type'].astype('category')

print(object_vars)


          variable     type
0               id    int64
1        member_id  float64
2        loan_amnt  float64
3      funded_amnt  float64
4  funded_amnt_inv  float64
                     variable    type
10                  emp_title  object
18                        url  object
19                       desc  object
21                      title  object
22                   zip_code  object
59  verification_status_joint  object


In [288]:
print(new_data_types['type'].cat.categories)

Index([         int64,        float64,       category,       category,
             category,         object,       category,       category,
             category, datetime64[ns],       category,       category,
             category,       category,       category,       category,
             category,       category,       category,       category,
             category,       category,       category],
      dtype='object')


In [297]:
# Convert emp_title to category or leave as oject

accepted_df_copy['emp_title'] = accepted_df_copy['emp_title'].astype('category')

#Check type of emp_title
accepted_df_copy['emp_title'].cat.categories

Index(['\tCFO', '\tMultimedia Supervisor', '\tSlot technician',
       '\tVP - Operations', ' ', ' \tASR II', ' \tAdv Mtr Proj Fld Rep',
       ' \tAuto Body Repair', ' \tDriver', ' \tEmployee Strategies Manager',
       ...
       'zueck transportation', 'zulily', '{Owner}Truck Driver',
       '| Principal Business Solution Architect|',
       'År.  Technical Illustrator', '​Associate Tech Support Analyst',
       '​Financial Analyst', '​License Compliance Investigator',
       '​Senior IT Field Support', '👨‍🍳 '],
      dtype='object', length=512694)

In [None]:
import re

def