In [5]:
#!pip install pandas boto3 sklearn

In [1]:
import pandas as pd
import boto3
from io import StringIO
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.datasets import dump_svmlight_file
from sagemaker.amazon.amazon_estimator import get_image_uri
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


### AWS Information

In [2]:
aws_access_key_id = ''
aws_secret_access_key = ''
bucket_name = 'testbucketforcreditproject'
file_key = 'defaultofcreditcardclients.csv'

### Connect AWS & Get Data

In [3]:
s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

In [24]:
s3_response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
csv_content = s3_response['Body'].read().decode('utf-8')

In [5]:
#Create Dataframe
df = pd.read_csv(StringIO(csv_content))

### Dataset Analysis

In [6]:
df.shape

(30000, 25)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          30000 non-null  int64
 1   LIMIT_BAL                   30000 non-null  int64
 2   SEX                         30000 non-null  int64
 3   EDUCATION                   30000 non-null  int64
 4   MARRIAGE                    30000 non-null  int64
 5   AGE                         30000 non-null  int64
 6   PAY_0                       30000 non-null  int64
 7   PAY_2                       30000 non-null  int64
 8   PAY_3                       30000 non-null  int64
 9   PAY_4                       30000 non-null  int64
 10  PAY_5                       30000 non-null  int64
 11  PAY_6                       30000 non-null  int64
 12  BILL_AMT1                   30000 non-null  int64
 13  BILL_AMT2                   30000 non-null  int64
 14  BILL_A

In [8]:
df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [9]:
df.nunique()

ID                            30000
LIMIT_BAL                        81
SEX                               2
EDUCATION                         7
MARRIAGE                          4
AGE                              56
PAY_0                            11
PAY_2                            11
PAY_3                            11
PAY_4                            11
PAY_5                            10
PAY_6                            10
BILL_AMT1                     22723
BILL_AMT2                     22346
BILL_AMT3                     22026
BILL_AMT4                     21548
BILL_AMT5                     21010
BILL_AMT6                     20604
PAY_AMT1                       7943
PAY_AMT2                       7899
PAY_AMT3                       7518
PAY_AMT4                       6937
PAY_AMT5                       6897
PAY_AMT6                       6939
default payment next month        2
dtype: int64

### Feature Engineering

Sex: Convert to Boolean, representing Male.

Education: Replace zeroes and missing values with 'Others,' then perform one-hot encoding.

Marriage: Convert 0 and missing values to 'Married,' and then represent marital status using Boolean values.

Age: Cap entries exceeding 80 to 80, then normalize the entire column by dividing by 80.

PAY_0 - PAY_6: Adjust the range of -2 to 8 by adding 2, then normalize by dividing by 10 to obtain values between 0 and 1.

In [10]:
sex_d = {'male': True,'female': False}
df['SEX']=df['SEX'].replace(sex_d)
df['SEX'] = df['SEX'].astype(int)
df['SEX'].head(10)

0    2
1    2
2    2
3    2
4    1
5    1
6    1
7    2
8    2
9    1
Name: SEX, dtype: int64

In [11]:
df['EDUCATION'].value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

In [12]:
df['EDUCATION']=df['EDUCATION'].fillna('others')
edu_d={'0':'others'}
df['EDUCATION']=df['EDUCATION'].replace(edu_d)
df['EDUCATION'].value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

In [13]:
encoder=OneHotEncoder(sparse_output=False)
df_encoded = pd.DataFrame (encoder.fit_transform(df[['EDUCATION']]))
df_encoded.columns = encoder.get_feature_names_out(['EDUCATION'])
df.drop(['EDUCATION'] ,axis=1, inplace=True)
df_OH= pd.concat([df, df_encoded ], axis=1)
df = df_OH
df.head(5)

Unnamed: 0,ID,LIMIT_BAL,SEX,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6
0,1,20000,2,1,24,2,2,-1,-1,-2,...,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,120000,2,2,26,-1,2,0,0,0,...,0,2000,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,90000,2,2,34,0,0,0,0,0,...,1000,5000,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,50000,2,1,37,0,0,0,0,0,...,1069,1000,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,50000,1,1,57,-1,0,-1,0,0,...,689,679,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
df['MARRIAGE'].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

In [15]:
df['MARRIAGE'].unique()

array([1, 2, 3, 0])

In [16]:
df['MARRIAGE']=df['MARRIAGE'].fillna('married')
mar_d = {'married': True,'0': True, 'single': False}
df['MARRIAGE']=df['MARRIAGE'].replace(mar_d)
df['MARRIAGE'].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

In [17]:
df['AGE'] = np.where(df['AGE'] > 80, 80, df['AGE'])

In [18]:
df['AGE']=df['AGE']/80
df['AGE'].head(10)

0    0.3000
1    0.3250
2    0.4250
3    0.4625
4    0.7125
5    0.4625
6    0.3625
7    0.2875
8    0.3500
9    0.4375
Name: AGE, dtype: float64

In [19]:
for col in df.columns:
    print(col)

ID
LIMIT_BAL
SEX
MARRIAGE
AGE
PAY_0
PAY_2
PAY_3
PAY_4
PAY_5
PAY_6
BILL_AMT1
BILL_AMT2
BILL_AMT3
BILL_AMT4
BILL_AMT5
BILL_AMT6
PAY_AMT1
PAY_AMT2
PAY_AMT3
PAY_AMT4
PAY_AMT5
PAY_AMT6
default payment next month
EDUCATION_0
EDUCATION_1
EDUCATION_2
EDUCATION_3
EDUCATION_4
EDUCATION_5
EDUCATION_6


### Prepare Data for model

In [20]:
columns_titles = ["default payment next month", "ID", "LIMIT_BAL",
                  "SEX", "MARRIAGE", "AGE",
                  "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
                  "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
                  "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
                  "PAY_AMT1", "PAY_AMT2", "PAY_AMT3",
                  "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
                  "EDUCATION_0", "EDUCATION_1", "EDUCATION_2",
                  "EDUCATION_3", "EDUCATION_4", "EDUCATION_5",
                  "EDUCATION_6"
                 ]
df = df.reindex(columns=columns_titles)

In [21]:
train, testandval = train_test_split(df, test_size=0.2)
test, validate = train_test_split(testandval, test_size=0.5)

### Save Data to s3

In [28]:
s3_resource = boto3.Session().resource('s3')

In [29]:
def upload_s3_csv(filename, dataframe):
    csv_buffer = StringIO()
    dataframe.to_csv(csv_buffer, index=False)
    s3_resource.Object(bucket_name, filename).put(Body=csv_buffer.getvalue())

In [30]:
upload_s3_csv('train_data.csv', train)
upload_s3_csv('test_data.csv', test)
upload_s3_csv('validate.csv', validate)