### Team members: 

## Content
1. Introduction
2. Data preprocessing
3. Modelling
4. Data visualization

In [14]:
import keras, tensorflow, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.wrappers.scikit_learn import KerasRegressor

In [149]:
df = pd.read_csv('Credit.csv')
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


### Introduction


In [150]:
df.shape

(400, 11)

In [151]:
df.isnull().sum()

Income       0
Limit        0
Rating       0
Cards        0
Age          0
Education    0
Gender       0
Student      0
Married      0
Ethnicity    0
Balance      0
dtype: int64

### Data preprocessing

In [152]:
# Dummy variables
df_Ethnicity = pd.get_dummies(df.Ethnicity)

# Merging two dataframes by columns
df = pd.concat([df, df_Ethnicity], axis = 1)

# Using Label Encoder to convert categorical data into numerical
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['Student'] = le.fit_transform(df['Student'])
df['Married'] = le.fit_transform(df['Married'])

df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,African American,Asian,Caucasian
0,14.891,3606,283,2,34,11,0,0,1,Caucasian,333,0,0,1
1,106.025,6645,483,3,82,15,1,1,1,Asian,903,0,1,0
2,104.593,7075,514,4,71,11,0,0,0,Asian,580,0,1,0
3,148.924,9504,681,3,36,11,1,0,0,Asian,964,0,1,0
4,55.882,4897,357,2,68,16,0,0,1,Caucasian,331,0,0,1


In [153]:
# Checking the correlation of features corresponding to our target 
print(df.corr().Balance.sort_values(ascending = False))

Balance             1.000000
Rating              0.863625
Limit               0.861697
Income              0.463656
Student             0.259018
Cards               0.086456
Gender              0.021474
African American    0.013720
Age                 0.001835
Caucasian          -0.003288
Married            -0.005673
Education          -0.008062
Asian              -0.009812
Name: Balance, dtype: float64


In [154]:
# Dropping the unnecessary columns 
df = df.drop(['Asian','Education','Caucasian','Age','African American','Cards','Ethnicity','Gender','Student','Married'], axis = 1)

# Normalization
for x in df[['Income','Rating','Limit','Balance']]:
    df[x] = (df[x]-min(df[x]))/max(df[x]-min(df[x]))


KeyError: ignored

### Modelling

In [155]:
X = df.drop(['Balance'], axis = 1)
y = df['Balance']

# Writing out train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2 , random_state = 0)
 
X_train = np.array(X_train)
y_train = np.array(y_train)

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

model = Sequential()  
model.add(LSTM(50, return_sequences=True, activation='relu', input_shape = (X_train.shape[1], 1)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))



In [156]:
model.compile(optimizer='adam',loss='mean_squared_error', metrics = ['accuracy'])
model.fit(X_train, y_train, batch_size=1 ,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd838f2db50>

In [157]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_20 (LSTM)               (None, 3, 50)             10400     
_________________________________________________________________
lstm_21 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_20 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 26        
Total params: 31,901
Trainable params: 31,901
Non-trainable params: 0
_________________________________________________________________


In [158]:
# Converting data into numpy array
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Get predicted values for balance
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(mse)

0.006983087306360707


In [159]:
y_pred = model.predict(X_test)
y_pred = scaler.inverse_transform(y_pred)

In [160]:
train = data.loc[:training_data_len]
valid = data.loc[training_data_len:]
valid['Predicted balance'] = y_pred
valid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Balance,Predicted balance
320,5,877.562012
321,81,928.763000
322,265,526.294983
323,1999,887.340942
324,415,354.452148
...,...,...
395,560,372.254730
396,480,768.029785
397,138,1030.196655
398,0,555.878296


### Data visualization

In [None]:
plt.figure(figsize = (16,8))
plt.title('Model')
plt.xlabel('')