### Debt Prediction

#### a) Read dataset into pandas dataframe

In [26]:
import pandas as pd
df = pd.read_csv("https://github.com/mpourhoma/cs4661/raw/master/Credit.csv")

#### b) Check out the dataframe

In [27]:
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Married,Balance
0,14.891,3606,283,2,34,11,1,333
1,106.025,6645,483,3,82,15,1,903
2,104.593,7075,514,4,71,11,0,580
3,148.924,9504,681,3,36,11,0,964
4,55.882,4897,357,2,68,16,1,331


#### c) Generate feature matrix and target vector, normalize the feature matrix

In [28]:
import sklearn.preprocessing as pre_processor
X = df[list(df.drop(['Balance'], axis=1).columns)]
X = pre_processor.scale(X)

print(X)

y = df['Balance']
print(y)


[[-0.86158299 -0.48999879 -0.46553881 ... -1.2576741  -0.78492991
   0.79539491]
 [ 1.72743711  0.82826106  0.82870309 ...  1.5284506   0.49658831
   0.79539491]
 [ 1.68675551  1.01478681  1.02931059 ...  0.88996369 -0.78492991
  -1.25723711]
 ...
 [ 0.35946155 -0.24491264 -0.21963285 ...  0.65778663 -0.46455035
   0.79539491]
 [-0.21280808 -0.95891584 -1.05441888 ... -0.67723146 -0.1441708
   0.79539491]
 [-0.75334493  0.34199278  0.38866085 ...  0.48365384 -2.06644812
  -1.25723711]]
0       333
1       903
2       580
3       964
4       331
5      1151
6       203
7       872
8       279
9      1350
10     1407
11        0
12      204
13     1081
14      148
15        0
16        0
17      368
18      891
19     1048
20       89
21      968
22        0
23      411
24        0
25      671
26      654
27      467
28     1809
29      915
       ... 
370     992
371       0
372     840
373    1003
374     588
375    1000
376     767
377       0
378     717
379       0
380     661
381  

#### d) Split data set into testing and training sets with the following parameters:
- test_size=0.24
- random_state=9

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.24, random_state=9)


#### e) Train linear model on training set using linear regression
- Check coefficients
- Which feature is the most important?
- Which feature is least important?

In [30]:
from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# Printing theta 0
print(linear_regression.intercept_)
# Printing the coefficients 
print(list(zip(list(df.drop(['Balance'], axis=1).columns), linear_regression.coef_)))
# This tells us that Rating has the most sig impact for every +/-1 rating has a 400 fold impact
# Whereas the amount of Cards is the least sig impact on predicting debt

532.2638917311483
[('Income', -283.363121844603), ('Limit', 230.16428735859856), ('Rating', 409.0195946570877), ('Cards', -5.857147900534257), ('Age', -7.789636009613738), ('Education', 31.201276157397665), ('Married', -25.812396541416994)]


#### f) Predicting balance for users in testing set
- Compare the balance with the actual balance by cal and reporting the RMSE

In [31]:
from sklearn import metrics
import numpy as np
y_prediction = linear_regression.predict(X_test)
print(y_prediction)

mse = metrics.mean_squared_error(y_test, y_prediction)
rmse = np.sqrt(mse)
print(rmse)

[ 2.94819820e+02  7.12080730e+02  5.13342785e+02 -4.74492413e+01
  7.05446039e+02  8.63997768e+02  9.61184537e+02  1.62010505e+02
 -2.13074953e+02  8.63732086e+02  3.82193495e+01 -2.67920861e+02
  4.73537525e+02 -1.47563416e+01 -2.22751289e+02  9.77660223e+02
  8.90798146e+02  8.34387370e+02  7.25560422e+01  8.80196421e+02
  1.04471552e+03  7.67729379e+02  1.25213282e+03  7.20174819e+02
  7.45742288e+02  6.93416280e+02  5.94486550e+02 -8.93760512e+01
  5.93967073e+02  4.16055842e+02  8.88468441e+02  8.74532173e+02
  6.08560565e+01  9.44241094e+02 -2.15800917e+02  5.24585973e+02
  1.04868990e+03  4.58821682e+02  9.79281480e+01 -1.29014116e+02
  5.20697865e+02  1.18206831e+03  3.66927260e+02  3.56121239e+02
  8.16774926e+01  7.27380768e+02  4.02884591e+02  1.33679997e+03
  5.44286498e+02  1.24684319e+02 -2.14822101e+02  5.28877744e+00
  3.18689224e+02  8.73760344e+02  8.78664863e+02  1.05845857e+03
  1.53315645e+03  1.13145189e+03  7.17927140e+02  1.08378986e+03
 -6.71102932e+01  5.07504

In [33]:
from sklearn.model_selection import cross_val_score
mse_list = cross_val_score(LinearRegression(), X, y, scoring='neg_mean_squared_error')
print(np.sqrt(-mse_list).mean())

163.2743679627569


