In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Part A ##

**Read the dataset file “Credit.csv”**

In [2]:
# Load the Credit dataset straight from the course's GitHub repository.
df = pd.read_csv(
    "https://github.com/mpourhoma/CS4661/raw/master/Credit.csv"
)

In [3]:
# Preview the first rows of the loaded frame to sanity-check the columns and values.
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Married,Balance
0,14.891,3606,283,2,34,11,1,333
1,106.025,6645,483,3,82,15,1,903
2,104.593,7075,514,4,71,11,0,580
3,148.924,9504,681,3,36,11,0,964
4,55.882,4897,357,2,68,16,1,331


## Part C ##

**Generate the feature matrix and target vector (target is “balance” in this dataset)**

In [4]:
# Feature matrix: every column of the dataset except the "Balance" target.
features = df.drop("Balance", axis="columns")

In [5]:
# Target vector: the "Balance" column on its own.
target = df.loc[:, "Balance"]

In [6]:
# Show a preview of the feature matrix, then the full target vector, each under its own label.
print("Features (X):", features.head(), sep="\n")

print("\nTarget (y):", target, sep="\n")

Features (X):
    Income  Limit  Rating  Cards  Age  Education  Married
0   14.891   3606     283      2   34         11        1
1  106.025   6645     483      3   82         15        1
2  104.593   7075     514      4   71         11        0
3  148.924   9504     681      3   36         11        0
4   55.882   4897     357      2   68         16        1

Target (y):
0      333
1      903
2      580
3      964
4      331
      ... 
395    560
396    480
397    138
398      0
399    966
Name: Balance, Length: 400, dtype: int64


## Part D ##

**Split the dataset into testing and training sets with the following parameters: test_size=0.24, random_state=9.**

In [7]:
# Hold out 24% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.24,
    random_state=9,
)

## Part E ##
**Now, normalize (scale) the features. To normalize the data, you can simply use StandardScaler  from sklearn. (note: don’t normalize the target!). Remember that we can only use the statistics of X_train for normalization, and then apply it on both X_train and X_test**

In [8]:
# Standardize the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to both splits, so nothing
# about the test set leaks into the normalization.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

## Part F ##
**Use Linear Regression to train a linear model on the training set. Check the coefficients of the linear regression model. Which feature is the most important? Which feature is the least important?**

In [9]:
# Create a linear regression model with default settings.
my_linreg = LinearRegression()

# Fit it on the standardized training features against the training balances.
my_linreg.fit(X=X_train_normalized, y=y_train)

In [10]:
# Print the fitted coefficients of the linear regression model.
# The model was fit on the standardized training features, so there is one
# coefficient per feature column, in the column order of `features`.
print("Coefficient: ", my_linreg.coef_)

Coefficient:  [-280.2104277   373.75795782  261.94540404   24.83810299  -17.50870489
   11.55110225   -9.90194703]


In [11]:
# Print the fitted intercept (bias term) of the linear regression model.
# NOTE: the printed label reads "Interception", but the value is the model's intercept_.
print("Interception: ", my_linreg.intercept_)

Interception:  519.8585526315788


## Which feature is the most important? Which feature is the least important? ##

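The most important feature is Limit. Because the features were standardized before fitting, importance is judged by the absolute value (magnitude) of each coefficient, not by its sign, and Limit has the largest magnitude (373.76), followed by Income (-280.21) and Rating (261.95).
<br>The least important feature is Married: its coefficient has the smallest magnitude (-9.90), so it has the weakest effect on the predicted balance.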

## Part G ##
**Predict “balance” for the users in testing set. Then, compare the predicted balance with the actual balance by calculating and reporting the RMSE (as we saw in lab tutorial 4)**

In [12]:
# Predict the balance for every held-out user from the standardized test features.
y_prediction = my_linreg.predict(X=X_test_normalized)

print(y_prediction)

[ 316.89813114  677.5093907   489.03669272  -49.82989971  706.42913216
  864.96954115  897.65752581  108.20297732 -213.73474279  834.94198677
   11.05109139 -227.34447869  472.01450822    4.54554458 -228.17349827
  956.21537845  896.91570524  821.97824267   18.37144232  857.31517013
 1028.30326279  699.22168497 1176.9426756   666.28578465  643.94064891
  690.63628696  589.35273903  -29.37237394  575.15444917  406.78468032
  845.81453327  828.30862904   82.2376206   953.40592514 -184.74828227
  525.84335889 1045.19205022  535.04104472  107.26753118 -172.8968111
  497.84037664 1155.35533939  429.08674601  403.49872552  143.72087313
  764.78737012  431.24114308 1304.79656588  475.00132057   95.75404622
 -215.69000296    5.62900657  313.88797405  859.9328932   792.27147016
 1029.90210771 1500.40180297 1078.48617624  713.37981473 1072.97948262
  -64.12187718  488.60063387   92.3601712   440.414465    277.13284667
 1075.41508762  258.10181743  773.89657462  278.12447916  409.51029492
  335.5

In [13]:
# Root Mean Squared Error (RMSE) of the hold-out predictions.
# `metrics` and `np` are imported in the notebook's top import cell.
# mean_squared_error gives the MSE; its square root expresses the error in the
# same units as the target (Balance).
mse = metrics.mean_squared_error(y_test, y_prediction)
rmse = np.sqrt(mse)

print(rmse)

143.2464973964228


## Part H ##
**Now, use 10-fold Cross-Validation to evaluate the performance of a linear regression in predicting the balance. Thus, rather than splitting the dataset into testing and training, use Cross-Validation to evaluate the regression performance. Note that the implementation of CV along with Normalization is a little tricky because we need to normalize (Scale) the data for each round of the CV individually! Fortunately, sklearn offers pipelines for these types of problems (below). What is the RMSE when you use cross validation?**

In [16]:
# Evaluate linear regression with 10-fold cross-validation on the full dataset.
# `make_pipeline` and `cross_val_score` are imported in the notebook's top import cell.
#
# Putting the scaler and the model in one pipeline means each CV round fits the
# StandardScaler on that round's training folds only, so no statistics from the
# validation fold leak into the normalization.
#
# A fresh LinearRegression() is used rather than the already-fitted `my_linreg`:
# make_pipeline stores the estimator object it is given, so a later `clf.fit(...)`
# would otherwise refit and silently overwrite the Part F model.
clf = make_pipeline(StandardScaler(), LinearRegression())

# With scoring='neg_mean_squared_error' each fold's score is the NEGATED MSE
# (scikit-learn scorers follow a "higher is better" convention).
mse_list = cross_val_score(clf, features, target, cv=10, scoring='neg_mean_squared_error')

In [17]:
print(mse_list)

[-23646.90415343 -32003.04401232 -35462.64435619 -37327.60719635
 -14341.32205939 -33628.37104224 -31631.99317834 -12491.00334951
 -20749.61212175 -23204.94743459]


In [18]:
# The CV scores are negated MSEs, so flip the sign to recover the per-fold MSE,
# take the square root of each fold to get per-fold RMSE, and report the average.
mse_list_pos = np.negative(mse_list)
rmse_list = np.sqrt(mse_list_pos)

print(np.mean(rmse_list))

160.3319891074414
