# DS-SF-25 | Lab 07 | Introduction to Regression and Model Fit, Part 2

In [33]:
import os

import pandas as pd
# Notebook display options: cap rendered tables at 10 rows and 10 columns
# (longer/wider frames are shown truncated) and use the HTML table repr.
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

from sklearn import feature_selection, linear_model

In [34]:
# Load the credit dataset; the path is assembled with os.path.join so the
# separator is correct on every platform.
df = pd.read_csv(
    os.path.join('..', 'datasets', 'credit.csv')
)

In [35]:
# Preview the loaded frame (the rendering is truncated by the display options set above).
df

Unnamed: 0,Income,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,14.891,283,2,34,11,Male,No,Yes,Caucasian,333
1,106.025,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.593,514,4,71,11,Male,No,No,Asian,580
3,148.924,681,3,36,11,Female,No,No,Asian,964
4,55.882,357,2,68,16,Male,No,Yes,Caucasian,331
...,...,...,...,...,...,...,...,...,...,...
395,12.096,307,3,32,13,Male,No,Yes,Caucasian,560
396,13.364,296,5,65,17,Male,No,No,African American,480
397,57.872,321,5,67,12,Female,No,Yes,Caucasian,138
398,37.728,192,1,44,13,Male,No,Yes,Caucasian,0


A description of the dataset is as follows:

- Income (in thousands of dollars)
- Rating: Credit score rating
- Cards: Number of Credit cards owned
- Age
- Education: Years of Education
- Gender: Male/Female
- Student: Yes/No
- Married: Yes/No
- Ethnicity: African American/Asian/Caucasian
- Balance: Average credit card debt

> ## Question 1.  Let's explore the quantitative variables that affect `Balance`.  From your preliminary analysis, which 2 variables seem to affect `Balance` the most?  Our goal is interpretation; can we use these 2 variables simultaneously?  Why or why not?

In [36]:
# Pairwise correlations between the quantitative columns.
# The numeric columns are selected explicitly: on pandas >= 2.0 `DataFrame.corr()`
# no longer skips string columns such as `Gender` and raises instead.
#
# Observation: Rating and Income have the strongest correlation with Balance,
# but these two variables are also strongly correlated with each other.
df.select_dtypes(include=['number']).corr()

Unnamed: 0,Income,Rating,Cards,Age,Education,Balance
Income,1.0,0.791378,-0.018273,0.175338,-0.027692,0.463656
Rating,0.791378,1.0,0.053239,0.103165,-0.030136,0.863625
Cards,-0.018273,0.053239,1.0,0.042948,-0.051084,0.086456
Age,0.175338,0.103165,0.042948,1.0,0.003619,0.001835
Education,-0.027692,-0.030136,-0.051084,0.003619,1.0,-0.008062
Balance,0.463656,0.863625,0.086456,0.001835,-0.008062,1.0


Answer: TODO

> ## Question 2.  `Ethnicity` (race), `Gender`, `Married`, and `Student` are categorical variables.  Go ahead and create dummy variables for all of them.

In [37]:
# Gender -> indicator column `female`: 1 where Gender is 'Female', 0 otherwise.
# A single vectorized comparison replaces the "fill with 0, then overwrite" steps.
df['female'] = (df['Gender'] == 'Female').astype('int64')
# print() with one argument behaves the same on Python 2 and Python 3.
print(df['female'])

0      0
1      1
2      0
3      1
4      0
      ..
395    0
396    0
397    1
398    0
399    1
Name: female, dtype: int64


In [38]:
# Married -> indicator column `Married_new`: 1 where Married is 'Yes', 0 otherwise.
# A single vectorized comparison replaces the "fill with 0, then overwrite" steps.
df['Married_new'] = (df['Married'] == 'Yes').astype('int64')
# print() with one argument behaves the same on Python 2 and Python 3.
print(df['Married_new'])

0      1
1      1
2      0
3      0
4      1
      ..
395    1
396    0
397    1
398    1
399    0
Name: Married_new, dtype: int64


In [39]:
# Student -> indicator column `Student_new`: 1 where Student is 'Yes', 0 otherwise.
# A single vectorized comparison replaces the "fill with 0, then overwrite" steps.
df['Student_new'] = (df['Student'] == 'Yes').astype('int64')
# print() with one argument behaves the same on Python 2 and Python 3.
print(df['Student_new'])

0      0
1      1
2      0
3      0
4      0
      ..
395    0
396    0
397    0
398    0
399    0
Name: Student_new, dtype: int64


In [40]:
# One indicator column per `Ethnicity` level, named `Race_<level>`.
# Cast to float so the dummies have the same dtype on every pandas version
# (older releases return floats, newer ones return uint8 or bool).
race_df = pd.get_dummies(df['Ethnicity'], prefix='Race').astype(float)

# NOTE: not idempotent -- re-running this cell on the already-joined frame
# fails because the Race_* columns would overlap.
df = df.join(race_df)

In [41]:
# Show the first rows to check the frame after the indicator columns were added.
df.head()

Unnamed: 0,Income,Rating,Cards,Age,Education,...,Married_new,Student_new,Race_African American,Race_Asian,Race_Caucasian
0,14.891,283,2,34,11,...,1,0,0.0,0.0,1.0
1,106.025,483,3,82,15,...,1,1,0.0,1.0,0.0
2,104.593,514,4,71,11,...,0,0,0.0,1.0,0.0
3,148.924,681,3,36,11,...,0,0,0.0,1.0,0.0
4,55.882,357,2,68,16,...,1,0,0.0,0.0,1.0


> ## Question 3.  Using sklearn and a linear regression, predict `Balance` using `Income`, `Cards`, `Age`, `Education`, `Gender`, and `Ethnicity` (race).

First, find the coefficients of your regression line.

In [44]:
# Predictors for Question 3. `Race_African American` is left out of the list,
# so it acts as the baseline level for the two remaining ethnicity dummies.
X = df[['Income', 'Cards', 'Age', 'Education', 'female', 'Race_Asian', 'Race_Caucasian']]
y = df['Balance']

model = linear_model.LinearRegression()
model.fit(X, y)

# Intercept first, then one coefficient per column of X (same order as above).
# print() with one argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)

230.042354393
[  6.27995894  33.62953508  -2.32970547   1.64553607  27.12543123
  -6.54603078   3.47497641]


Then, find the p-values of your model's F-values.  Since you have several variables, try to show each p-value alongside the name of its variable.

In [46]:
# Pair every predictor name with its p-value (element [1] of f_regression's result).
# list(...) is required on Python 3, where zip() returns a lazy iterator that
# would otherwise be displayed as "<zip object ...>".
# NOTE(review): f_regression tests each column against y on its own (univariate
# F-test), so these are not the p-values of the coefficients in the joint model.
list(zip(X.columns.values, feature_selection.f_regression(X, y)[1]))

[('Income', 1.0308858025893513e-22),
 ('Cards', 0.084176555599370956),
 ('Age', 0.97081387233013317),
 ('Education', 0.87230640156710226),
 ('female', 0.66851610550260099),
 ('Race_Asian', 0.84489564436221742),
 ('Race_Caucasian', 0.94772751139663791)]

> ## Question 4.  Which of your coefficients are significant at the 5% significance level?

Answer: Income only

> ## Question 5.  What is your model's $R^2$?

In [47]:
# R^2 of the fitted model, evaluated on the same X and y it was fitted on.
model.score(X,y)

0.23231260833540465

> ## Question 6.  How do we interpret this value?

Answer: about 23% of the variation of y is explained by the model.

> ## Question 7.  Now let's focus on the two most significant variables from your previous model and re-run your regression model.

In [48]:
# Refit using only the two predictors with the smallest p-values above.
X = df[['Income', 'Cards']]
y = df['Balance']

model = linear_model.LinearRegression()
model.fit(X, y)

# Intercept, then the coefficients in the order [Income, Cards].
# print() with one argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)

151.329946349
[  6.07099859  31.83812895]


> ## Question 8.  In comparison to the previous model, did the $R^2$ increase or decrease?  Why?

In [49]:
# R^2 of the two-predictor model, evaluated on the same X and y it was fitted on.
model.score(X , y)

0.22399175162249518

Answer: TODO

> ## Question 9.  Now let's regress `Balance` on `Gender` alone.  After running your linear regression, do you have enough evidence to claim that females carry a higher balance than males?  (Hint: Look at the p-value of the Gender coefficient.  If it is significant, then you have evidence to support that claim; otherwise you cannot support the statement.)

In [52]:
# Regress Balance on the gender indicator alone.
X = df[['female']]
y = df['Balance']

model = linear_model.LinearRegression()
model.fit(X, y)

# Intercept, the `female` coefficient, then the p-value of the F-test for `female`.
# print() with one argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)
print(feature_selection.f_regression(X, y)[1])

509.803108808
[ 19.73312308]
[ 0.66851611]


Answer: (If your answer is yes, interpret the results).  TODO

> ## Question 10.  Now let's regress `Balance` on `Ethnicity`.  After running your linear regressions, do you have enough evidence to claim that some ethnic groups carry more balance than others?

In [54]:
# Regress Balance on the ethnicity dummies; `Race_African American` is left out,
# so the coefficients are differences relative to that group.
X = df[['Race_Asian', 'Race_Caucasian']]
y = df['Balance']

model = linear_model.LinearRegression()
model.fit(X, y)

# Intercept, the coefficients [Race_Asian, Race_Caucasian], then their p-values.
# print() with one argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)
print(feature_selection.f_regression(X, y)[1])

531.0
[-18.68627451 -12.50251256]
[ 0.84489564  0.94772751]


Answer: (If your answer is yes, interpret the results).  TODO

> ## Question 11.  Finally let's regress `Balance` on `Student`.  After running your linear regressions, do you have enough evidence to claim that students carry more balance than non-students?

In [55]:
# Regress Balance on the student indicator alone.
X = df[['Student_new']]
y = df['Balance']

model = linear_model.LinearRegression()
model.fit(X, y)

# Intercept, the `Student_new` coefficient, then the p-value of its F-test.
# print() with one argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)
print(feature_selection.f_regression(X, y)[1])

480.369444444
[ 396.45555556]
[  1.48773411e-07]


Answer: (If your answer is yes, interpret the results).  TODO

> ## Question 12.  Now let's consider the effect of `Student` and `Income` on `Balance` simultaneously.  Are all the coefficients significant?

In [56]:
# Regress Balance on Income and the student indicator together.
X = df[['Income', 'Student_new']]
y = df['Balance']

model = linear_model.LinearRegression()
model.fit(X, y)

# Intercept, then the coefficients in the order [Income, Student_new].
# print() with one argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)
# NOTE(review): f_regression tests each column against y on its own, so these
# p-values are the single-predictor ones and do not change with the joint model.
print(feature_selection.f_regression(X, y)[1])

211.142964398
[   5.98433557  382.67053884]
[  1.03088580e-22   1.48773411e-07]


Answer: (If your answer is yes, interpret the results).  TODO

> ## Question 13.  Now let's consider the interaction effect of `Student` and `Income` on `Balance` simultaneously.  Are all the coefficients significant?  If they are, write down your regression model below.

(First generate a new variable for the interaction term)

In [63]:
# Interaction term: equals Income for students and 0 for non-students.
df['interact'] = df['Income'] * df['Student_new']
X = df[['Income', 'Student_new', 'interact']]
y = df['Balance']

model = linear_model.LinearRegression()
model.fit(X, y)

# Intercept, then the coefficients in the order [Income, Student_new, interact].
# print() with one argument behaves the same on Python 2 and Python 3.
print(model.intercept_)
print(model.coef_)
# NOTE(review): f_regression tests each column against y on its own, so these
# are univariate p-values rather than p-values of the joint model's coefficients.
print(feature_selection.f_regression(X, y)[1])

200.62315295
[   6.21816874  476.67584321   -1.99915087]
[  1.03088580e-22   1.48773411e-07   4.61768368e-08]


Answer: TODO

> ## Question 14.  Is there any income level at which students and non-students on average carry same level of balance?

In [64]:
# Answer: students and non-students have the same predicted balance at the
# income where the student effect cancels out:
#     coef[Student_new] + coef[interact] * Income = 0
# The coefficients are read from the fitted interaction model (column order
# [Income, Student_new, interact]) instead of being retyped from its printed,
# rounded output. Income is in thousands of dollars.
# NOTE: `model` must still be the interaction model fitted in the previous cell.
print(-model.coef_[1] / model.coef_[2])

238.439154525
