In [1]:
# Data Imports
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Math
import math

# Plot imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Machine Learning Imports
from statsmodels.discrete import discrete_model

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# For evaluating our ML results
from sklearn import metrics

# Dataset Import
import statsmodels.api as sm

In [2]:
data = pd.read_stata('nels_small.dta')

In [3]:
data.head()

Unnamed: 0,psechoice,hscath,grades,faminc,famsiz,parcoll,female,black
0,2,0,9.08,62.5,5,0,0,0
1,2,0,8.31,42.5,4,0,1,0
2,3,0,7.42,62.5,4,0,1,0
3,3,0,7.42,62.5,4,0,1,0
4,3,0,7.42,62.5,4,0,1,0


In [4]:
data.psechoice.value_counts()

3    527
2    251
1    222
Name: psechoice, dtype: int64

# (a)

In [5]:
#generating college variable
data['college'] = data.psechoice.replace([1, 2, 3], [0, 1, 1])

In [6]:
print('\n Value counts for psechoice:\n')
print(data.psechoice.value_counts())

print('\n Value counts for college:\n')
print(data.college.value_counts())


 Value counts for psechoice:

3    527
2    251
1    222
Name: psechoice, dtype: int64

 Value counts for college:

1    778
0    222
Name: college, dtype: int64


In [7]:
print('The percentage of the high school graduates attended college is', 778/(778+222)*100, '$')

The percentage of the high school graduates attended college is 77.8 $


In [8]:
data.grades.min()

1.74

# (b)

In [9]:
y = data.college
X = data.drop(['psechoice', 'college'], axis=1)

In [10]:
probit = sm.Probit(y, X)
results_p = probit.fit()

results_p.summary()

         Current function value: 0.465465
         Iterations: 35




0,1,2,3
Dep. Variable:,college,No. Observations:,1000.0
Model:,Probit,Df Residuals:,993.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 01 May 2019",Pseudo R-squ.:,0.1208
Time:,14:10:11,Log-Likelihood:,-465.46
converged:,False,LL-Null:,-529.43
,,LLR p-value:,3.512e-25

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
hscath,6.5000,6.27e+04,0.000,1.000,-1.23e+05,1.23e+05
grades,-0.1030,0.017,-6.220,0.000,-0.135,-0.071
faminc,0.0085,0.002,4.728,0.000,0.005,0.012
famsiz,0.1628,0.030,5.339,0.000,0.103,0.223
parcoll,0.6527,0.132,4.946,0.000,0.394,0.911
female,0.3183,0.092,3.460,0.001,0.138,0.499
black,0.5187,0.214,2.422,0.015,0.099,0.938



# (c)

In [11]:
grade_5 = 5
grade_10 = 10
faminc = np.mean(X.faminc)
famsize = 5

In [12]:
print('The probability of attending college for a black female with GRADES = 5, FAMINC = sample mean, from a household with five members, with a parent who attended college is ')
results_p.params[1]*grade_5 + results_p.params[2]*faminc + results_p.params[3]*famsize + results_p.params[4] + results_p.params[5] + results_p.params[6]

The probability of attending college for a black female with GRADES = 5, FAMINC = sample mean, from a household with five members, with a parent who attended college is 


2.223245729786343

In [13]:
print('The probability of attending college for a black female with GRADES = 10, FAMINC = sample mean, from a household with five members, with a parent who attended college is ')
results_p.params[1]*grade_10 + results_p.params[2]*faminc + results_p.params[3]*famsize + results_p.params[4] + results_p.params[5] + results_p.params[6]

The probability of attending college for a black female with GRADES = 10, FAMINC = sample mean, from a household with five members, with a parent who attended college is 


1.708113898753711

# (d)

In [14]:
print('The probability of attending college for a white female with GRADES = 5, FAMINC = sample mean, from a household with five members, with a parent who attended college is ')
results_p.params[1]*grade_5 + results_p.params[2]*faminc + results_p.params[3]*famsize + results_p.params[4] + results_p.params[5]

The probability of attending college for a white female with GRADES = 5, FAMINC = sample mean, from a household with five members, with a parent who attended college is 


1.7045116000119414

In [15]:
print('The probability of attending college for a white male with GRADES = 5, FAMINC = sample mean, from a household with five members, with a parent who attended college is ')
results_p.params[1]*grade_5 + results_p.params[2]*faminc + results_p.params[3]*famsize + results_p.params[4]

The probability of attending college for a white male with GRADES = 5, FAMINC = sample mean, from a household with five members, with a parent who attended college is 


1.3862141265927783

In [16]:
print('The probability of attending college for a white female with GRADES = 10, FAMINC = sample mean, from a household with five members, with a parent who attended college is ')
results_p.params[1]*grade_10 + results_p.params[2]*faminc + results_p.params[3]*famsize + results_p.params[4] + results_p.params[5]

The probability of attending college for a white female with GRADES = 10, FAMINC = sample mean, from a household with five members, with a parent who attended college is 


1.1893797689793093

In [17]:
print('The probability of attending college for a white male with GRADES = 10, FAMINC = sample mean, from a household with five members, with a parent who attended college is ')
results_p.params[1]*grade_10 + results_p.params[2]*faminc + results_p.params[3]*famsize + results_p.params[4]

The probability of attending college for a white male with GRADES = 10, FAMINC = sample mean, from a household with five members, with a parent who attended college is 


0.8710822955601462

# (e)

In [18]:
X_new = data.drop(['psechoice', 'college', 'parcoll', 'black', 'female'], axis=1)
probit_new = sm.Probit(y, X_new)
results_p_new = probit_new.fit()

results_p_new.summary()

         Current function value: 0.487337
         Iterations: 35




0,1,2,3
Dep. Variable:,college,No. Observations:,1000.0
Model:,Probit,Df Residuals:,996.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 01 May 2019",Pseudo R-squ.:,0.0795
Time:,14:10:12,Log-Likelihood:,-487.34
converged:,False,LL-Null:,-529.43
,,LLR p-value:,3.889e-18

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
hscath,6.6495,3.28e+04,0.000,1.000,-6.42e+04,6.42e+04
grades,-0.0991,0.016,-6.311,0.000,-0.130,-0.068
faminc,0.0123,0.002,7.569,0.000,0.009,0.015
famsiz,0.1889,0.029,6.507,0.000,0.132,0.246
