In [271]:
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

conn = create_engine('postgresql://ubuntu@52.53.236.232:5432/collegesc')

%matplotlib inline

## Helper functions and other definitions

In [54]:
def query_data_def(conn, var_name):
    """
    var_name: name of column to query
    returns query from datadefinitions table showing definition of data
    """
    query = "SELECT * FROM datadefinitions WHERE developername='"+var_name+"';"
    return conn.execute(query).fetchall()

## Start with school table

In [211]:
school_all = """SELECT * FROM school;"""

In [212]:
school_df = pd.read_sql_query(school_all, conn)

In [213]:
school_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 43 columns):
index                                   7703 non-null int64
id                                      7703 non-null int64
location.lat                            7703 non-null float64
location.lon                            7703 non-null float64
accreditor                              6805 non-null object
alias                                   2178 non-null object
branches                                7703 non-null int64
carnegie_basic                          7282 non-null float64
carnegie_size_setting                   7282 non-null float64
carnegie_undergrad                      7282 non-null float64
city                                    7703 non-null object
degree_urbanization                     0 non-null object
degrees_awarded.highest                 7703 non-null int64
degrees_awarded.predominant             7703 non-null int64
degrees_awarded.predominant_recoded     0 

In [214]:
school_df.under_investigation.value_counts()

0    7467
1     236
Name: under_investigation, dtype: int64

### Remove uninteresting columns or columns with explicit or encoded nulls

In [215]:
school_columns = list(school_df.columns.values)

In [216]:
school_columns.remove('index')
school_columns.remove('alias')
school_columns.remove('degree_urbanization')
school_columns.remove('degrees_awarded.predominant_recoded')
school_columns.remove('faculty_salary')
school_columns.remove('ft_faculty_rate')
school_columns.remove('price_calculator_url')
school_columns.remove('url')
school_columns.remove('zip')
school_columns.remove('carnegie_basic')
school_columns.remove('carnegie_size_setting')
school_columns.remove('carnegie_undergrad')
school_columns.remove('online_only')

## Select only interesting features and add/recode features

In [217]:
operating_query = 'SELECT \n"' + '","'.join(school_columns) + '"'
operating_query += """                       
                    FROM 
                        school"""

In [218]:
school_df = pd.read_sql_query(operating_query, conn)

In [219]:
school_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 30 columns):
id                                      7703 non-null int64
location.lat                            7703 non-null float64
location.lon                            7703 non-null float64
accreditor                              6805 non-null object
branches                                7703 non-null int64
city                                    7703 non-null object
degrees_awarded.highest                 7703 non-null int64
degrees_awarded.predominant             7703 non-null int64
institutional_characteristics.level     7703 non-null int64
instructional_expenditure_per_fte       7270 non-null float64
locale                                  7282 non-null float64
main_campus                             7703 non-null int64
men_only                                7282 non-null float64
minority_serving.aanipi                 7282 non-null float64
minority_serving.annh                  

### Add whether college is accredited or not

In [220]:
school_df['accredited'] = school_df.accreditor.map(lambda x: 1 if x else 0)

In [221]:
school_df.accredited.value_counts()

1    6805
0     898
Name: accredited, dtype: int64

### Recode locale to 4 values instead of 12

In [222]:
locale_recode = school_df.locale.astype(str).values

In [223]:
locale_recode = [int(x[0]) if x[0] in ['1','2','3'] else 0 for x in locale_recode]

In [224]:
school_df['locale_recode'] = locale_recode

In [225]:
school_df['locale_recode'].value_counts()

1    3509
2    2291
0     986
3     917
Name: locale_recode, dtype: int64

### Recode religion to religious and non-religious

In [226]:
religion_recode = (np.nan_to_num(school_df.religious_affiliation.values) > 0).astype(int)

In [227]:
np.unique(religion_recode)

array([0, 1])

In [228]:
school_df['religion_recode'] = religion_recode

In [229]:
school_df.religion_recode.value_counts()

0    6793
1     910
Name: religion_recode, dtype: int64

### Check other variables

In [197]:
for column in school_df.columns:
    print(column+" value counts")
    print(school_df[column].value_counts())

id value counts
217086      1
216667      1
177588      1
449977      1
179645      1
236993      1
476610      1
107974      1
171465      1
150987      1
130509      1
169424      1
155089      1
156417      1
226772      1
452054      1
191959      1
161244      1
213491      1
480736      1
175591      1
417257      1
181738      1
194028      1
409069      1
372329      1
431600      1
230898      1
152336      1
478643      1
           ..
23368407    1
197285      1
449214      1
187046      1
189097      1
203438      1
418481      1
190044      1
139393      1
201399      1
261773      1
467641      1
457402      1
461500      1
459453      1
476902      1
127699      1
230852      1
213215      1
209603      1
182980      1
16947910    1
447175      1
174792      1
205513      1
228042      1
406219      1
16947918    1
234191      1
149505      1
Name: id, Length: 7703, dtype: int64
location.lat value counts
0.000000     421
40.760050      2
38.655362      2
32.955170      2

### Recode gender-specific schools

In [230]:
school_df['gender_specific'] = np.nan_to_num((school_df.men_only + school_df.women_only).values)

In [231]:
school_df.gender_specific.value_counts()

0.0    7598
1.0     105
Name: gender_specific, dtype: int64

### Recode minority-serving schools

In [232]:
minority_specific = np.nan_to_num((school_df['minority_serving.aanipi'] +\
                        school_df['minority_serving.annh'] +\
                        school_df['minority_serving.hispanic'] +\
                        school_df['minority_serving.historically_black'] +\
                        school_df['minority_serving.nant'] +\
                        school_df['minority_serving.predominantly_black'] +\
                        school_df['minority_serving.tribal']).values)

In [233]:
minority_specific = (minority_specific > 0).astype(int)

In [234]:
school_df['minority_specific'] = minority_specific

### Recode for-profit vs. non-profit (private or public) for initial classification

In [247]:
school_df['for_profit'] = (school_df.ownership == 3).astype(int)

In [250]:
school_df.for_profit.value_counts()

0    4000
1    3703
Name: for_profit, dtype: int64

In [251]:
school_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 36 columns):
id                                      7703 non-null int64
location.lat                            7703 non-null float64
location.lon                            7703 non-null float64
accreditor                              6805 non-null object
branches                                7703 non-null int64
city                                    7703 non-null object
degrees_awarded.highest                 7703 non-null int64
degrees_awarded.predominant             7703 non-null int64
institutional_characteristics.level     7703 non-null int64
instructional_expenditure_per_fte       7270 non-null float64
locale                                  7282 non-null float64
main_campus                             7703 non-null int64
men_only                                7282 non-null float64
minority_serving.aanipi                 7282 non-null float64
minority_serving.annh                  

### Pare down to relevant columns and remove nulls

In [253]:
cleaned_school_df = school_df.drop(['accreditor', 'locale', 'men_only',
                                    'minority_serving.aanipi', 'minority_serving.annh',
                                    'minority_serving.hispanic', 'minority_serving.historically_black',
                                    'minority_serving.nant', 'minority_serving.predominantly_black',
                                    'minority_serving.tribal', 'religious_affiliation', 'state', 
                                    'state_fips','women_only'], axis = 1)

In [256]:
cleaned_school_df = cleaned_school_df.loc[cleaned_school_df.tuition_revenue_per_fte.notnull()]

In [257]:
cleaned_school_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7270 entries, 0 to 7702
Data columns (total 22 columns):
id                                     7270 non-null int64
location.lat                           7270 non-null float64
location.lon                           7270 non-null float64
branches                               7270 non-null int64
city                                   7270 non-null object
degrees_awarded.highest                7270 non-null int64
degrees_awarded.predominant            7270 non-null int64
institutional_characteristics.level    7270 non-null int64
instructional_expenditure_per_fte      7270 non-null float64
main_campus                            7270 non-null int64
name                                   7270 non-null object
operating                              7270 non-null int64
ownership                              7270 non-null int64
region_id                              7270 non-null int64
tuition_revenue_per_fte                7270 non-null float6

## Select columns for classification


In [259]:
X = cleaned_school_df.drop(['id', 'city', 'name', 'ownership', 'region_id', 'for_profit'], axis = 1)
y = cleaned_school_df.for_profit

In [262]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [263]:
log_model = LogisticRegression(penalty='l1')

In [264]:
log_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [269]:
pred_ownership = log_model.predict(X_test)

In [270]:
accuracy_score(y_test, pred_ownership)

0.91746905089408526

In [272]:
precision_score(y_test, pred_ownership)

0.89914772727272729

In [273]:
recall_score(y_test, pred_ownership)

0.92815249266862165

In [274]:
f1_score(y_test, pred_ownership)

0.91341991341991347

In [276]:
list(zip(X.columns, log_model.coef_[0]))

[('location.lat', -0.056715394249809988),
 ('location.lon', -0.018223139969331177),
 ('branches', 0.10757523176757297),
 ('degrees_awarded.highest', -0.45152923649317228),
 ('degrees_awarded.predominant', -0.99718814418267798),
 ('institutional_characteristics.level', 0.6739526444619196),
 ('instructional_expenditure_per_fte', -0.00037524157493103427),
 ('main_campus', -0.57040602366387405),
 ('operating', -2.3910830362947517),
 ('tuition_revenue_per_fte', 0.00027740487284229592),
 ('under_investigation', 1.2372291616364712),
 ('accredited', 0.91363119510898316),
 ('locale_recode', 0.063109823159900727),
 ('religion_recode', -5.5769982494035464),
 ('gender_specific', -0.13043050134489137),
 ('minority_specific', -5.948137078137365)]