In [1]:
# Import necessary libraries
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
from time import sleep
import os
import numpy as np
from sklearn.metrics import classification_report

from amb_sdk.sdk import DarwinSdk
import datetime
ts = '{:%Y%m%d%H%M%S}'.format(datetime.datetime.now())

In [2]:
# Login
from login import username, password
ds = DarwinSdk()
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')
status, msg = ds.auth_login_user(username, password)

if not status:
    print(msg)
else:
    print('You are logged in!')

You are logged in!


In [3]:
def get_formatted_dataframe(xls_file, year):
    df = pd.read_excel(xls_file)
    df = df[['DZRATING', 'DZCAMPUS','DPETALLC','DA0AT' + str(year) + 'R', 'DA0912DR' + str(year) + 'R','DAGC4X' + str(year) + 'R','DAGC5X' + str(year - 1) + 'R','DA0GR' + str(year) + 'N','DA0CT' + str(year) + 'R','DA0CC' + str(year) + 'R','DA0CSA' + str(year) + 'R','DA0CAA' + str(year) + 'R','DPSTTOSA','DPSTEXPA','DPFRAALLT','DPFRAALLK','DPFRASTAP','DZRVLOCP','DPFRAFEDP','DPFEAINST','DPFEAINSK','DISTSIZE','COMMTYPE', 'PROPWLTH', 'TAXRATE']]
    df = df.rename(index=str, columns = {"DZRATING":"rating", "DZCAMPUS":"num_schools", "DPETALLC":"num_students", "DA0AT" + str(year) + "R":"attendance_rate", "DA0912DR" + str(year) + "R":"dropout_rate", "DAGC4X" + str(year) + "R":"grad_rate_4_year", "DAGC5X" + str(year - 1) + "R":"grad_rate_5_year", "DA0GR" + str(year) + "N":"annual_grad_count", "DA0CT" + str(year) + "R":"college_admissions_per_tested", "DA0CC" + str(year) + "R":"college_admissions_at_crit", "DA0CSA" + str(year) + "R":"average_sat", "DA0CAA" + str(year) + "R":"average_act", "DPSTTOSA":"average_teacher_salary", "DPSTEXPA":"average_teacher_exp", "DPFRAALLT":"total_revenue", "DPFRAALLK":"total_revenue_per_pupil", "DPFRASTAP":"percent_revenue_from_state", "DZRVLOCP":"percent_revenue_from_local", "DPFRAFEDP":"percent_revenue_from_federal", "DPFEAINST":"instr_expenditures", "DPFEAINSK":"instr_expenditures_per_pupil", "DISTSIZE":"district_size", "COMMTYPE":"community_type", "PROPWLTH":"property_wealth", "TAXRATE":"tax_rate"})
    return df

In [4]:
test_df = get_formatted_dataframe('2016-2017.xls', 16)
train_df = get_formatted_dataframe('2015-2016.xls',15)
train_df = train_df.append(get_formatted_dataframe('2014-2015.xls',14))
train_df = train_df.append(get_formatted_dataframe('2013-2014.xls',13))
train_df = train_df.loc[(train_df['rating'] == "Met Standard") | (train_df['rating'] == "Met Alternative Standard") |(train_df['rating'] == "Improvement Required")]
test_df = test_df.loc[(test_df['rating'] == "Met Standard") | (test_df['rating'] == "Met Alternative Standard") |(test_df['rating'] == "Improvement Required")]
print('Test dataframe shape:', test_df.shape)
print('Train dataframe shape:', train_df.shape)

Test dataframe shape: (1190, 25)
Train dataframe shape: (3618, 25)


In [15]:
#Dealing with class imbalance
print("Before upsampling:")
print("Met Standard:", train_df.loc[train_df['rating'] == "Met Standard"].shape[0])
print("Met Alternative Standard:", train_df.loc[train_df['rating'] == "Met Alternative Standard"].shape[0])
print("Improvement Required:", train_df.loc[train_df['rating'] == "Improvement Required"].shape[0])
print()

from sklearn.utils import resample
train_df_majority = train_df[train_df['rating'] == 'Met Standard']
train_df_minority_alternative = train_df[train_df['rating'] == 'Met Alternative Standard']
train_df_minority_improvement = train_df[train_df['rating'] == 'Improvement Required']
train_df_minority_alternative = resample(train_df_minority_alternative, replace=True, n_samples = 200, random_state=123)
train_df_minority_improvement = resample(train_df_minority_improvement, replace=True, n_samples = 500, random_state=123)
train_df_upsampled = pd.concat([train_df_majority, train_df_minority_alternative, train_df_minority_improvement])
train_df = train_df_upsampled

print("After upsampling:")
print("Met Standard:", train_df.loc[train_df['rating'] == "Met Standard"].shape[0])
print("Met Alternative Standard:", train_df.loc[train_df['rating'] == "Met Alternative Standard"].shape[0])
print("Improvement Required:", train_df.loc[train_df['rating'] == "Improvement Required"].shape[0])
train_df

Before upsampling:
Met Standard: 3300
Met Alternative Standard: 200
Improvement Required: 500

After upsampling:
Met Standard: 3300
Met Alternative Standard: 200
Improvement Required: 500


Unnamed: 0.1,Unnamed: 0,rating,num_schools,num_students,attendance_rate,dropout_rate,grad_rate_4_year,grad_rate_5_year,annual_grad_count,college_admissions_per_tested,...,total_revenue_per_pupil,percent_revenue_from_state,percent_revenue_from_local,percent_revenue_from_federal,instr_expenditures,instr_expenditures_per_pupil,district_size,community_type,property_wealth,tax_rate
0,0,Met Standard,3,568,96,1.4,93.3,89.2,56,66.1,...,14856,41.2,40.5,18.4,4699444,8145,500 to 999,Rural,"$434,690 to < $497,197",Under $1.1461
1,1,Met Standard,4,1244,96.5,0,95.8,98.7,73,68.5,...,10165,58.6,36.6,4.9,6045264,4967,"1,000 to 1,599",Non-metropolitan Stable,"$216,001 to < $241,215",$1.2543 to under $1.3801
2,2,Met Standard,3,841,95.9,0.4,97.7,100,43,32.6,...,11132,47.5,47.3,5.3,3788907,4718,500 to 999,Rural,"$319,192 to < $340,879",$1.3801 and over
3,3,Met Standard,3,383,95.5,1.6,90.9,97.6,29,34.5,...,11421,53.1,41.7,5.3,2067502,5588,Under 500,Rural,"$305,220 to < $319,192",$1.2543 to under $1.3801
4,4,Met Standard,6,3385,95.8,0.7,87.6,97.8,175,41.7,...,11450,41.8,44.7,13.5,17594931,5249,"3,000 to 4,999",Independent Town,"$277,857 to < $305,220",$1.3801 and over
5,5,Met Standard,4,1609,96,0.4,91.8,96.3,88,64.8,...,9302,53.7,38.3,8,8053949,5049,"1,600 to 2,999",Non-metropolitan Stable,"$257,557 to < $277,857",$1.1461 to under $1.2543
6,6,Met Standard,2,414,96.9,0,96.4,100,32,25,...,10601,57.3,37.7,5,2325824,5979,Under 500,Rural,"$277,857 to < $305,220",$1.1461 to under $1.2543
7,7,Met Standard,6,3968,95.7,0.3,96,95.3,255,38,...,12235,6.8,87.9,5.2,21843210,5453,"3,000 to 4,999",Other Central City Suburban,"$801,423 to < $24,507,828",$1.1461 to under $1.2543
8,8,Met Standard,2,966,96,0,100,.,40,55,...,8967,88.9,5,6,3909267,4820,500 to 999,Charters,Non-taxing entities,Non-taxing entities
9,9,Met Standard,5,2778,96.1,0.8,96.8,97.8,189,55,...,9719,64.4,26.7,8.9,13049980,4634,"1,600 to 2,999",Non-metropolitan Stable,"$109,324 to < $157,157",$1.2543 to under $1.3801


In [6]:
test_df.to_csv('test_data.csv')
train_df.to_csv('train_data.csv')

In [7]:
TRAIN_DATASET='train_data.csv'
TEST_DATASET = 'test_data.csv'
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')
status, message = ds.delete_dataset(TRAIN_DATASET)
status, dataset = ds.upload_dataset( TRAIN_DATASET)
if not status:
    print(dataset)

In [8]:
#clean train_Set
target = "rating"
status, job_id = ds.clean_data(TRAIN_DATASET, target = target)
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'status': 'Requested', 'starttime': '2019-04-20T13:58:49.74703', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train_data.csv'], 'artifact_names': ['589297510d794b33a8d463ef620cd7ce'], 'model_name': None, 'job_error': None}
{'status': 'Complete', 'starttime': '2019-04-20T13:58:49.74703', 'endtime': '2019-04-20T13:58:53.089219', 'percent_complete': 100, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train_data.csv'], 'artifact_names': ['589297510d794b33a8d463ef620cd7ce'], 'model_name': None, 'job_error': ''}


In [9]:
model = target + "_model01" + ts
status, job_id = ds.create_model(dataset_names = TRAIN_DATASET, \
                                 model_name =  model, \
                                 max_train_time = '00:05')
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'status': 'Requested', 'starttime': '2019-04-20T13:59:05.987623', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train_data.csv'], 'artifact_names': None, 'model_name': 'rating_model0120190420135845', 'job_error': None}
{'status': 'Running', 'starttime': '2019-04-20T13:59:05.987623', 'endtime': None, 'percent_complete': 2, 'job_type': 'TrainModel', 'loss': 0.6029910445213318, 'generations': 3, 'dataset_names': ['train_data.csv'], 'artifact_names': None, 'model_name': 'rating_model0120190420135845', 'job_error': ''}
{'status': 'Running', 'starttime': '2019-04-20T13:59:05.987623', 'endtime': None, 'percent_complete': 2, 'job_type': 'TrainModel', 'loss': 0.6029910445213318, 'generations': 3, 'dataset_names': ['train_data.csv'], 'artifact_names': None, 'model_name': 'rating_model0120190420135845', 'job_error': ''}
{'status': 'Running', 'starttime': '2019-04-20T13:59:05.987623', 'endtime': None, 'percent_complete': 2, 'j

In [10]:
# Retrieve feature importance of built model
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])


{'status': 'Running', 'starttime': '2019-04-20T14:04:23.732893', 'endtime': None, 'percent_complete': 0, 'job_type': 'AnalyzeModel', 'loss': 0.22382672503590584, 'generations': 4, 'dataset_names': None, 'artifact_names': ['2a440bb7eaf64a59894249122241c833'], 'model_name': 'rating_model0120190420135845', 'job_error': ''}
{'status': 'Running', 'starttime': '2019-04-20T14:04:23.732893', 'endtime': None, 'percent_complete': 0, 'job_type': 'AnalyzeModel', 'loss': 0.22382672503590584, 'generations': 4, 'dataset_names': None, 'artifact_names': ['2a440bb7eaf64a59894249122241c833'], 'model_name': 'rating_model0120190420135845', 'job_error': ''}
{'status': 'Running', 'starttime': '2019-04-20T14:04:23.732893', 'endtime': None, 'percent_complete': 0, 'job_type': 'AnalyzeModel', 'loss': 0.22382672503590584, 'generations': 4, 'dataset_names': None, 'artifact_names': ['2a440bb7eaf64a59894249122241c833'], 'model_name': 'rating_model0120190420135845', 'job_error': ''}
{'status': 'Running', 'starttime':

In [11]:
feature_importance[:10]

'404: NOT F'

In [12]:
status, artifact = ds.run_model(TRAIN_DATASET, model)
sleep(1)
ds.wait_for_job(artifact['job_name'])

{'status': 'Running', 'starttime': '2019-04-20T14:06:26.830256', 'endtime': None, 'percent_complete': 0, 'job_type': 'RunModel', 'loss': 0.22382672503590584, 'generations': 4, 'dataset_names': ['train_data.csv'], 'artifact_names': ['314e8ea90f184a838abcb54a6dd037f2'], 'model_name': 'rating_model0120190420135845', 'job_error': ''}
{'status': 'Failed', 'starttime': '2019-04-20T14:06:26.830256', 'endtime': '2019-04-20T14:06:42.557429', 'percent_complete': 100, 'job_type': 'RunModel', 'loss': 0.22382672503590584, 'generations': 4, 'dataset_names': ['train_data.csv'], 'artifact_names': ['314e8ea90f184a838abcb54a6dd037f2'], 'model_name': 'rating_model0120190420135845', 'job_error': 'FailedDownload: Failed to download entity of type Object from location models/f9f9647a-4f34-11e9-ba76-b77429c9fe8d_rating_model0120190420135845/skl_model/amb-checkpoint-best-genome'}


(False,
 {'artifact_names': ['314e8ea90f184a838abcb54a6dd037f2'],
  'dataset_names': ['train_data.csv'],
  'endtime': '2019-04-20T14:06:42.557429',
  'generations': 4,
  'job_error': 'FailedDownload: Failed to download entity of type Object from location models/f9f9647a-4f34-11e9-ba76-b77429c9fe8d_rating_model0120190420135845/skl_model/amb-checkpoint-best-genome',
  'job_type': 'RunModel',
  'loss': 0.22382672503590584,
  'model_name': 'rating_model0120190420135845',
  'percent_complete': 100,
  'starttime': '2019-04-20T14:06:26.830256',
  'status': 'Failed'})

In [13]:
status, prediction = ds.download_artifact(artifact['artifact_name'])
prediction.head()

AttributeError: 'str' object has no attribute 'head'

In [None]:
unq = prediction[target].unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i,q in enumerate(unq):
    p += i*(prediction[target] == q).values
    a += i*(test_df[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks([i for i in range(len(unq))],[q for q in unq]);
print(classification_report(test_df[target], prediction[target]))

In [None]:
#TEST DATASET
status, message = ds.delete_dataset(TEST_DATASET)
status, dataset = ds.upload_dataset( TEST_DATASET)
if not status:
    print(dataset)

In [None]:
status, job_id = ds.clean_data(TEST_DATASET, target = target, model_name = model)

if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

In [None]:
status, artifact = ds.run_model(TEST_DATASET, model)
sleep(1)
ds.wait_for_job(artifact['job_name'])

In [None]:
# Create plots comparing predictions with actual target
status, prediction = ds.download_artifact(artifact['artifact_name'])
df = pd.read_csv(TEST_DATASET)
unq = prediction[target].unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i,q in enumerate(unq):
    p += i*(prediction[target] == q).values
    a += i*(df[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks([i for i in range(len(unq))],[q for q in unq]);
print(classification_report(df[target], prediction[target]))