In [4]:
# 
# Copyright (c) 2022 Go2Market Insights d/b/a Analyzr
# All rights reserved
# https://analyzr.ai
# 
# The above copyright notice and this permission notice shall be included in all copies or substantial portions
# of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# 
# For Python SDK reference go to https://analyzr-sdk-python.readthedocs.io/
# For support go to https://support.analyzr.ai
# 
import pandas as pd 
import datetime

In [5]:
#
# LOAD DATA
#
# Reads the public flight/weather CSV directly from its URL into a DataFrame.
# The file is decoded as ISO-8859-1; low_memory=False tells pandas not to parse the
# file in internal chunks, which avoids mixed-type inference within a column.
#
data = pd.read_csv(
    'https://g2mstaticfiles.blob.core.windows.net/$web/public_datasets/flight_weather.csv',
    encoding='ISO-8859-1',
    low_memory=False,
)

In [7]:
#
# TAKE A SUBSET OF THE ORIGINAL DATASET
#
# Keeps only the fields used for training, plus a record identifier field (ID_FIELD)
# that is carried along for audit and reconciliation purposes.
#
ID_FIELD = 'X.1'
SELECTED_FIELDS = [
    'CRS_DEP_TIME',
    'TAXI_OUT',
    'WHEELS_OFF',
    'WHEELS_ON',
    'TAXI_IN',
    'CRS_ARR_TIME',
    'DISTANCE',
    # Fields currently left out of the subset:
    # 'ORIGIN_STATE_ABR',
    # 'DEST_STATE_ABR',
    # 'UNIQUE_CARRIER',
    ID_FIELD,
]
# Select the fields, drop every row with a missing value in any of them, and store
# the record identifier with the pandas 'string' dtype.
df = (
    data[SELECTED_FIELDS]
    .dropna()
    .astype({ID_FIELD: 'string'})
)

In [8]:
# Show every row of the subset except the last 10 (pandas truncates the display).
# NOTE(review): if a short 10-row preview was intended, use df.head(10) instead.
df.iloc[:-10]

Unnamed: 0,CRS_DEP_TIME,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,DISTANCE,X.1
0,9,18.0,922.0,1131.0,20.0,12,2475,1
1,9,21.0,919.0,1212.0,15.0,12,2475,2
2,9,18.0,943.0,1741.0,13.0,18,2475,3
4,12,14.0,1205.0,1436.0,7.0,15,2475,5
5,12,23.0,1217.0,1455.0,13.0,15,2475,6
...,...,...,...,...,...,...,...,...
1900860,9,13.0,944.0,1021.0,9.0,10,216,1900861
1900861,8,11.0,844.0,930.0,7.0,9,236,1900862
1900862,19,17.0,2012.0,2130.0,6.0,21,413,1900863
1900863,14,10.0,1405.0,1428.0,12.0,14,100,1900864


In [9]:
#
# ASSIGN VARIABLE TYPES
#
# Declares which fields are categorical and which are numerical.
# No field is declared categorical; the seven fields below are all numerical.
#
CATEGORICAL_VARS = []
NUMERICAL_VARS = [
    'CRS_DEP_TIME',
    'TAXI_OUT',
    'WHEELS_OFF',
    'WHEELS_ON',
    'TAXI_IN',
    'CRS_ARR_TIME',
    'DISTANCE',
]

In [6]:
#
# START ANALYZR CLIENT
#
# Access to the Analyzr API requires credentials. Contact your account manager or contact
# our support team at https://support.analyzr.ai for more info. If you are a free tier user
# you can skip this step altogether and use your local compute resources.
#
# For installation instructions on the Analyzr client see https://github.com/analyzr-ai/analyzr-sdk-python
#
# The SDK import is kept in this cell so that skipping the cell also skips the
# dependency on the analyzrclient package.
#
from analyzrclient import Analyzer

# API host the client connects to; change it here to target a different host.
ANALYZR_HOST = 'analyzr3.api.g2m.ai'

analyzer = Analyzer(host=ANALYZR_HOST)
analyzer.login()
# Last expression of the cell: displays the API and client version information.
analyzer.version()

Login successful


{'api': {'status': 200, 'version': 'v1.7.57', 'tenant': 'Analyzr3'},
 'client': {'version': '1.3.17'},
 'copyright': '2023 (c) Go2Market Insights Inc. All rights reserved. Patent pending. '}

In [7]:
# Down-sample to at most SAMPLE_SIZE records before training.
# A fixed random_state makes the draw repeatable from one run to the next, and capping
# n at len(df) avoids the ValueError pandas raises when more rows are requested than
# exist (sampling here is without replacement).
SAMPLE_SIZE = 850000
SAMPLE_SEED = 42
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [13]:
#
# TRAIN MODEL
#
# This step performs model training and cross-validation. If you do not have API access, e.g. free
# tier users, you will need to replace the training command with your own model.fit() statement
# for model training and cross-validation using local compute resources.
#
# Note also this version keeps polling the API while the model is training. For longer training runs
# you may want to disable polling and manually check the status of your model (see next cell).
#
t_0 = datetime.datetime.now()  # start time, used for the elapsed-time report below
client_id = 'test'
algorithm = 'pca-kmeans-simple'
N = 5           # passed to the API as n_components
verbose = True  # forwarded to the call below, so changing it here takes effect
res = analyzer.cluster.run(
    df, client_id=client_id,
    idx_var=ID_FIELD, categorical_vars=CATEGORICAL_VARS, numerical_vars=NUMERICAL_VARS,
    algorithm=algorithm, n_components=N,
    buffer_batch_size=50000, verbose=verbose, poll=True, compressed=True, staging=True,
)
# The request ID returned by the API is reported as the model ID.
print('model_id: {}'.format(res['request_id']))
print('total time: {}'.format(datetime.datetime.now()-t_0))

Request ID: 3900ffd0-79c5-4469-8b57-057ed9617013
Encoding categorical variables:
	ORIGIN_STATE_ABR
	DEST_STATE_ABR
	UNIQUE_CARRIER
Encoding numerical variables:
	CRS_DEP_TIME
	TAXI_OUT
	WHEELS_OFF
	WHEELS_ON
	TAXI_IN
	CRS_ARR_TIME
	DISTANCE
Encoding record IDs...
Encoding field names...
Saving encoding keys locally...
Saving data to buffer...
        Processed batch 17 of 17
Clustering data in buffer...
[_poll][598] {'status': 200, 'response': {'request_id': '3900ffd0-79c5-4469-8b57-057ed9617013', 'status': 'Pending', 'details': 'Clustering training in progress...'}}
Clearing buffer...


UnboundLocalError: local variable 'df2' referenced before assignment

In [None]:
# Check the status of the request started by the training cell, using the request ID
# and client ID of that run instead of hard-coded literals.
# If `res` is not available (e.g. the training cell did not return), replace
# res['request_id'] with the "Request ID" value printed by the training cell.
# NOTE(review): the third positional argument (True) is kept from the original call;
# presumably a verbose flag -- confirm against the SDK reference.
res2 = analyzer.cluster.check_status(res['request_id'], client_id, True)
print(res2)

{'request_id': '0eaa2009-cea7-4f39-aa5f-0cc9bcc08482', 'status': 'Pending'}


In [20]:
#
# SHOW DATA KEYED BY CLUSTER
#
# Displays the 'data' entry of the clustering result `res`.
# In the recorded output this is a table of records with an added PC_ID column
# (presumably the assigned cluster -- confirm against the SDK reference).
# NOTE(review): the recorded output contains a UNIQUE_CARRIER column, which is not
# among the currently selected fields; re-run the notebook to refresh it.
res['data']

Unnamed: 0,CRS_DEP_TIME,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,DISTANCE,UNIQUE_CARRIER,X.1,PC_ID
0,9,18.0,922.0,1131.0,20.0,12,2475,AA,1,2
1,9,21.0,919.0,1212.0,15.0,12,2475,AA,2,2
2,9,18.0,943.0,1741.0,13.0,18,2475,AA,3,2
3,12,14.0,1205.0,1436.0,7.0,15,2475,AA,5,2
4,12,23.0,1217.0,1455.0,13.0,15,2475,AA,6,2
...,...,...,...,...,...,...,...,...,...,...
1846677,22,7.0,2251.0,2312.0,8.0,23,110,YV,1900871,4
1846678,16,9.0,1659.0,1850.0,8.0,19,261,YV,1900872,3
1846679,17,14.0,1725.0,2031.0,10.0,20,1095,YV,1900873,4
1846680,12,10.0,1240.0,1500.0,15.0,15,509,YV,1900874,3


In [20]:
#
# SHOW STATS BY CLUSTER
#
# Displays the 'stats' entry of the clustering result `res`.
# In the recorded output this is a table with one column per PC_ID and rows for
# count, frequency and one value per variable (presumably per-cluster averages --
# confirm against the SDK reference).
res['stats']

PC_ID,0,1,2,3,4
count,156737.000000,211395.000000,193958.000000,169985.000000,17925.000000
frequency,0.208983,0.281860,0.258611,0.226647,0.023900
CRS_DEP_TIME,15.769231,17.012720,9.127069,9.905345,12.949902
TAXI_OUT,16.796819,13.642513,13.425793,19.163850,11.768257
WHEELS_OFF,1645.428112,1767.660006,959.989395,1037.671889,1333.002622
...,...,...,...,...,...
UNIQUE_CARRIER_US,0.018107,0.085901,0.018133,0.167727,0.000000
UNIQUE_CARRIER_VX,0.000000,0.008931,0.003114,0.004142,0.000000
UNIQUE_CARRIER_WN,0.021125,0.319000,0.338692,0.035009,0.000000
UNIQUE_CARRIER_XE,0.078871,0.002096,0.035626,0.011342,0.000000


In [21]:
#
# SHOW DISTANCES BETWEEN CLUSTER CENTROIDS
#
# Displays the 'distances' entry of the clustering result `res`.
# In the recorded output this is a symmetric 5 x 5 table indexed by cluster on both
# axes, with zeros on the diagonal.
res['distances']

Unnamed: 0,0,1,2,3,4
0,0.0,388.489428,958.264824,971.623401,508.738575
1,388.489428,0.0,1185.546437,1050.0892,667.195777
2,958.264824,1185.546437,0.0,439.609193,523.162246
3,971.623401,1050.0892,439.609193,0.0,467.490646
4,508.738575,667.195777,523.162246,467.490646,0.0
