In [1]:
# 
# Copyright (c) 2022 Go2Market Insights d/b/a Analyzr
# All rights reserved
# https://analyzr.ai
# 
# The above copyright notice and this permission notice shall be included in all copies or substantial portions
# of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# 
# For Python SDK reference go to https://analyzr-sdk-python.readthedocs.io/
# For support go to https://support.analyzr.ai
# 
import pandas as pd 
import datetime

In [2]:
# 
# START ANALYZR CLIENT
# 
# Access to the Analyzr API requires credentials. Contact your account manager or contact 
# our support team at https://support.analyzr.ai for more info. If you are a free tier user 
# you can skip this step altogether and use your local compute resources. 
# 
# For installation instructions on the Analyzr client see https://github.com/analyzr-ai/analyzr-sdk-python
# 
from analyzrclient import Analyzer

# API host this notebook connects to.
ANALYZR_HOST = 'analyzr3.api.g2m.ai'

# Build the client, authenticate, then leave the version call as the cell's
# last expression so the notebook displays what it returns.
analyzer = Analyzer(host=ANALYZR_HOST)
analyzer.login()
analyzer.version()

Login successful


{'api': {'status': 200, 'version': 'v1.7.63', 'tenant': 'Analyzr3'},
 'client': {'version': '1.3.17'},
 'copyright': '2023 (c) Go2Market Insights Inc. All rights reserved. Patent pending. '}

In [5]:
# 
# LOAD DATA
# 
# Single source of truth for the dataset location and the parsing options, so the
# chunked reader and the full frame are guaranteed to read the same file the same way.
DATA_URL = 'https://g2mstaticfiles.blob.core.windows.net/$web/titanic.csv'
CHUNK_SIZE = 100  # rows per chunk yielded by `data`
READ_OPTIONS = dict(encoding="ISO-8859-1", low_memory=False)

# `data` is a chunked reader: it can be iterated only once, so re-run this cell
# before looping over it again. `raw_data` is the complete DataFrame.
data = pd.read_csv(DATA_URL, chunksize=CHUNK_SIZE, **READ_OPTIONS)
raw_data = pd.read_csv(DATA_URL, **READ_OPTIONS)

In [6]:
# 
# TAKE A SUBSET OF THE ORIGINAL DATASET
# 
# This step selects a subset of the original dataset for training purposes. 
# It also assigns a record identifier field (ID_FIELD) for audit and reconciliation 
# purposes. 
# 
# Record identifier column; it is listed last among the selected fields.
ID_FIELD = 'PassengerId'
SELECTED_FIELDS = [
    'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
    ID_FIELD,
]

# Keep the selected columns, drop rows with any missing value, and store the
# identifier as a pandas string column.
df = (
    raw_data
    .loc[:, SELECTED_FIELDS]
    .dropna()
    .assign(**{ID_FIELD: lambda frame: frame[ID_FIELD].astype('string')})
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PassengerId
0,0,3,male,22.0,1,0,7.25,S,1
1,1,1,female,38.0,1,0,71.2833,C,2
2,1,3,female,26.0,0,0,7.925,S,3
3,1,1,female,35.0,1,0,53.1,S,4
4,0,3,male,35.0,0,0,8.05,S,5


In [7]:
# 
# ASSIGN VARIABLE TYPES
# 
# This step assigns variable types: categorical or numerical. 
# 
# Fields handed to the clustering call as categorical variables.
CATEGORICAL_VARS = [
    'Sex',
    'Embarked',
]
# Fields handed to the clustering call as numerical variables.
NUMERICAL_VARS = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [9]:
#
# TRAIN MODEL
#
# This step performs model training and cross-validation, submitting the dataset to the
# API one chunk at a time. If you do not have API access, e.g. free tier users, you will
# need to replace the training command with your own model.fit() statement for model
# training and cross-validation using local compute resources.
#
# Note also this version keeps polling the API while the model is training. For longer training runs
# you may want to disable polling and manually check the status of your model (see next cell).
#
# NOTE(review): the recorded run of this cell failed with
# "TypeError: run() got an unexpected keyword argument 'out_of_core'" on client 1.3.17.
# Confirm which SDK release accepts `out_of_core` before relying on this cell.
#

# Run settings: these do not change between chunks, so they are set once up front.
client_id = 'test'
algorithm = 'minibatch-kmeans'
N = 5
verbose = True

resID = ''
res = None  # reset so a stale response from an earlier run is never reported
t_0 = datetime.datetime.now()
for chunk in data:
    chunk = chunk[SELECTED_FIELDS].dropna()
    # Cast the identifier to string, the same dtype used for the `df` subset.
    chunk[ID_FIELD] = chunk[ID_FIELD].astype('string')
    res = analyzer.cluster.run(
        chunk, client_id=client_id,
        idx_var=ID_FIELD, categorical_vars=CATEGORICAL_VARS, numerical_vars=NUMERICAL_VARS,
        algorithm=algorithm, n_components=N,
        buffer_batch_size=10, verbose=verbose, poll=True, compressed=True, staging=True, out_of_core=True
    )
    resID = res['request_id']

if res is None:
    # `data` is a one-shot chunk reader: once consumed it yields nothing.
    raise RuntimeError(
        'No chunks were processed: `data` is empty or already consumed. '
        'Re-run the LOAD DATA cell and try again.'
    )
print('model_id: {}'.format(resID))
print('total time: {}'.format(datetime.datetime.now()-t_0))

TypeError: run() got an unexpected keyword argument 'out_of_core'

In [8]:
# Dump the response of the most recent `analyzer.cluster.run` call (`res` is assigned in the training cell).
# NOTE(review): the recorded output below lists flight columns (CRS_DEP_TIME, TAXI_OUT, ...) rather than
# the Titanic fields selected above, so it appears to come from an earlier run; re-execute to refresh it.
print(res)

{'data':      CRS_DEP_TIME  TAXI_OUT  WHEELS_OFF  WHEELS_ON  TAXI_IN  CRS_ARR_TIME  \
0              10       9.0      1155.0     1302.0     12.0            12   
1               7      11.0       732.0      942.0      9.0             9   
2              10      17.0      1036.0     1435.0      5.0            14   
3               9      11.0       909.0     1001.0     15.0            10   
4              14      14.0      1434.0     1603.0      8.0            16   
..            ...       ...         ...        ...      ...           ...   
840            22       7.0      2251.0     2312.0      8.0            23   
841            16       9.0      1659.0     1850.0      8.0            19   
842            17      14.0      1725.0     2031.0     10.0            20   
843            12      10.0      1240.0     1500.0     15.0            15   
844             9      60.0       955.0     1024.0      9.0            10   

     DISTANCE ORIGIN_STATE_ABR DEST_STATE_ABR UNIQUE_CARRIER      