In [1]:
# 
# Copyright (c) 2022 Go2Market Insights d/b/a Analyzr
# All rights reserved
# https://analyzr.ai
# 
# The above copyright notice and this permission notice shall be included in all copies or substantial portions
# of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# 
# For Python SDK reference go to https://analyzr-sdk-python.readthedocs.io/
# For support go to https://support.analyzr.ai
# 
import pandas as pd 
import datetime
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
# 
# START ANALYZR CLIENT
# 
# Access to the Analyzr API requires credentials. Contact your account manager or contact 
# our support team at https://support.analyzr.ai for more info. If you are a free tier user 
# you can skip this step altogether and use your local compute resources. 
# 
# For installation instructions on the Analyzr client see https://github.com/analyzr-ai/analyzr-sdk-python
# 
# Imported in this cell (not with the other imports) so that users who skip this step,
# as the note above allows, do not need the analyzrclient package.
from analyzrclient import Analyzer
# Create the client for your API host; replace the placeholder value before running.
analyzer = Analyzer(host='insert_your_host')
# Log in with your credentials (the saved output below shows "Login successful").
analyzer.login()
# Last expression of the cell: displays the API and client version information.
analyzer.version()

Login successful


{'api': {'status': 200, 'version': 'v1.8.12', 'tenant': 'Analyzr3'},
 'client': {'version': '1.3.21'},
 'copyright': '2023 (c) Go2Market Insights Inc. All rights reserved. Patent pending. '}

In [3]:
# 
# LOAD DATA
# 
# Read the full Titanic CSV from blob storage into one DataFrame, then preview the
# first rows as the cell output.
raw_data = pd.read_csv(
    'https://g2mstaticfiles.blob.core.windows.net/$web/titanic.csv',
    encoding="ISO-8859-1",
    low_memory=False,
)
raw_data.head()

In [5]:
# 
# TAKE A SUBSET OF THE ORIGINAL DATASET
# 
# This step selects a subset of the original dataset for training purposes. 
# It also assigns a record identifier field (ID_FIELD) for audit and reconciliation 
# purposes. 
# 
# Record identifier column, carried alongside the model inputs for audit and reconciliation.
ID_FIELD = 'PassengerId'
SELECTED_FIELDS = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    ID_FIELD,
]
# Keep only the selected columns, discard rows with any missing value, and store the
# identifier as a pandas string column. The chain builds a new frame from raw_data.
df = (
    raw_data[SELECTED_FIELDS]
    .dropna()
    .astype({ID_FIELD: 'string'})
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PassengerId
0,0,3,male,22.0,1,0,7.25,S,1
1,1,1,female,38.0,1,0,71.2833,C,2
2,1,3,female,26.0,0,0,7.925,S,3
3,1,1,female,35.0,1,0,53.1,S,4
4,0,3,male,35.0,0,0,8.05,S,5


In [6]:
# 
# ASSIGN VARIABLE TYPES
# 
# This step assigns variable types: categorical or numerical. 
# 
# Columns treated as categories vs. numbers; both lists are passed to the training call.
CATEGORICAL_VARS = [
    'Sex',
    'Embarked',
]
NUMERICAL_VARS = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [7]:
#
# TRAIN MODEL
#
# This step performs model training and cross-validation. If you do not have API access, e.g. free 
# tier users, you will need to replace the training command with your own model.fit() statement 
# for model training and cross-validation using local compute resources.  
# 
# Note also this version keeps polling the API while the model is training. For longer training runs 
# you may want to disable polling and manually check the status of your model (see next cell). 
# 
# Stream the CSV in 250-row chunks so each API call only carries part of the dataset.
data = pd.read_csv('https://g2mstaticfiles.blob.core.windows.net/$web/titanic.csv', encoding = "ISO-8859-1", low_memory=False, chunksize=250)
t_0 = datetime.datetime.now()
previous_id = None  # request ID returned by the first call; reused for every later chunk
client_id = 'insert_your_client_id'
algorithm = 'minibatch-kmeans'
N = 3  # number of clusters requested (passed as n_components)
verbose = True
# Reset explicitly so a `res` left over from an earlier run of this cell cannot be
# reported as this run's model if no chunk ends up being submitted.
res = None
for chunk in data:
  # Prepare each chunk the same way the `df` subset is prepared: selected columns only,
  # incomplete rows dropped, record IDs stored as strings.
  chunk = chunk[SELECTED_FIELDS].dropna().astype({ID_FIELD: 'string'})
  if chunk.empty:
    # Every row of this chunk had a missing value; there is nothing to submit.
    continue

  is_first_call = previous_id is None
  # One call covers both cases: request_id is None on the first call and the saved ID
  # afterwards. Buffer batch size stays 200 for the first call and 50 for later ones,
  # as before. NOTE(review): the reason for the two different sizes is not documented.
  res = analyzer.cluster.run(
      chunk, client_id=client_id, request_id=previous_id,
      idx_var=ID_FIELD, categorical_vars=CATEGORICAL_VARS, numerical_vars=NUMERICAL_VARS,
      algorithm=algorithm, n_components=N,
      buffer_batch_size=200 if is_first_call else 50,
      verbose=verbose, poll=True, compressed=True, staging=True, out_of_core=True
  )
  if is_first_call:
    previous_id = res['request_id']

if res is None:
  print('no data submitted: every chunk was empty after dropping incomplete rows')
else:
  print('model_id: {}'.format(res['request_id']))
print('total time: {}'.format(datetime.datetime.now()-t_0))

Request ID: 24994912-3405-4287-9312-d5c0b4949057
Loading encoding keys...
ERROR! Keys not found for model_id: 24994912-3405-4287-9312-d5c0b4949057
Encoding categorical variables:
	Sex
	Embarked
Encoding numerical variables:
	Pclass
	Age
	SibSp
	Parch
	Fare
Encoding record IDs...
Encoding field names...
Saving encoding keys locally...
Saving data to buffer...
        Processed batch 2 of 2
Clustering data in buffer...
[_poll][0] {'status': 200, 'response': {'request_id': '24994912-3405-4287-9312-d5c0b4949057', 'status': 'Complete', 'details': 'N/A'}}
Decoding field names...
Decoding categorical variables:
Decoding numerical variables:
Decoding record IDs...
Clearing buffer...
Request ID: 24994912-3405-4287-9312-d5c0b4949057
Loading encoding keys...
Encoding categorical variables:
	Sex
	Embarked
Encoding numerical variables:
	Pclass
	Age
	SibSp
	Parch
	Fare
Encoding record IDs...
Encoding field names...
Saving encoding keys locally...
Saving data to buffer...
        Processed batch 4 of

KeyError: 'f5b13cd8-3022-49ec-a7bc-41408cf37741'

In [None]:
# Display `res`, which the training cell assigns from analyzer.cluster.run().
res