In [28]:
# 
# Copyright (c) 2022 Go2Market Insights d/b/a Analyzr
# All rights reserved
# https://analyzr.ai
# 
# The above copyright notice and this permission notice shall be included in all copies or substantial portions
# of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# 
# For Python SDK reference go to https://analyzr-sdk-python.readthedocs.io/
# For support go to https://support.analyzr.ai
# 
import pandas as pd 
import datetime
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [29]:
# 
# START ANALYZR CLIENT
# 
# Access to the Analyzr API requires credentials. Contact your account manager or contact 
# our support team at https://support.analyzr.ai for more info. If you are a free tier user 
# you can skip this step altogether and use your local compute resources. 
# 
# For installation instructions on the Analyzr client see https://github.com/analyzr-ai/analyzr-sdk-python
# 
from analyzrclient import Analyzer
# API endpoint for this tenant, kept in one named place so it is easy to change.
ANALYZR_HOST = 'analyzr3.api.g2m.ai'

# Build the client, authenticate, then display the API/client version info
# as the cell output.
analyzer = Analyzer(host=ANALYZR_HOST)
analyzer.login()
analyzer.version()

Login successful


{'api': {'status': 200, 'version': 'v1.7.76', 'tenant': 'Analyzr3'},
 'client': {'version': '1.2.61'},
 'copyright': '2023 (c) Go2Market Insights Inc. All rights reserved. Patent pending. '}

In [30]:
# 
# LOAD DATA
# 
# Single definition of the dataset location and the parsing options shared by
# both reads below.
TITANIC_CSV_URL = 'https://g2mstaticfiles.blob.core.windows.net/$web/titanic.csv'
CSV_READ_OPTIONS = {'encoding': "ISO-8859-1", 'low_memory': False}

# Chunked reader over the dataset (200 rows per chunk).
# NOTE(review): `data` is re-created with chunksize=250 at the top of the
# clustering cell before it is iterated, so this reader looks unused — confirm
# and remove if nothing else consumes it.
data = pd.read_csv(TITANIC_CSV_URL, chunksize=200, **CSV_READ_OPTIONS)

# Full dataset loaded into memory in one piece.
raw_data = pd.read_csv(TITANIC_CSV_URL, **CSV_READ_OPTIONS)

In [31]:
# 
# TAKE A SUBSET OF THE ORIGINAL DATASET
# 
# This step selects a subset of the original dataset for training purposes. 
# It also assigns a record identifier field (ID_FIELD) for audit and reconciliation 
# purposes. 
# 
# Record identifier column, kept alongside the features for audit and
# reconciliation.
ID_FIELD = 'PassengerId'

# Columns retained from the raw dataset; the identifier comes last.
SELECTED_FIELDS = [
    'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
    ID_FIELD,
]
# Keep only the selected columns, drop rows with any missing value, and store
# the record identifier as a pandas string column.
df = (
    raw_data[SELECTED_FIELDS]
    .dropna()
    .astype({ID_FIELD: 'string'})
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PassengerId
0,0,3,male,22.0,1,0,7.25,S,1
1,1,1,female,38.0,1,0,71.2833,C,2
2,1,3,female,26.0,0,0,7.925,S,3
3,1,1,female,35.0,1,0,53.1,S,4
4,0,3,male,35.0,0,0,8.05,S,5


In [32]:
# 
# ASSIGN VARIABLE TYPES
# 
# This step assigns variable types: categorical or numerical. 
# 
# Fields treated as categorical.
CATEGORICAL_VARS = [
    'Sex',
    'Embarked',
]

# Fields treated as numerical.
NUMERICAL_VARS = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [33]:
def pre_process_common(data=None, idx_field=None, categorical_fields=None, time_field=None, saturation_fields=None, carryover_fields=None, scale=False, sort_records=False):
    """
    Turn raw records into a purely numeric feature table.

    Steps, in order: drop rows containing missing values, coerce every column
    that is neither the index field nor categorical to a numeric dtype (rows
    whose values cannot be parsed are dropped), optionally standardize those
    numeric columns, one-hot encode the categorical columns, and optionally
    sort the records.

    The input is never modified; all work is done on a private copy.

    :param data: anything accepted by ``pd.DataFrame`` (DataFrame, list of dicts, ...)
    :param idx_field: name of the record identifier column; left untouched (not coerced, not scaled)
    :param categorical_fields: names of the columns to one-hot encode; ``None`` means no categorical columns
    :param time_field: name of the column to sort by when ``sort_records`` is True
    :param saturation_fields: accepted for interface compatibility; currently unused
    :param carryover_fields: accepted for interface compatibility; currently unused
    :param scale: when exactly ``True``, standardize the numeric columns with ``StandardScaler``
    :param sort_records: when exactly ``True``, sort ascending by ``time_field`` if given, otherwise by ``idx_field``
    :return df: processed DataFrame; an empty DataFrame when no usable rows remain
    """
    # None defaults replace the previous mutable list defaults; an omitted
    # argument still behaves like an empty list.
    categorical_fields = [] if categorical_fields is None else list(categorical_fields)

    # Columns that must keep their original values/dtype
    protected_cols = [idx_field, *categorical_fields]

    # Work on a private copy: pd.DataFrame(frame) may share storage with the
    # caller's frame, and the column assignments below must not leak back.
    dataset = pd.DataFrame(data).copy()

    # Drop records with missing values
    dataset = dataset.dropna()

    # Force a numeric type for non-categorical fields; unparseable values
    # become NaN and are dropped right after.
    # NOTE(review): time_field is not excluded here, so non-numeric timestamps
    # would be coerced to NaN and their rows dropped — confirm this is intended.
    numeric_cols = [col for col in dataset.columns if col not in protected_cols]
    for col in numeric_cols:
        dataset[col] = pd.to_numeric(dataset[col], errors='coerce')
    dataset = dataset.dropna()

    # Nothing usable left: return an empty frame. Returning here also avoids
    # the KeyError that sorting a column-less frame would raise.
    if dataset.empty:
        return pd.DataFrame()

    # Scale non-categorical fields if requested (skipped when there are none,
    # since the scaler cannot be fitted on zero features)
    if scale is True and numeric_cols:
        dataset[numeric_cols] = StandardScaler().fit_transform(dataset[numeric_cols])

    # Generate dummy variables for categorical fields
    df = pd.get_dummies(dataset, columns=categorical_fields)

    # Sort records: the time field takes precedence over the index field
    if sort_records is True:
        sort_key = time_field if time_field is not None else idx_field
        if sort_key is not None:
            df = df.sort_values([sort_key], ascending=True)

    # Saturation and carryover fields are not processed yet.
    return df

In [38]:
# Stream the dataset in chunks of 250 rows so each clustering request stays small.
data = pd.read_csv('https://g2mstaticfiles.blob.core.windows.net/$web/titanic.csv', encoding = "ISO-8859-1", low_memory=False, chunksize=250)
t_0 = datetime.datetime.now()

#
# TRAIN MODEL
#
# This step performs model training and cross-validation. If you do not have API access, e.g. free
# tier users, you will need to replace the training command with your own model.fit() statement
# for model training and cross-validation using local compute resources.
#
# Note also this version keeps polling the API while the model is training. For longer training runs
# you may want to disable polling and manually check the status of your model.
#
client_id = 'test'
algorithm = 'minibatch-kmeans'
N = 5
verbose = True

previous_res = None   # request_id of the previous chunk's run; None before the first request
cluster_parts = []    # per-chunk cluster labels, in processing order
chunk_scores = []     # per-chunk silhouette scores, in processing order

for chunk in data:
    chunk = chunk[SELECTED_FIELDS].dropna()
    if chunk.empty:
        # No complete records in this chunk: nothing to cluster
        continue
    chunk[ID_FIELD] = chunk[ID_FIELD].astype('float')

    # The first request uses a buffer batch size of 200, later requests use 50;
    # every other argument is the same.
    # NOTE(review): the reason for the two sizes is not visible here — confirm.
    buffer_batch_size = 200 if previous_res is None else 50
    res = analyzer.cluster.run(
        chunk, client_id=client_id,
        idx_var=ID_FIELD, categorical_vars=CATEGORICAL_VARS, numerical_vars=NUMERICAL_VARS,
        algorithm=algorithm, n_components=N,
        buffer_batch_size=buffer_batch_size, verbose=verbose, poll=True, compressed=True, staging=True, out_of_core=True
    )
    previous_res = res['request_id']

    # Cluster assignment of every record in this chunk
    # NOTE(review): assumes res['data'] keeps the same row order as `chunk`;
    # the silhouette scores are only meaningful if that holds — confirm.
    clusters = res['data']['PC_ID']
    cluster_parts.append(clusters)

    # Silhouette score of this chunk on its own pre-processed features
    chunk_X = pre_process_common(data=chunk, idx_field=ID_FIELD, categorical_fields=CATEGORICAL_VARS)
    chunk_X = chunk_X.drop(columns=ID_FIELD)
    chunk_scores.append(silhouette_score(chunk_X, clusters))

# Assemble the results once after the loop instead of growing frames inside it
# (DataFrame.append no longer exists in pandas 2.x, and concat-in-a-loop is quadratic).
# Seeding with an empty frame keeps `total_clusters` a one-column DataFrame as before.
total_clusters = pd.concat([pd.DataFrame(), *cluster_parts], axis=0)
# One row per chunk, indexed by chunk order
scores = pd.DataFrame(chunk_scores)

scaled_X = pre_process_common(data=df, idx_field=ID_FIELD, categorical_fields=CATEGORICAL_VARS)
scaled_X = scaled_X.drop(columns=ID_FIELD)
print(total_clusters)
# silhouette_score expects 1-D labels; flatten the single-column frame explicitly
# rather than relying on an implicit conversion that emits a warning.
score = silhouette_score(scaled_X, total_clusters.to_numpy().ravel())
print('model_id: {}'.format(res['request_id']))
print('total time: {}'.format(datetime.datetime.now()-t_0))

Request ID: ebde551d-b2be-43d0-bd55-38aa38dafeff
Encoding categorical variables:
	Sex
	Embarked
Encoding numerical variables:
	Pclass
	Age
	SibSp
	Parch
	Fare
Encoding record IDs...
Encoding field names...
Saving encoding keys locally...
Saving data to buffer...
        Processed batch 2 of 2
Clustering data in buffer...
[_poll][0] {'status': 200, 'response': {'request_id': 'ebde551d-b2be-43d0-bd55-38aa38dafeff', 'status': 'Complete', 'details': 'N/A'}}
Decoding field names...
Decoding categorical variables:
Decoding numerical variables:
Decoding record IDs...
Clearing buffer...
Request ID: b65b12fb-c3db-461b-a33b-4c634b7ab22f
Encoding categorical variables:
	Sex
	Embarked
Encoding numerical variables:
	Pclass
	Age
	SibSp
	Parch
	Fare
Encoding record IDs...
Encoding field names...
Saving encoding keys locally...
Saving data to buffer...
        Processed batch 4 of 4
Clustering data in buffer...
[_poll][0] {'status': 200, 'response': {'request_id': 'b65b12fb-c3db-461b-a33b-4c634b7ab22f

  y = column_or_1d(y, warn=True)


In [40]:
# Display the overall silhouette score computed over all chunks in the clustering cell.
# NOTE(review): a negative value may indicate poorly separated clusters or labels that
# are not aligned with the feature rows — confirm the row order of the API results.
score

-0.10337335076591977