In [1]:
import os
import json
import math
import numpy as np
import pandas as pd
from scipy.spatial import ConvexHull
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def load_json_file(file_path):
    with open(file_path, 'r') as file:
        json_data = json.load(file)
    return json_data

def compute_features(json_data, features):
    strokes = json_data.get('strokes', [])
    computed_features = {}

    for feature in features:
        if feature == 'total_strokes':
            computed_features['total_strokes'] = len(strokes)
        
        elif feature == 'total_points':
            total_points = sum(len(stroke) for stroke in strokes)
            computed_features['total_points'] = total_points
        
        elif feature == 'average_points_per_stroke':
            if len(strokes) > 0:
                avg_points_per_stroke = sum(len(stroke) for stroke in strokes) / len(strokes)
            else:
                avg_points_per_stroke = 0
            computed_features['average_points_per_stroke'] = avg_points_per_stroke
        
        elif feature == 'drawing_duration':
            if strokes:
                start_times = [min(point['t'] for point in stroke) for stroke in strokes if stroke]
                end_times = [max(point['t'] for point in stroke) for stroke in strokes if stroke]
                drawing_duration = max(end_times) - min(start_times)
            else:
                drawing_duration = 0
            computed_features['drawing_duration'] = drawing_duration
        
        elif feature == 'distance_first_to_last_point':
            if strokes:
                first_point = strokes[0][0]
                last_point = strokes[-1][-1]
                distance = math.sqrt((last_point['x'] - first_point['x'])**2 + (last_point['y'] - first_point['y'])**2)
            else:
                distance = 0
            computed_features['distance_first_to_last_point'] = distance
        
        elif feature == 'convex_hull_to_bounding_rectangle_ratio':
            if strokes:
                all_points = [(point['x'], point['y']) for stroke in strokes for point in stroke]
                if all_points:
                    min_x = min(point[0] for point in all_points)
                    max_x = max(point[0] for point in all_points)
                    min_y = min(point[1] for point in all_points)
                    max_y = max(point[1] for point in all_points)
                    bounding_rectangle_area = (max_x - min_x) * (max_y - min_y)
                    if bounding_rectangle_area > 0:
                        hull = ConvexHull(all_points)
                        convex_hull_area = hull.area
                        ratio = convex_hull_area / bounding_rectangle_area
                    else:
                        ratio = 0
                else:
                    ratio = 0
            else:
                ratio = 0
            computed_features['convex_hull_to_bounding_rectangle_ratio'] = ratio

    return computed_features

def process_drawing_files(folder_path, features):
    drawing_data = []
    labels = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.json') and ('bad_A_' in filename or 'good_A_' in filename):
            file_path = os.path.join(folder_path, filename)
            json_data = load_json_file(file_path)
            computed_features = compute_features(json_data, features)

            # Determine label based on filename prefix
            if 'bad_A_' in filename:
                label = 0
            else:
                label = 1

            # Add computed features and label to the lists
            drawing_data.append(computed_features)
            labels.append(label)

    return pd.DataFrame(drawing_data), pd.Series(labels)

# Specify the folder path where the JSON files are located
folder_path = '.'  # Assuming JSON files are in the same directory as this script

# List of features to compute
requested_features = [
    'total_strokes', 
    'total_points', 
    'average_points_per_stroke', 
    'drawing_duration',
    'distance_first_to_last_point',
    'convex_hull_to_bounding_rectangle_ratio'
]

# Process drawing files and extract features and labels
X, y = process_drawing_files(folder_path, requested_features)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize and train a logistic regression model
# model = RandomForestClassifier()
# model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.2f}")

# # Print classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))


In [2]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Split into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a machine learning model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make prediction on the testing data
y_pred = clf.predict(X_test)

# Classification Report
print(classification_report(y_pred, y_test))


              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.80      0.89         5

    accuracy                           0.86         7
   macro avg       0.83      0.90      0.84         7
weighted avg       0.90      0.86      0.86         7



In [3]:

import shap
import matplotlib.pyplot as plt
instance = 0
ex = shap.KernelExplainer(clf.predict, X_train)
shap_values = ex.shap_values(X_test.iloc[instance,:])
shap.force_plot(ex.expected_value, shap_values, X_test.iloc[instance,:])

In [4]:
import shap
import matplotlib.pyplot as plt
instance = 1
ex = shap.KernelExplainer(clf.predict, X_train)
shap_values = ex.shap_values(X_test.iloc[instance,:])
shap.force_plot(ex.expected_value, shap_values, X_test.iloc[instance,:])

In [5]:
import shap
import matplotlib.pyplot as plt
instance = 2
ex = shap.KernelExplainer(clf.predict, X_train)
shap_values = ex.shap_values(X_test.iloc[instance,:])
shap.force_plot(ex.expected_value, shap_values, X_test.iloc[instance,:])

In [6]:
import shap
import matplotlib.pyplot as plt
instance = 3
ex = shap.KernelExplainer(clf.predict, X_train)
shap_values = ex.shap_values(X_test.iloc[instance,:])
shap.force_plot(ex.expected_value, shap_values, X_test.iloc[instance,:])

In [7]:
import shap
import matplotlib.pyplot as plt
instance = 4
ex = shap.KernelExplainer(clf.predict, X_train)
shap_values = ex.shap_values(X_test.iloc[instance,:])
shap.force_plot(ex.expected_value, shap_values, X_test.iloc[instance,:])

In [8]:
import shap
import matplotlib.pyplot as plt
instance = 5
ex = shap.KernelExplainer(clf.predict, X_train)
shap_values = ex.shap_values(X_test.iloc[instance,:])
shap.force_plot(ex.expected_value, shap_values, X_test.iloc[instance,:])

In [9]:
import shap
import matplotlib.pyplot as plt
instance = 6
ex = shap.KernelExplainer(clf.predict, X_train)
shap_values = ex.shap_values(X_test.iloc[instance,:])
shap.force_plot(ex.expected_value, shap_values, X_test.iloc[instance,:])