importing libraries

In [None]:
## **IMPORTING LIBRARIES**

%pip install --upgrade pandas scikit-learn tqdm matplotlib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import joblib


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m116.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Downloading matplotlib-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

importing dataset

In [5]:
import urllib.request
import os

# Create directory for the dataset
os.makedirs('Users/pemphokatsala/DATA', exist_ok=True)

# Download the dataset
dataset_url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/n3gtgm9jxj-2.zip"
zip_path = "Users/pemphokatsala/DATA/waste_classification.zip"

print("Downloading dataset...")
urllib.request.urlretrieve(dataset_url, zip_path)
print(f"Download complete. File saved to {zip_path}")

# Check the file size
file_size = os.path.getsize(zip_path)
print(f"File size: {file_size} bytes ({file_size/1024/1024:.2f} MB)")

Downloading dataset...


Download complete. File saved to Users/pemphokatsala/DATA/waste_classification.zip
File size: 223781993 bytes (213.42 MB)


extracting dataset from zip

In [7]:
import zipfile

extract_path = "/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/extracted"
os.makedirs(extract_path, exist_ok=True)

try:
    print("Extracting zip file...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("Extraction complete!")

    # List the contents of the extracted directory
    print("\nExtracted contents:")
    for root, dirs, files in os.walk(extract_path, topdown=True):
        level = root.replace(extract_path, '').count(os.sep)
        indent = ' ' * 4 * level
        print(f"{indent}{os.path.basename(root)}/")
        sub_indent = ' ' * 4 * (level + 1)
        for f in files[:5]:  # Show only first 5 files in each directory
            print(f"{sub_indent}{f}")
        if len(files) > 5:
            print(f"{sub_indent}... and {len(files)-5} more files")

except zipfile.BadZipFile:
    print("Error: The file is not a valid zip file.")

Extracting zip file...


Extraction complete!

Extracted contents:
extracted/
    Waste Classification Dataset/
        waste_dataset/
            .amlignore
            .amlignore.amltmp
            waste_dataset_augmentation.ipynb
            waste_dataset_CNN.ipynb
            waste_dataset_README.txt
            organic/
                organic_000001_photo.jpg
                organic_000002_photo.jpg
                organic_000003_photo.jpg
                organic_000004_photo.jpg
                organic_000005_photo.jpg
                ... and 13875 more files
            recyclable/
                recyclable_000001_photo.jpg
                recyclable_000002_photo.jpg
                recyclable_000003_photo.jpg
                recyclable_000004_photo.jpg
                recyclable_000005_photo.jpg
                ... and 10820 more files


 Data cleaning

In [9]:
import os
import pandas as pd
from PIL import Image
import numpy as np
from sklearn.impute import SimpleImputer

def clean_data(dataset_paths):
    """Handles duplicates, missing data, outliers, and inconsistent data in image datasets."""

    all_dfs = []  # List to store DataFrames from each path

    for dataset_path in dataset_paths:
        # Check if path exists
        if not os.path.exists(dataset_path):
            print(f"Warning: Path {dataset_path} does not exist. Skipping...")
            continue

        # Get all image files
        try:
            image_files = [f for f in os.listdir(dataset_path) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))]
        except Exception as e:
            print(f"Error accessing directory {dataset_path}: {e}")
            continue

        # Create dataframe
        data = {'filename': image_files, 'filepath': [os.path.join(dataset_path, f) for f in image_files]}
        df = pd.DataFrame(data)

        if df.empty:
            print(f"No images found in {dataset_path}. Skipping...")
            continue

        print(f"Processing {len(df)} images from {dataset_path}")

        # 1. Handle Duplicates (based on filename)
        before_drop = len(df)
        df.drop_duplicates(subset='filename', keep='first', inplace=True)
        print(f"  - Removed {before_drop - len(df)} duplicate files")

        # 2. Handle Corrupted Images
        df['corrupted'] = False  # Initialize column properly
        corrupted_count = 0

        for index, row in df.iterrows():
            try:
                img = Image.open(row['filepath'])
                img.verify()  # Verify image
                # Also try to load it to catch other potential issues
                img = Image.open(row['filepath'])
                img.load()
            except Exception as e:
                df.at[index, 'corrupted'] = True
                corrupted_count += 1

        print(f"  - Identified {corrupted_count} corrupted images")

        # 3. Handle Missing Data (image dimensions)
        dimensions = []
        for filepath in df['filepath']:
            try:
                if os.path.exists(filepath):
                    img = Image.open(filepath)
                    width, height = img.size
                    dimensions.append((width, height))
                else:
                    dimensions.append((None, None))
            except Exception:
                dimensions.append((None, None))

        df[['width', 'height']] = pd.DataFrame(dimensions, index=df.index)

        # Count missing values before imputation
        missing_width = df['width'].isna().sum()
        missing_height = df['height'].isna().sum()
        print(f"  - Missing dimensions: {missing_width} width, {missing_height} height")

        # Impute missing dimensions if there are any non-missing values
        if not df[['width', 'height']].isna().all().all():
            imputer = SimpleImputer(strategy='median')
            df[['width', 'height']] = imputer.fit_transform(df[['width', 'height']])
            print(f"  - Imputed missing dimensions with median values")

        # 4. Handle Outliers (image dimensions)
        before_outlier = len(df)

        # Only process if we have enough data for meaningful quartiles
        if len(df) > 10:
            # Function to detect and mark outliers
            def mark_outliers(df, column):
                q1 = df[column].quantile(0.25)
                q3 = df[column].quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - (1.5 * iqr)
                upper_bound = q3 + (1.5 * iqr)
                return ~((df[column] >= lower_bound) & (df[column] <= upper_bound))

            # Mark outliers for both dimensions
            df['width_outlier'] = mark_outliers(df, 'width')
            df['height_outlier'] = mark_outliers(df, 'height')

            # Remove rows where both width and height are outliers
            outliers = df['width_outlier'] & df['height_outlier']
            df = df[~outliers]

            # Clean up the temporary columns
            df = df.drop(['width_outlier', 'height_outlier'], axis=1)

            print(f"  - Removed {before_outlier - len(df)} outlier images")

        # 5. Handle Inconsistent Data (filename case)
        df['filename'] = df['filename'].str.lower()

        # Remove corrupted images from final dataset
        before_corrupt_removal = len(df)
        df = df[df['corrupted'] == False]
        print(f"  - Removed {before_corrupt_removal - len(df)} corrupted images")

        # Drop the corrupted column
        df = df.drop('corrupted', axis=1)

        print(f"  - Final count: {len(df)} clean images\n")
        all_dfs.append(df)  # Append the cleaned DataFrame to the list

    # Concatenate all DataFrames if we have any
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        print(f"Total clean images across all directories: {len(combined_df)}")
        return combined_df
    else:
        print("No valid images found in any of the provided paths.")
        return pd.DataFrame()  # Return empty DataFrame if no valid data

# Add a category label based on the directory
def add_category_labels(df):
    """Add category labels based on the filepath"""
    df['category'] = df['filepath'].apply(lambda x: os.path.basename(os.path.dirname(x)))
    return df

def save_cleaned_data(cleaned_df, output_base_dir, target_size=(256, 256)):
    """
    Save cleaned data to a new directory structure, preserving categories and standardizing images.

    Args:
        cleaned_df: DataFrame with 'filepath' and 'filename' columns
        output_base_dir: Base directory to save cleaned data
        target_size: Tuple of (width, height) to resize images to

    Returns:
        Dictionary mapping original categories to new save paths
    """
    # Create the base output directory
    os.makedirs(output_base_dir, exist_ok=True)
    print(f"Created output directory: {output_base_dir}")

    # Create a dictionary to track categories and their save paths
    category_paths = {}
    saved_count = 0
    error_count = 0

    # For each file in the cleaned DataFrame
    for idx, row in cleaned_df.iterrows():
        try:
            # Extract category from original filepath
            category = os.path.basename(os.path.dirname(row['filepath']))

            # Create category directory if it doesn't exist in our tracking dict
            if category not in category_paths:
                category_dir = os.path.join(output_base_dir, category)
                os.makedirs(category_dir, exist_ok=True)
                category_paths[category] = category_dir
                print(f"Created category directory: {category_dir}")

            # Load the original image
            img = Image.open(row['filepath'])

            # Standardize to desired format (RGB, specific size)
            img = img.convert('RGB')
            img = img.resize(target_size, Image.Resampling.LANCZOS)

            # Define save path
            save_path = os.path.join(category_paths[category], row['filename'])

            # Save the image
            img.save(save_path)
            saved_count += 1

            # Print progress update for every 100 images
            if saved_count % 100 == 0:
                print(f"Saved {saved_count} images...")

        except Exception as e:
            print(f"Error saving {row['filepath']}: {e}")
            error_count += 1

    print(f"Successfully saved {saved_count} cleaned images")
    if error_count > 0:
        print(f"Encountered errors with {error_count} images")

    # Report counts per category
    for category, path in category_paths.items():
        count = len([f for f in os.listdir(path) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))])
        print(f"Category '{category}': {count} images")

    return category_paths

# Main function to execute the entire workflow
def main():
    # Define the paths to your dataset directories
    filepath1 = r'/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/extracted/Waste Classification Dataset/waste_dataset/organic'
    filepath2 = r'/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/extracted/Waste Classification Dataset/waste_dataset/recyclable'

    # Define output directory for cleaned data
    cleaned_output_dir = "/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/cleaned_waste_dataset"

    # Check if directories exist
    print("Checking dataset paths...")
    for path in [filepath1, filepath2]:
        if os.path.exists(path):
            print(f"Path exists: {path}")
            print(f"Contains {len([f for f in os.listdir(path) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))])} images")
        else:
            print(f"Path does not exist: {path}")

    # Only proceed with paths that exist
    valid_paths = [path for path in [filepath1, filepath2] if os.path.exists(path)]

    if valid_paths:
        print("\n=== STEP 1: Cleaning the dataset ===")
        cleaned_df = clean_data(valid_paths)

        if not cleaned_df.empty:
            print("\n=== STEP 2: Adding category labels ===")
            labeled_df = add_category_labels(cleaned_df)
            print("Category distribution:")
            print(labeled_df['category'].value_counts())

            print("\n=== STEP 3: Saving standardized images ===")
            # Save cleaned and standardized images
            category_paths = save_cleaned_data(labeled_df, cleaned_output_dir, target_size=(256, 256))

            print("\n=== Process Complete ===")
            print(f"Cleaned data saved to: {cleaned_output_dir}")
            print("The cleaned dataset is now ready for model training")
        else:
            print("No valid images found after cleaning. Please check your dataset.")
    else:
        print("No valid paths to process. Please check your directory structure.")

# Execute the main function
if __name__ == "__main__":
    main()

Checking dataset paths...
Path exists: /home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/extracted/Waste Classification Dataset/waste_dataset/organic


Contains 13880 images
Path exists: /home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/extracted/Waste Classification Dataset/waste_dataset/recyclable
Contains 10825 images

=== STEP 1: Cleaning the dataset ===
Processing 13880 images from /home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/extracted/Waste Classification Dataset/waste_dataset/organic
  - Removed 0 duplicate files
  - Identified 0 corrupted images
  - Missing dimensions: 0 width, 0 height
  - Imputed missing dimensions with median values
  - Removed 21 outlier images
  - Removed 0 corrupted images
  - Final count: 13859 clean images

Processing 10825 images from /home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/extracted/Waste Classification Dataset/waste_dataset/recyclable
  - Removed 0 duplicate files
  - Identified 0 corrupted images
  - Missing dimensions: 0 width, 0 height
  - Imputed missing dimensions with median values
  - Removed 44 outlier images
  - Removed 0 corrupted images
  - Final count:

image processing

In [28]:
import tensorflow.keras.preprocessing.image 

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [7]:
%pip install scikit-image

Collecting scikit-image
  Downloading scikit_image-0.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting numpy>=1.24 (from scikit-image)
  Downloading numpy-2.2.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting imageio!=2.35.0,>=2.33 (from scikit-image)
  Downloading imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting tifffile>=2022.8.12 (from scikit-image)
  Downloading tifffile-2025.3.30-py3-none-any.whl.metadata (32 kB)
Collecting lazy-loader>=0.4 (from scikit-image)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Downloading scikit_image-0.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imageio-2.37.0-py3-none-any.whl (315 kB)
Downloading lazy_loader-0.4-py3-none-any.whl (12 kB)
Downloading numpy-2.2.4-cp310-cp310-many

In [9]:
import numpy as np
import pandas as pd
import os
import pickle
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.preprocessing import LabelEncoder
from skimage.feature import hog
from skimage.color import rgb2gray

def extract_features(image_path):
    """Extract HOG features from 256x256 normalized images"""
    try:
        # Load and standardize image size
        img = load_img(image_path, target_size=(256, 256))

        # Normalize pixel values to [0,1]
        img_array = img_to_array(img) / 255.0

        # Convert to grayscale for HOG
        img_gray = rgb2gray(img_array)

        # Extract HOG features
        return hog(img_gray,
                 orientations=8,
                 pixels_per_cell=(8, 8),
                 cells_per_block=(2, 2),
                 transform_sqrt=True)
    except Exception as e:
        raise RuntimeError(f"Error processing {image_path}: {str(e)}")

def build_dataset(base_dir):
    """Create structured dataset from directory"""
    image_data = []
    class_counts = {'organic': 0, 'recyclable': 0}

    for class_name in ['organic', 'recyclable']:
        class_dir = os.path.join(base_dir, class_name)
        if not os.path.exists(class_dir):
            raise FileNotFoundError(f"Missing directory: {class_dir}")

        for fname in os.listdir(class_dir):
            if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                image_data.append({
                    'path': os.path.join(class_dir, fname),
                    'label': class_name
                })
                class_counts[class_name] += 1

    print("Dataset composition:")
    print(f"  Organic: {class_counts['organic']} images")
    print(f"  Recyclable: {class_counts['recyclable']} images")
    return pd.DataFrame(image_data)

# Main processing workflow
base_dir = "/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/cleaned_waste_dataset"
df = build_dataset(base_dir)

# Feature extraction
features = []
labels = []
valid_paths = []

print("\nFeature extraction progress:")
for idx, row in df.iterrows():
    try:
        features.append(extract_features(row['path']))
        labels.append(row['label'])
        valid_paths.append(row['path'])
    except Exception as e:
        print(f"Skipped {row['path']}: {str(e)}")

# Convert to numpy arrays
features_array = np.array(features)
labels_array = LabelEncoder().fit_transform(labels)

# Save processed data
output_dir = "/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/processed_waste_dataset"
os.makedirs(output_dir, exist_ok=True)

np.savez(os.path.join(output_dir, 'processed_data.npz'),
         features=features_array,
         labels=labels_array)

pd.DataFrame({'path': valid_paths, 'label': labels})\
  .to_csv(os.path.join(output_dir, 'metadata.csv'), index=False)

print("\nProcessing completed!")
print(f"Final dataset size: {features_array.shape[0]} samples")
print(f"Feature vector length: {features_array.shape[1]}")
print(f"Saved to: {output_dir}")

Dataset composition:
  Organic: 13859 images
  Recyclable: 10781 images

Feature extraction progress:

Processing completed!
Final dataset size: 24640 samples
Feature vector length: 30752
Saved to: /home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/processed_waste_dataset


data scaling

In [12]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

# 1. Load features in memory-mapped mode
features_memmap = np.load('/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/processed_waste_dataset/processed_data.npz', mmap_mode='r')
features = features_memmap['features']

# 2. Initialize StandardScaler
scaler = StandardScaler()

# 3. Chunked fitting
chunk_size = 1000  # Adjust based on available RAM
for i in range(0, len(features), chunk_size):
    chunk = features[i:i+chunk_size]
    scaler.partial_fit(chunk)  # Incremental mean/variance calculation

# 4. Chunked transformation with disk backing
output_path = '/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/standardized_features.dat'
standardized = np.memmap(output_path, dtype=np.float32,
                        mode='w+', shape=features.shape)

for i in range(0, len(features), chunk_size):
    chunk = features[i:i+chunk_size]
    standardized[i:i+chunk_size] = scaler.transform(chunk)

# 5. Verify results
print("Standardized data stats:")
print(f"Mean: {np.mean(standardized):.4f}")
print(f"Std: {np.std(standardized):.4f}")

# 6. Cleanup
del features_memmap  # Release memory map
standardized.flush()  # Ensure data is written to disk
# Load the standardized features
standardized = np.memmap(
    '/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/standardized_features.dat',
    dtype=np.float32,
    mode='r',  # Read-only mode
    shape=features.shape  # Original feature dimensions
)

# Example: First 5 samples
print(standardized[:5])

Standardized data stats:
Mean: -0.0000
Std: 1.0000
[[-0.92890495 -0.6085843  -0.642102   ... -0.65024    -0.6757899
  -0.65263134]
 [ 2.0051558  -0.6085843   1.1146678  ... -0.65024    -0.6757899
  -0.65263134]
 [ 1.7420782   3.8632255   3.4329965  ... -0.65024    -0.6757899
  -0.65263134]
 [-0.92890495 -0.6085843  -0.642102   ... -0.65024    -0.6757899
  -0.65263134]
 [-0.92890495 -0.6085843  -0.642102   ... -0.65024    -0.6757899
  -0.65263134]]


loading X and Y

In [14]:
# Loading the saved data
import numpy as np
pd= r'/home/azureuser/cloudfiles/code/Users/pemphokatsala/DATA/processed_waste_dataset/processed_data.npz'
data = np.load(pd)
X = data['features']
y = data['labels']

training Logistic Regression model

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib
model = LogisticRegression(
    max_iter=200,
   solver='lbfgs',
    #tol=1e-4,
    #random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)
model.fit(X_train, y_train)
model.predict(X_test)
joblib.dump(model, 'LR_model.pkl')
#model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['LR_model.pkl']

Training SCV Model

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.utils import shuffle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# 1. Split into train/test FIRST
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Initialize scaler and fit on ENTIRE training data
scaler = StandardScaler()
scaler.fit(X_train)  # Critical: Use full training data for scaling

# 3. Scale all data upfront
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Shuffle training data
X_train_scaled, y_train = shuffle(X_train_scaled, y_train, random_state=42)

# 5. Initialize model correctly
model = SVC()

# 6. Batch training parameters
batch_size = 100  # Increased from 10
num_batches = len(X_train_scaled) // batch_size

# 7. Training loop with validation
for batch_idx in range(num_batches):
    # Get batch
    start = batch_idx * batch_size
    end = start + batch_size
    X_batch = X_train_scaled[start:end]
    y_batch = y_train[start:end]

    # Train incrementally
    model.fit(X_batch, y_batch)

    # Validate on test set
    test_acc = model.score(X_test_scaled, y_test)
    print(f"Batch {batch_idx+1}/{num_batches} | Test Accuracy: {test_acc:.4f}")

# Save model and scaler
joblib.dump(model, 'SVC_model.pkl')
#joblib.dump(scaler, 'scaler.pkl')

Batch 1/197 | Test Accuracy: 0.7108
Batch 2/197 | Test Accuracy: 0.6816
Batch 3/197 | Test Accuracy: 0.7250
Batch 4/197 | Test Accuracy: 0.7372
Batch 5/197 | Test Accuracy: 0.6753
Batch 6/197 | Test Accuracy: 0.7252
Batch 7/197 | Test Accuracy: 0.6903
Batch 8/197 | Test Accuracy: 0.7263
Batch 9/197 | Test Accuracy: 0.6838
Batch 10/197 | Test Accuracy: 0.7474
Batch 11/197 | Test Accuracy: 0.6950
Batch 12/197 | Test Accuracy: 0.7161
Batch 13/197 | Test Accuracy: 0.6122
Batch 14/197 | Test Accuracy: 0.7145
Batch 15/197 | Test Accuracy: 0.6918
Batch 16/197 | Test Accuracy: 0.6392
Batch 17/197 | Test Accuracy: 0.7108
Batch 18/197 | Test Accuracy: 0.6832
Batch 19/197 | Test Accuracy: 0.7228
Batch 20/197 | Test Accuracy: 0.7007
Batch 21/197 | Test Accuracy: 0.6362
Batch 22/197 | Test Accuracy: 0.7242
Batch 23/197 | Test Accuracy: 0.7177
Batch 24/197 | Test Accuracy: 0.5968
Batch 25/197 | Test Accuracy: 0.6658
Batch 26/197 | Test Accuracy: 0.7413
Batch 27/197 | Test Accuracy: 0.6845
Batch 28/1

['SVC_model.pkl']

Checking the models' accuracy

In [17]:
import joblib # This is to load your model
from sklearn.model_selection import train_test_split

# Load the saved model
md = r'/home/azureuser/cloudfiles/code/LR_model.pkl'
md2 = r'/home/azureuser/cloudfiles/code/SVC_model.pkl'
model = joblib.load(md)
model2 = joblib.load(md2)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# Now you can predict and score without fitting again:
predictions = model.predict(X_test)
accuracy = model.score(X_test, y_test)

predictions2 = model2.predict(X_test)
accuracy2 = model2.score(X_test, y_test)

print(f"Model accuracy: {accuracy}")
print(f"Model accuracy: {accuracy2}")

Model accuracy: 0.9435876623376623
Model accuracy: 0.5515422077922078
