In [1]:
!pip install pyspark
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp310-cp310-linux_x86_64.whl (174.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 MB[0m [31m182.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.20.1%2Bcpu-cp310-cp310-linux_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m127.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.5.1%2Bcpu-cp310-cp310-linux_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m124.5 MB/s[0m eta [36m0:00:00[0m
Collecting sympy==1.13.1 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K  

In [23]:
# from pyspark.sql import SparkSession

# # Initialize Spark session
# spark = SparkSession.builder \
#     .appName("FoodClassification") \
#     .config("spark.executor.memory", "4g") \
#     .config("spark.driver.memory", "4g") \
#     .getOrCreate()


In [3]:
spark.range(5).show()


+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



[Stage 0:>                                                          (0 + 4) / 4]                                                                                

In [10]:
import torch
import torchvision
from torchvision import datasets, transforms
import boto3
import os
from pathlib import Path
import s3fs
import boto3
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import io

In [11]:
# AWS S3 Bucket Configuration
s3_bucket = 'bdafoodimages-unzipped'
train_folder = 'dataset/train'
val_folder = 'dataset/val'
test_folder = 'dataset/test'


# Initialize S3 FileSystem to access the bucket
fs = s3fs.S3FileSystem()


In [12]:
s3_client = boto3.client('s3', region_name='us-east-1')  # Specify the correct region


In [13]:
# List files in S3 folder (returns image paths in the folder)
def list_files_in_s3(bucket_name, folder_path):
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=folder_path)
    if 'Contents' in response:
        return [content['Key'] for content in response['Contents']]
    else:
        return []

# Fetch image paths from S3
def fetch_images_from_s3(bucket_name, folder_path):
    files = list_files_in_s3(bucket_name, folder_path)
    images = []
    for file in files:
        if file.endswith(('jpg', 'jpeg', 'png')):  # Filter only image files
            images.append(file)
    return images

In [14]:
# Image transformation pipeline
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize image to 128x128
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize as ImageNet
])


In [15]:
# Custom Dataset to load images from S3
class S3ImageDataset(Dataset):
    def __init__(self, bucket_name, folder_path, transform=None):
        self.bucket_name = bucket_name
        self.folder_path = folder_path
        self.files = fetch_images_from_s3(bucket_name, folder_path)  # List of image paths
        self.transform = transform
        self.class_to_idx = {class_name: idx for idx, class_name in enumerate(sorted(set([file.split('/')[2] for file in self.files])))}
        self.idx_to_class = {idx: class_name for class_name, idx in self.class_to_idx.items()}

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        # Get the S3 object key
        file_path = self.files[idx]
        
        # Download the image from S3 to memory
        response = s3_client.get_object(Bucket=self.bucket_name, Key=file_path)
        img_data = response['Body'].read()
        
        # Open image using PIL
        img = Image.open(io.BytesIO(img_data))

        # Extract the label from the file path (folder name is the label)
        label_str = file_path.split('/')[2]  # Modify this based on your folder structure
        label = self.class_to_idx[label_str]  # Convert label to index
        
        if self.transform:
            img = self.transform(img)
        
        return img, label


In [16]:
# Create DataLoader instances for training, validation, and testing
train_dataset = S3ImageDataset(s3_bucket, train_folder, transform)
val_dataset = S3ImageDataset(s3_bucket, val_folder, transform)
test_dataset = S3ImageDataset(s3_bucket, test_folder, transform)

In [17]:
# Create DataLoaders for batching and shuffling
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Debugging: Check the first batch
for inputs, labels in train_loader:
    print(f"Batch input shape: {inputs.shape}, Labels: {labels}")
    break

Batch input shape: torch.Size([32, 3, 128, 128]), Labels: tensor([1, 2, 2, 0, 0, 1, 1, 2, 2, 1, 0, 1, 0, 2, 0, 1, 0, 2, 1, 2, 2, 0, 1, 1,
        0, 1, 0, 2, 1, 0, 0, 0])


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 


class SimpleCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Corrected input size for fc1
        self.fc1 = nn.Linear(128 * 32 * 32, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x



# Initialize model
model = SimpleCNN(num_classes=3)  # Adjust the number of classes


In [19]:
# Loss function (CrossEntropyLoss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Optimizer (Adam optimizer)
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [20]:
import torch

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the device (GPU/CPU)
model.to(device)

num_epochs = 10  # Set the number of epochs based on your preference

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        # Move data to the device (GPU or CPU)
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Track loss and accuracy
        running_loss += loss.item()

        # Get the predictions from the model
        _, predicted = torch.max(outputs, 1)

        # Track correct predictions and total predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Calculate loss and accuracy for this epoch
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total

    # Print loss and accuracy for this epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    # Add validation code here if desired


Epoch 1/10, Loss: 3.7745, Accuracy: 33.54%
Epoch 2/10, Loss: 1.0693, Accuracy: 44.65%
Epoch 3/10, Loss: 0.8110, Accuracy: 59.40%
Epoch 4/10, Loss: 0.5362, Accuracy: 77.78%
Epoch 5/10, Loss: 0.2923, Accuracy: 90.13%
Epoch 6/10, Loss: 0.1537, Accuracy: 95.85%
Epoch 7/10, Loss: 0.1376, Accuracy: 96.16%
Epoch 8/10, Loss: 0.0541, Accuracy: 99.17%
Epoch 9/10, Loss: 0.0479, Accuracy: 99.07%
Epoch 10/10, Loss: 0.0354, Accuracy: 99.27%


In [21]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Gather all true labels and predictions for test data
true_labels = []
predictions = []
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        true_labels.extend(labels.cpu().numpy())
        predictions.extend(predicted.cpu().numpy())

# Print classification report
print(classification_report(true_labe
conf_matrix = confusion_matrix(true_labels, predictions)
print("Confusion Matrix:\n", conf_matrix)


              precision    recall  f1-score   support

           0       0.11      0.98      0.20        49
           1       0.15      0.84      0.25        45
           2       0.17      0.92      0.29        50
           3       0.00      0.00      0.00        48
           4       0.00      0.00      0.00        50
           5       0.00      0.00      0.00        42
           6       0.00      0.00      0.00        48
           7       0.00      0.00      0.00        49
           8       0.00      0.00      0.00        50
           9       0.00      0.00      0.00        49
          10       0.00      0.00      0.00        46
          11       0.00      0.00      0.00        47
          12       0.00      0.00      0.00        49
          13       0.00      0.00      0.00        47
          14       0.00      0.00      0.00        49
          15       0.00      0.00      0.00        50
          16       0.00      0.00      0.00        50
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
import torch

# Set the model to evaluation mode
model.eval()

# Initialize variables to track the number of correct predictions and total samples
correct = 0
total = 0

# Disable gradient calculation during evaluation for efficiency
with torch.no_grad():
    for inputs, labels in test_loader:
        # Move data to the device (GPU or CPU)
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)

        # Get the predicted class by choosing the class with the maximum output probability
        _, predicted = torch.max(outputs, 1)

        # Track the number of correct predictions and the total number of samples
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate the accuracy
accuracy = 100 * correct / total

# Print the accuracy on the test set
print(f"Test Accuracy: {accuracy:.2f}%")


Test Accuracy: 13.72%


In [24]:
# Save the trained model
model_save_path = 'food_image_classifier.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to food_image_classifier.pth


In [25]:
# Upload the saved model to S3
model_s3_path = 'models/food_image_classifier.pth'  # Specify the folder and file name in S3
s3_client.upload_file(model_save_path, s3_bucket, model_s3_path)
print(f"Model uploaded to S3 at s3://{s3_bucket}/{model_s3_path}")


Model uploaded to S3 at s3://bdafoodimages-unzipped/models/food_image_classifier.pth


In [26]:
# Load the saved model
model_load_path = 'food_image_classifier.pth'  # Path where the model is saved locally or download from S3
model = SimpleCNN(num_classes=len(train_dataset.class_to_idx))
model.load_state_dict(torch.load(model_load_path))
model.to(device)
print("Model loaded successfully!")


  model.load_state_dict(torch.load(model_load_path))


Model loaded successfully!


In [52]:
from PIL import Image
from torchvision import transforms
import io
import boto3

# AWS S3 Bucket Configuration
s3_bucket = 'bdafoodimages-unzipped'
image_key = 'dataset/val/Alu_Matar/Image_108.jpg'  # Path in your S3 bucket

# Fetch the image from S3
s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket=s3_bucket, Key=image_key)
img_data = response['Body'].read()

# Open the image using PIL
img = Image.open(io.BytesIO(img_data))

# Define the image transform
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Preprocess the image
img = transform(img).unsqueeze(0)  # Add batch dimension

# Move to the appropriate device
img = img.to(device)

# Perform inference
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(img)
    _, predicted = torch.max(outputs, 1)

# Get the predicted class
predicted_class = train_dataset.idx_to_class[predicted.item()]  # Use your class mapping
print(f"Predicted class: {predicted_class}")


Predicted class: Alu_Gobi


# STREAMLIT

In [43]:
!pip install streamlit




In [44]:
# Create the Streamlit app script (app.py)
streamlit_code = """
import streamlit as st
from PIL import Image
import torch
from torchvision import transforms
from torch import nn

# Define your model architecture (it should be the same as the model you trained)
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=300):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(16 * 64 * 64, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = x.view(-1, 16 * 64 * 64)  # Flatten the tensor for the fully connected layer
        x = self.fc1(x)
        return x

# Load the saved model
model = SimpleCNN(num_classes=300)
model.load_state_dict(torch.load('food_image_classifier.pth'))
model.eval()

# Define the transformation used for the image during inference
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Streamlit UI elements
st.title('Food Image Classifier')
st.write("Upload an image to classify.")

# File uploader
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Load and display the image
    image = Image.open(uploaded_file)
    st.image(image, caption='Uploaded Image.', use_column_width=True)
    st.write("")
    
    # Preprocess the image
    img_tensor = transform(image).unsqueeze(0)  # Add batch dimension
    img_tensor = img_tensor.to('cuda' if torch.cuda.is_available() else 'cpu')

    # Make prediction
    with torch.no_grad():
        output = model(img_tensor)
        _, predicted = torch.max(output, 1)
        
    predicted_class = f"Predicted Class: {predicted.item()}"  # You can map this to actual class names
    st.write(predicted_class)
"""

# Save the Streamlit app to a file
with open("app.py", "w") as f:
    f.write(streamlit_code)


In [40]:
with open("streamlit_app.py", "w") as f:
    f.write(streamlit_code)


In [47]:
!ps aux | grep streamlit


ec2-user 18287  0.0  0.0 119860  2732 pts/0    Ss+  11:05   0:00 /bin/sh -c ps aux | grep streamlit
ec2-user 18295  0.0  0.0 119420   928 pts/0    S+   11:05   0:00 grep streamlit
