In [25]:
 #Import Libraries
import pandas as pd
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
import random
import time

In [26]:
# STEP 1: Load Datasets
items_df = pd.read_csv('input_items.csv')
containers_df = pd.read_csv('containers.csv')

In [27]:
# ✅ Rename columns to match CargoPlacer expectations
items_df = items_df.rename(columns={
    'width_cm': 'width',
    'depth_cm': 'depth',
    'height_cm': 'height',
    'item_volume': 'volume'
})


In [28]:
# Check the first 57 rows of the containers dataset
containers_df.head(58)

Unnamed: 0,zone,container_id,width_cm,depth_cm,height_cm
0,Sanitation_Bay,SB01,25,42.5,200
1,Sanitation_Bay,SB02,100,85.0,200
2,Sanitation_Bay,SB03,100,85.0,200
3,Sanitation_Bay,SB04,200,85.0,200
4,Command_Center,CC01,100,85.0,200
5,Command_Center,CC02,100,170.0,200
6,Command_Center,CC03,100,85.0,200
7,Engineering_Bay,EB01,100,85.0,50
8,Engineering_Bay,EB02,25,85.0,200
9,Power_Bay,PB01,100,85.0,200


In [29]:
# Check the first 2000 rows of the input_items dataset
items_df.head(2000)

Unnamed: 0,item_id,name,width,depth,height,mass_kg,priority,expiry_date,usage_limit,preferred_zone
0,1,Research_Samples,26.8,17.5,19.4,2.40,84,,2304,Storage_Bay
1,2,LED_Work_Light,49.9,36.3,44.2,40.03,90,,3558,Maintenance_Bay
2,3,Pressure_Regulator,48.1,33.2,43.1,34.41,16,,1075,Airlock
3,4,Emergency_Oxygen_Mask,15.6,46.5,17.0,6.17,42,,709,Medical_Bay
4,5,Battery_Pack,23.2,31.6,18.2,6.67,93,,175,External_Storage
...,...,...,...,...,...,...,...,...,...,...
1995,1996,Water_Bottle,38.1,18.6,34.3,12.15,97,,2312,Storage_Bay
1996,1997,Water_Purification_Unit,48.2,49.5,47.6,56.78,92,,4555,Life_Support
1997,1998,Thruster_Fuel,25.4,25.1,39.8,12.69,8,,2585,Engine_Bay
1998,1999,Emergency_Oxygen_Mask,49.1,27.9,43.5,29.80,95,,4721,Crew_Quarters


In [30]:
#STEP2
# Calculate volume for items
items_df['item_volume'] = items_df['width'] * items_df['depth'] * items_df['height']

# Calculate volume for containers
containers_df['container_volume'] = containers_df['width_cm'] * containers_df['depth_cm'] * containers_df['height_cm']

# Display first 2000 item volumes
print("📦 First few item volumes:")
print(items_df[['item_id', 'item_volume']].head(2000))

# Display first 57 container volumes
print("\n🚚 First few container volumes:")
print(containers_df[['container_id', 'container_volume']].head(58))

📦 First few item volumes:
      item_id  item_volume
0           1     9098.600
1           2    80062.554
2           3    68827.252
3           4    12331.800
4           5    13342.784
...       ...          ...
1995     1996    24307.038
1996     1997   113568.840
1997     1998    25374.092
1998     1999    59590.215
1999     2000     6524.980

[2000 rows x 2 columns]

🚚 First few container volumes:
   container_id  container_volume
0          SB01          212500.0
1          SB02         1700000.0
2          SB03         1700000.0
3          SB04         3400000.0
4          CC01         1700000.0
5          CC02         3400000.0
6          CC03         1700000.0
7          EB01          425000.0
8          EB02          425000.0
9          PB01         1700000.0
10         PB02          425000.0
11         PB03         1700000.0
12         PB04          850000.0
13         PB05         3400000.0
14         ES01          212500.0
15         ES02          850000.0
16         ES03

In [31]:
print(items_df.columns)

Index(['item_id', 'name', 'width', 'depth', 'height', 'mass_kg', 'priority',
       'expiry_date', 'usage_limit', 'preferred_zone', 'item_volume'],
      dtype='object')


In [32]:
#STEP3(sensitivity)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  # ← this was missing

# Use 'name' column for heuristic similarity
item_names = items_df['name'].astype(str).values

# Convert names into TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(item_names)

# Compute cosine similarity
similarity_matrix = cosine_similarity(tfidf_matrix)

# Mark items as sensitive if similarity is below a threshold with any other item
# Here we assume low similarity means potential incompatibility (e.g. explosive when mixed)
sensitive_flags = (similarity_matrix < 0.2).sum(axis=1) > 0

items_df['sensitive'] = sensitive_flags.astype(int)

# Show updated dataframe with sensitivity
items_df[['item_id', 'name', 'sensitive']].head(2000)


Unnamed: 0,item_id,name,sensitive
0,1,Research_Samples,1
1,2,LED_Work_Light,1
2,3,Pressure_Regulator,1
3,4,Emergency_Oxygen_Mask,1
4,5,Battery_Pack,1
...,...,...,...
1995,1996,Water_Bottle,1
1996,1997,Water_Purification_Unit,1
1997,1998,Thruster_Fuel,1
1998,1999,Emergency_Oxygen_Mask,1


In [33]:
#STEP3(zone wise categorization)
# Sort items by priority (higher = higher priority)
sorted_items = items_df.sort_values(by='priority', ascending=False)

print("📦 First 2000 high-priority items:")
print(sorted_items[['item_id', 'priority', 'name', 'preferred_zone']].head(2000))


📦 First 2000 high-priority items:
      item_id  priority                  name    preferred_zone
1905     1906       100         First_Aid_Kit       Medical_Bay
78         79       100          CO2_Scrubber   Engineering_Bay
889       890       100          Battery_Pack         Power_Bay
850       851       100      Emergency_Beacon           Cockpit
1923     1924       100           Tether_Reel  External_Storage
...       ...       ...                   ...               ...
1193     1194         1      Gyroscope_Module   Engineering_Bay
708       709         1          Water_Bottle     Crew_Quarters
713       714         1  Microgravity_Lab_Kit               Lab
1474     1475         1          Protein_Bars     Crew_Quarters
1487     1488         1          Seed_Packets        Greenhouse

[2000 rows x 4 columns]


In [34]:
# Add this AFTER loading the containers_df and before STEP4
containers_df['remaining_volume'] = containers_df['width_cm'] * containers_df['depth_cm'] * containers_df['height_cm']

In [35]:
item_container_map = {}  # item_id → container_id

In [36]:
# ✅ Integrated Placement Logic (Volume + Spatial Fit)
item_container_map = {}
container_slots = {}  # Tracks {container_id: {'x': 0, 'y': 0, 'z': 0}}
unplaced_items = []

# Sort items by priority (highest first)
sorted_items = items_df.sort_values(by='priority', ascending=False)

for _, item in sorted_items.iterrows():
    item_id = item['item_id']
    item_w, item_d, item_h = item[['width', 'depth', 'height']]
    preferred_zone = item['preferred_zone']
    placed = False

    # 1. Try preferred zone containers first
    preferred_containers = containers_df[
        (containers_df['zone'] == preferred_zone) &
        (containers_df['remaining_volume'] >= (item_w * item_d * item_h))
    ].sort_values(by='remaining_volume', ascending=False)

    for _, container in preferred_containers.iterrows():
        container_id = container['container_id']
        # Access the correct container dimensions using _cm suffix
        cont_w, cont_d, cont_h = container[['width_cm', 'depth_cm', 'height_cm']]

        # Initialize container slot if not exists
        if container_id not in container_slots:
            container_slots[container_id] = {'x': 0, 'y': 0, 'z': 0}

        slot = container_slots[container_id]

        # Check spatial fit
        if (slot['x'] + item_w <= cont_w) and \
           (slot['y'] + item_d <= cont_d) and \
           (slot['z'] + item_h <= cont_h):

            # Assign coordinates
            item_container_map[item_id] = {
                'container_id': container_id,
                'x_cm': slot['x'],
                'y_cm': slot['y'],
                'z_cm': slot['z']
            }

            # Update slot position (stack vertically first)
            slot['z'] += item_h
            if slot['z'] + item_h > cont_h:
                slot['z'] = 0
                slot['x'] += item_w
                if slot['x'] + item_w > cont_w:
                    slot['x'] = 0
                    slot['y'] += item_d

            # Update container volume
            containers_df.loc[containers_df['container_id'] == container_id, 'remaining_volume'] -= (item_w * item_d * item_h)
            placed = True
            break

    # 2. Fallback to any container
    if not placed:
        candidate_containers = containers_df[
            containers_df['remaining_volume'] >= (item_w * item_d * item_h)
        ].sort_values(by='remaining_volume', ascending=False)

        for _, container in candidate_containers.iterrows():
            container_id = container['container_id']
            # Access the correct container dimensions using _cm suffix
            cont_w, cont_d, cont_h = container[['width_cm', 'depth_cm', 'height_cm']]

            if container_id not in container_slots:
                container_slots[container_id] = {'x': 0, 'y': 0, 'z': 0}

            slot = container_slots[container_id]

            if (slot['x'] + item_w <= cont_w) and \
               (slot['y'] + item_d <= cont_d) and \
               (slot['z'] + item_h <= cont_h):

                item_container_map[item_id] = {
                    'container_id': container_id,
                    'x_cm': slot['x'],
                    'y_cm': slot['y'],
                    'z_cm': slot['z']
                }

                slot['z'] += item_h
                if slot['z'] + item_h > cont_h:
                    slot['z'] = 0
                    slot['x'] += item_w
                    if slot['x'] + item_w > cont_w:
                        slot['x'] = 0
                        slot['y'] += item_d

                containers_df.loc[containers_df['container_id'] == container_id, 'remaining_volume'] -= (item_w * item_d * item_h)
                placed = True
                break

    # 3. Mark as unplaced if both attempts fail
    if not placed:
        unplaced_items.append(item.to_dict())

print(f"✅ Initially placed: {len(item_container_map)}")
print(f"🟨 Unplaced items: {len(unplaced_items)}")

✅ Initially placed: 2000
🟨 Unplaced items: 0


In [37]:
# LIFO Fallback for unplaced items (spatially aware)
print(f"\\n🔄 LIFO fallback for {len(unplaced_items)} items...")

if len(unplaced_items) > 0:
    unplaced_df = pd.DataFrame(unplaced_items).sort_values(by='priority', ascending=False)

    for _, item in unplaced_df.iterrows():
        item_id = item['item_id']
        item_w, item_d, item_h = item['width_cm'], item['depth_cm'], item['height_cm']
        placed = False

        # Find containers sorted by remaining volume (descending)
        candidate_containers = containers_df[
            containers_df['remaining_volume'] >= (item_w * item_d * item_h)
        ].sort_values(by='remaining_volume', ascending=False)

        for _, container in candidate_containers.iterrows():
            container_id = container['container_id']
            cont_w, cont_d, cont_h = container[['width_cm', 'depth_cm', 'height_cm']]

            if container_id not in container_slots:
                container_slots[container_id] = {'x': 0, 'y': 0, 'z': 0}

            slot = container_slots[container_id]

            # Check spatial fit
            if (slot['x'] + item_w <= cont_w) and \
               (slot['y'] + item_d <= cont_d) and \
               (slot['z'] + item_h <= cont_h):

                item_container_map[item_id] = {
                    'container_id': container_id,
                    'x_cm': slot['x'],
                    'y_cm': slot['y'],
                    'z_cm': slot['z']
                }

                # Update slot position
                slot['z'] += item_h
                if slot['z'] + item_h > cont_h:
                    slot['z'] = 0
                    slot['x'] += item_w
                    if slot['x'] + item_w > cont_w:
                        slot['x'] = 0
                        slot['y'] += item_d

                # Update container volume
                containers_df.loc[containers_df['container_id'] == container_id, 'remaining_volume'] -= (item_w * item_d * item_h)
                placed = True
                break

        if not placed:
            print(f"🟥 Item {item_id} failed spatial placement even after LIFO")

print(f"📊 Final success: {len(item_container_map)}/{len(items_df)} ({len(item_container_map)/len(items_df):.1%})")

\n🔄 LIFO fallback for 0 items...
📊 Final success: 2000/2000 (100.0%)


In [38]:
# STEP6A: Coordinate Export with Detailed Reporting

# Get all item IDs from the original dataset
all_item_ids = set(items_df['item_id'])

# Get placed and unplaced item IDs
placed_ids = set(item_container_map.keys())
unplaced_ids = all_item_ids - placed_ids

# Create placed items dataframe
placed_items = []
for item_id, details in item_container_map.items():
    item = items_df[items_df['item_id'] == item_id].iloc[0]
    placed_items.append({
        'item_id': item_id,
        'container_id': details['container_id'],
        'x_cm': details['x_cm'],
        'y_cm': details['y_cm'],
        'z_cm': details['z_cm'],
        'width_cm': item['width'], # Changed from item['width_cm'] to item['width']
        'depth_cm': item['depth'], # Changed from item['depth_cm'] to item['depth']
        'height_cm': item['height'], # Changed from item['height_cm'] to item['height']
        'status': 'PLACED'
    })

# Create unplaced items dataframe
unplaced_items = []
for item_id in unplaced_ids:
    item = items_df[items_df['item_id'] == item_id].iloc[0]
    unplaced_items.append({
        'item_id': item_id,
        'container_id': 'N/A',
        'x_cm': 'N/A',
        'y_cm': 'N/A',
        'z_cm': 'N/A',
        'width_cm': item['width'], # Changed from item['width_cm'] to item['width']
        'depth_cm': item['depth'], # Changed from item['depth_cm'] to item['depth']
        'height_cm': item['height'], # Changed from item['height_cm'] to item['height']
        'status': 'UNPLACED'
    })

# Combine into final dataframes
placed_df = pd.DataFrame(placed_items)
unplaced_df = pd.DataFrame(unplaced_items)

# ... (rest of the code remains the same)
# Print comprehensive report
print("\n📦 PLACEMENT STATUS REPORT")
print(f"✅ Successfully placed items: {len(placed_df)}")
print(f"🟥 Unplaced items: {len(unplaced_df)}")
print(f"📊 Placement success rate: {len(placed_df)/len(items_df):.1%}\n")

# Show sample placed items
print("🔽 FIRST 5 PLACED ITEMS:")
print(placed_df.head(2000).to_string(index=False))
print("\n🔼 LAST 5 PLACED ITEMS:")
print(placed_df.tail(2000).to_string(index=False))

# Show sample unplaced items
if not unplaced_df.empty:
    print("\n🔽 FIRST 5 UNPLACED ITEMS:")
    print(unplaced_df.head().to_string(index=False))
    print("\n🔼 LAST 5 UNPLACED ITEMS:")
    print(unplaced_df.tail().to_string(index=False))
else:
    print("\n🎉 ALL ITEMS WERE SUCCESSFULLY PLACED!")

# Save to separate CSV files
placed_df.to_csv('placed_items.csv', index=False)
unplaced_df.to_csv('unplaced_items.csv', index=False)

print("\n💾 Saved Files:")
print("- placed_items.csv (Full placement details)")
print("- unplaced_items.csv (Items needing attention)")
print(f"📁 Total items processed: {len(placed_df) + len(unplaced_df)}")


📦 PLACEMENT STATUS REPORT
✅ Successfully placed items: 2000
🟥 Unplaced items: 0
📊 Placement success rate: 100.0%

🔽 FIRST 5 PLACED ITEMS:
 item_id container_id  x_cm  y_cm  z_cm  width_cm  depth_cm  height_cm status
    1906         EA01   0.0   0.0   0.0      12.1      23.8       17.8 PLACED
      79         EB01   0.0   0.0   0.0      41.9      30.7       32.0 PLACED
     890         PB05   0.0   0.0   0.0      16.4      49.1       43.4 PLACED
     851          C01   0.0   0.0   0.0      40.2      30.5       11.3 PLACED
    1924         ES03   0.0   0.0   0.0      13.5      43.5       48.2 PLACED
     490         TA01   0.0   0.0   0.0      23.9      45.3       42.8 PLACED
     278          L04   0.0   0.0   0.0      26.6      20.5       40.5 PLACED
    1771         TA01   0.0   0.0  42.8      30.5      38.1       48.0 PLACED
     343         LS05   0.0   0.0   0.0      47.1      12.8       11.0 PLACED
    1701          A03   0.0   0.0   0.0      39.3      38.9       32.7 PLACED
   

In [39]:
print(f"🧩 Total items mapped to containers: {len(item_container_map)}")
print("🔍 Sample assignments:")
for i, (item_id, container_id) in enumerate(item_container_map.items()):
    print(f"  Item {item_id} → Container {container_id}")
    if i == 9: break  # print only 10


🧩 Total items mapped to containers: 2000
🔍 Sample assignments:
  Item 1906 → Container {'container_id': 'EA01', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 79 → Container {'container_id': 'EB01', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 890 → Container {'container_id': 'PB05', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 851 → Container {'container_id': 'C01', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 1924 → Container {'container_id': 'ES03', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 490 → Container {'container_id': 'TA01', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 278 → Container {'container_id': 'L04', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 1771 → Container {'container_id': 'TA01', 'x_cm': 0, 'y_cm': 0, 'z_cm': 42.8}
  Item 343 → Container {'container_id': 'LS05', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}
  Item 1701 → Container {'container_id': 'A03', 'x_cm': 0, 'y_cm': 0, 'z_cm': 0}


In [40]:
print(items_df.columns)
print(items_df.head())


Index(['item_id', 'name', 'width', 'depth', 'height', 'mass_kg', 'priority',
       'expiry_date', 'usage_limit', 'preferred_zone', 'item_volume',
       'sensitive'],
      dtype='object')
   item_id                   name  width  depth  height  mass_kg  priority  \
0        1       Research_Samples   26.8   17.5    19.4     2.40        84   
1        2         LED_Work_Light   49.9   36.3    44.2    40.03        90   
2        3     Pressure_Regulator   48.1   33.2    43.1    34.41        16   
3        4  Emergency_Oxygen_Mask   15.6   46.5    17.0     6.17        42   
4        5           Battery_Pack   23.2   31.6    18.2     6.67        93   

  expiry_date  usage_limit    preferred_zone  item_volume  sensitive  
0         NaN         2304       Storage_Bay     9098.600          1  
1         NaN         3558   Maintenance_Bay    80062.554          1  
2         NaN         1075           Airlock    68827.252          1  
3         NaN          709       Medical_Bay    12331.800

In [41]:
from collections import defaultdict

class CargoPlacer:
    def __init__(self, containers_df, items_df):
        self.containers = containers_df.set_index('container_id')
        self.items = items_df.set_index('item_id')
        self.container_slots = defaultdict(lambda: {'x': 0, 'y': 0, 'z': 0})
        self.accessibility_scores = {
            'Cockpit': 1, 'Command_Center': 2, 'Lab': 3,
            'Medical_Bay': 4, 'Storage_Bay': 5, 'External_Storage': 6,
            'Maintenance_Bay': 7  # Include all zones in your dataset
        }

    def find_placement(self, item_id):
        new_item = self.items.loc[item_id]
        candidates = []
        item_vol = new_item['width'] * new_item['depth'] * new_item['height']
        pref_zone = new_item.get('preferred_zone', None)

        if pref_zone:
            candidates += self._check_zone(pref_zone, new_item, item_vol)

        for zone in self.accessibility_scores:
            if zone != pref_zone:
                candidates += self._check_zone(zone, new_item, item_vol)

        candidates.sort(key=lambda x: x[2])  # sort by score
        return candidates[0] if candidates else None

    def _check_zone(self, zone, item, item_vol):
        candidates = []
        if zone not in self.accessibility_scores:
            return candidates

        zone_conts = self.containers[self.containers['zone'] == zone]
        for cont_id, cont in zone_conts.iterrows():
            if cont['remaining_volume'] >= item_vol:
                score = self.accessibility_scores[zone] * 0.7 + (cont['remaining_volume'] / cont['container_volume']) * 0.3
                position = self._find_position(cont_id, item)
                if position:
                    candidates.append((cont_id, position, score))
        return candidates

    def _find_position(self, cont_id, item):
        slot = self.container_slots[cont_id]
        item_pos = (slot['x'], slot['y'], slot['z'])

        # Update slot for next item (stack vertically)
        slot['z'] += item['height']

        # Deduct volume
        item_vol = item['width'] * item['depth'] * item['height']
        self.containers.at[cont_id, 'remaining_volume'] -= item_vol

        return item_pos

    def place_all_items(self):
        placements = []
        for item_id in self.items.index:
            placement = self.find_placement(item_id)
            if placement:
                cont_id, coords, _ = placement
                placements.append({
                    'item_id': item_id,
                    'container_id': cont_id,
                    'x': coords[0],
                    'y': coords[1],
                    'z': coords[2]
                })
        return placements


In [42]:
# Re-run placement if not already done
placer = CargoPlacer(containers_df, items_df)
placements = placer.place_all_items()

# Convert placements to DataFrame
item_coords_df = pd.DataFrame(placements)

# Optional preview and save
print("✅ Total placed items:", len(item_coords_df))
print(item_coords_df.head())

item_coords_df.to_csv('item_coordinates.csv', index=False)


✅ Total placed items: 118
   item_id container_id  x  y      z
0        1          C01  0  0    0.0
1        2          C01  0  0   19.4
2        3          C01  0  0   63.6
3        4          C04  0  0  106.7
4        5          C04  0  0  123.7


In [43]:
#PREPARE TRAINING DATA
# Extract features and labels
features = items_df.copy()

# Ensure 'container_id' column exists in item_coords_df (the placement result)
features = features.merge(item_coords_df[['item_id', 'container_id']], on='item_id', how='left')

# Drop items that weren’t placed
features = features.dropna(subset=['container_id'])

# Optionally encode zones, priorities, sensitivities if categorical
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
features['container_id'] = label_enc.fit_transform(features['container_id'])  # This is your label (y)


In [44]:
from sklearn.preprocessing import LabelEncoder

# Start from items
features = items_df.copy()

# Merge with placement info
features = features.merge(item_coords_df[['item_id', 'container_id']], on='item_id', how='left')

# Drop items not placed
features = features.dropna(subset=['container_id'])

# Encode container labels for classification
label_enc = LabelEncoder()
features['container_id'] = label_enc.fit_transform(features['container_id'])  # label (y)


In [45]:
#FEATURE ENGINEERING

# Recalculate or rename the 'volume' column, if needed
# Recalculating here to make sure it's up-to-date:
features['volume'] = features['width'] * features['depth'] * features['height']

# Add the sensitivity feature using the same calculation as in earlier steps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Use 'name' column for heuristic similarity
item_names = features['name'].astype(str).values
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(item_names)
similarity_matrix = cosine_similarity(tfidf_matrix)
sensitive_flags = (similarity_matrix < 0.2).sum(axis=1) > 0
features['sensitivity'] = sensitive_flags.astype(int)

# Now select your features
# Include categorical features using one-hot encoding:
from sklearn.preprocessing import OneHotEncoder
# Create a OneHotEncoder object
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # sparse=False for RandomForest

# Encode the 'preferred_zone' column
encoded_zones = encoder.fit_transform(features[['preferred_zone']])
encoded_zones_df = pd.DataFrame(encoded_zones, columns=encoder.get_feature_names_out(['preferred_zone']))

# Concatenate the encoded features with the original DataFrame
features = pd.concat([features, encoded_zones_df], axis=1)


X = features[['volume', 'priority', 'sensitivity'] + list(encoder.get_feature_names_out(['preferred_zone'] ))]  # Add more if needed
y = features['container_id']

# Impute NaN values in 'y' with a placeholder, e.g., -1
y = y.fillna(-1).astype(int)  # or any other suitable value

#TRAINING A CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RandomForestClassifier object
clf = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Make predictions on the test data using the best estimator
y_pred = best_clf.predict(X_test)

# Evaluate the accuracy of the model
print("🎯 Accuracy:", accuracy_score(y_test, y_pred))



🎯 Accuracy: 0.375


In [46]:
# ... (your existing code) ...

# Import XGBoost
import xgboost as xgb

# Create an XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='multi:softmax',  # For multi-class classification
                            num_class=len(label_enc.classes_),  # Number of classes
                            random_state=42)

# Define hyperparameter search space
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Create GridSearchCV object
grid_search = GridSearchCV(xgb_clf, param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best estimator
best_xgb_clf = grid_search.best_estimator_

# Make predictions
y_pred = best_xgb_clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy}")



ValueError: 
All the 1215 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1215 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/xgboost/sklearn.py", line 1559, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6 7 8 9], got [-1  0  1  2  3  4  5  6  7  8]
