#  Walmart Return Optimization project

# Install Libraries

In [3]:
# Install required packages
!pip install pandas numpy scikit-learn lightgbm groq


Defaulting to user installation because normal site-packages is not writeable


 Import all necessary libraries

In [4]:
# Import all necessary libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder , OneHotEncoder

import lightgbm as lgb
import pickle


In [5]:
# Generate synthetic dataset with random lat/lon
n = 2000

data = pd.DataFrame({
    "sku_id": [f"PRD-{i}" for i in range(n)],
    "category": np.random.choice(["Electronics", "Beauty", "Kitchen", "Apparel"], n),
    "original_price": np.random.randint(500, 30000, n),
    "condition_grade": np.random.choice(["A", "B", "C", "D"], n),
    "return_reason_code": np.random.choice(["01", "02", "03", "04"], n),
    "estimated_refurb_cost": np.random.randint(50, 5000, n),
    "resale_value_estimated": np.random.randint(100, 25000, n),
    "inbound_shipping_cost": np.random.randint(20, 500, n),
    "hazardous_goods_flag": np.random.choice(["Yes", "No"], n),
    "co2_saved_refurb_vs_landfill": np.random.uniform(0.1, 5.0, n),
    "final_decision": np.random.choice(
        ["refurbish", "liquidate", "recycle", "keep_it", "donate"],
        n
    ),
    "customer_latitude": np.random.uniform(8, 37, n),
    "customer_longitude": np.random.uniform(68, 97, n)
})

data.to_csv("synthetic_returns_data.csv", index=False)
print(" Synthetic CSV saved.")


 Synthetic CSV saved.


#  Load the Data

In [6]:
# Load the CSV you just generated
df = pd.read_csv("synthetic_returns_data.csv")



In [7]:
print(df.shape)
df.head()

(2000, 13)


Unnamed: 0,sku_id,category,original_price,condition_grade,return_reason_code,estimated_refurb_cost,resale_value_estimated,inbound_shipping_cost,hazardous_goods_flag,co2_saved_refurb_vs_landfill,final_decision,customer_latitude,customer_longitude
0,PRD-0,Apparel,17577,A,1,1151,13588,320,No,4.136504,liquidate,36.175973,78.111838
1,PRD-1,Kitchen,2481,C,4,4053,4363,53,No,1.336327,recycle,26.779193,82.134553
2,PRD-2,Apparel,5870,B,1,4323,7375,296,No,0.505316,donate,26.272797,92.056468
3,PRD-3,Electronics,5080,C,3,3200,8009,314,No,3.611764,refurbish,19.113215,81.718231
4,PRD-4,Electronics,8165,D,3,161,13903,182,No,0.193871,donate,27.430086,70.043635


# Preprocessing

# Encode Categorical Columns

In [8]:
from sklearn.preprocessing import LabelEncoder , OneHotEncoder

In [9]:

categorical_cols = ["category", "condition_grade", 
                    "return_reason_code", "hazardous_goods_flag", "final_decision"]

le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le


# loading of location info csv file 

In [10]:
post_df = pd.read_csv("pincode_with_lat-long.csv")

# Keep only essential columns

post_df = post_df[["OfficeName", "Latitude", "Longitude", "StateName"]]

print(post_df.shape)

post_df.head()

(157126, 4)


  post_df = pd.read_csv("pincode_with_lat-long.csv")


Unnamed: 0,OfficeName,Latitude,Longitude,StateName
0,Peddakotla B.O,14.5689,77.85624,ANDHRA PRADESH
1,Pinnadhari B.O,14.5281,77.857014,ANDHRA PRADESH
2,Yerraguntapalle B.O,14.561111,77.85715,ANDHRA PRADESH
3,Obulareddipalli B.O,14.2488,78.2588,ANDHRA PRADESH
4,Odulapalli B.O,14.24555,78.2477,ANDHRA PRADESH


In [11]:
post_df.tail()

Unnamed: 0,OfficeName,Latitude,Longitude,StateName
157121,Rly Road Meerut SO,28.98,77.68,UTTAR PRADESH
157122,SGMandi SO,28.9724,77.67536,UTTAR PRADESH
157123,W K Road SO,28.99,77.71,UTTAR PRADESH
157124,Kakkoti SO,11.24529,75.778455,KERALA
157125,Kotuvalli SO,11.35,75.91,KERALA


In [12]:
post_df.isnull().sum()

OfficeName       0
Latitude      8838
Longitude     8843
StateName        0
dtype: int64

# Drop rows where lat/lon missing

In [13]:

post_df = post_df.dropna(subset=["Latitude", "Longitude"])

print("Cleaned hubs shape:", post_df.shape)
post_df.head()


Cleaned hubs shape: (148279, 4)


Unnamed: 0,OfficeName,Latitude,Longitude,StateName
0,Peddakotla B.O,14.5689,77.85624,ANDHRA PRADESH
1,Pinnadhari B.O,14.5281,77.857014,ANDHRA PRADESH
2,Yerraguntapalle B.O,14.561111,77.85715,ANDHRA PRADESH
3,Obulareddipalli B.O,14.2488,78.2588,ANDHRA PRADESH
4,Odulapalli B.O,14.24555,78.2477,ANDHRA PRADESH


In [14]:
post_df.isnull().sum()

OfficeName    0
Latitude      0
Longitude     0
StateName     0
dtype: int64

# Compute real-world distance (km) between customer and hub

In [15]:
# import numpy as np

# # Calculate haversine distance in kilometers
# def haversine(lat1, lon1, lat2, lon2):
#     R = 6371
#     lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
#     dlat = lat2 - lat1
#     dlon = lon2 - lon1
#     a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
#     c = 2 * np.arcsin(np.sqrt(a))
#     return R * c


# Compute Nearest Hub Distance----Compute distance from each customer to the nearest hub

In [16]:
# distances = []

# for i, row in df.iterrows():
#     if pd.notnull(row["customer_latitude"]) and pd.notnull(row["customer_longitude"]):

#         lat = float(row["customer_latitude"])
#         lon = float(row["customer_longitude"])
        
#         post_df["dist"] = post_df.apply(
#             lambda hub: haversine(
#                 lat, lon,
#                 hub["Latitude"], hub["Longitude"]
#             ),
#             axis=1
#         )
        
#         min_distance = post_df["dist"].min()
#     else:
#         min_distance = np.nan

#     distances.append(min_distance)

# df["distance_to_nearest_hub"] = distances
# print("✅ Added distance feature.")
# print(df.head())


In [None]:
import numpy as np
import pandas as pd

# -----------------------------
# 1. Clean coordinate strings
# -----------------------------
def clean_latlon(val):
    """
    Convert a string like '21.9161 N' to float,
    flipping sign if S or W.
    """
    if pd.isna(val):
        return np.nan, None   # return None for direction

    val = str(val).strip()

    # Check last character
    if val[-1] in "NSEW":
        num = float(val[:-1].strip())
        if val[-1] in ["S", "W"]:
            return -num, val[-1]
        else:
            return num, val[-1]
    else:
        return float(val), None

# -----------------------------
# 2. Apply cleaning to post_df
# -----------------------------

# Your original post_df:
# e.g. columns: ["OfficeName", "Latitude", "Longitude", ...]

# Create clean numeric lat/lon and store direction letters
post_df["Latitude_clean"], post_df["Latitude_dir"] = zip(*post_df["Latitude"].apply(clean_latlon))
post_df["Longitude_clean"], post_df["Longitude_dir"] = zip(*post_df["Longitude"].apply(clean_latlon))

# -----------------------------
# 3. Define haversine
# -----------------------------
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate distance in km between two lat/lon points.
    """
    R = 6371
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# -----------------------------
# 4. Compute distance from each customer to nearest hub
# -----------------------------

distances = []

for i, row in df.iterrows():
    if pd.notnull(row["customer_latitude"]) and pd.notnull(row["customer_longitude"]):
        lat = float(row["customer_latitude"])
        lon = float(row["customer_longitude"])

        # Compute distance to all hubs
        post_df["dist"] = post_df.apply(
            lambda hub: haversine(
                lat,
                lon,
                hub["Latitude_clean"],
                hub["Longitude_clean"]
            ),
            axis=1
        )
        min_distance = post_df["dist"].min()
    else:
        min_distance = np.nan

    distances.append(min_distance)

df["distance_to_nearest_hub"] = distances

# -----------------------------
# 5. Show results
# -----------------------------
print("✅ Added distance feature.")
print(df.head())

# -----------------------------
# Optional - show N/S/E/W
# -----------------------------
# For debugging:
print(post_df[["OfficeName", "Latitude", "Latitude_dir", "Longitude", "Longitude_dir"]].head())


ValueError: could not convert string to float: '31.4398200-'

: 