<a href="https://colab.research.google.com/github/casselscott/API-Data-Axios-/blob/main/Product_Sales_by_Locations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install folium
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
#import webbrowser # No need to open in web browser
from IPython.display import display, IFrame, HTML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, silhouette_score
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
import io

# Load processed sales data
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df

# Geocode locations with retry logic
def geocode_location(location):
    geolocator = Nominatim(user_agent="sales_prediction")
    try:
        geo = geolocator.geocode(location, timeout=10)
        if geo:
            return geo.latitude, geo.longitude
    except GeocoderTimedOut:
        time.sleep(1)
        return geocode_location(location)
    return None, None

# Preprocess data
def preprocess_data(df):
    df = df.dropna()

    if 'Latitude' not in df.columns or 'Longitude' not in df.columns:
        print("Geocoding locations...")
        df[['Latitude', 'Longitude']] = df['Location'].apply(lambda loc: pd.Series(geocode_location(loc)))
        print("Geocoding complete.")

    df.dropna(subset=['Latitude', 'Longitude'], inplace=True)

    print("Sample data after geocoding:")
    print(df[['Location', 'Latitude', 'Longitude']].head())

    le = LabelEncoder()
    # Encode the 'Sales_Category' column before splitting into X and y
    df['Sales_Category_Encoded'] = le.fit_transform(df['Sales_Category'])
    df['Location_Encoded'] = le.fit_transform(df['Location'])
    X = df[['Latitude', 'Longitude', 'Sales']]
    # Use the encoded 'Sales_Category' as the target variable
    y = df['Sales_Category_Encoded']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y, df, le

# Train Random Forest model
def train_random_forest(X_train, y_train):
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    return rf_model

# Train XGBoost model
def train_xgboost(X_train, y_train):
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    xgb_model.fit(X_train, y_train)
    return xgb_model

# Train K-Means clustering model
def train_kmeans(X):
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X)
    return kmeans

# Evaluate models
def evaluate_models(rf_model, xgb_model, kmeans, X_test, y_test, X_scaled):
    rf_pred = rf_model.predict(X_test)
    xgb_pred = xgb_model.predict(X_test)
    kmeans_pred = kmeans.labels_

    rf_acc = accuracy_score(y_test, rf_pred)
    xgb_acc = accuracy_score(y_test, xgb_pred)
    kmeans_silhouette = silhouette_score(X_scaled, kmeans_pred)

    print(f"Random Forest Accuracy: {rf_acc}")
    print(f"XGBoost Accuracy: {xgb_acc}")
    print(f"K-Means Silhouette Score: {kmeans_silhouette}")

    best_model = max([(rf_acc, 'Random Forest'), (xgb_acc, 'XGBoost'), (kmeans_silhouette, 'K-Means')], key=lambda x: x[0])
    print(f"Best Model: {best_model[1]}")
    return best_model[1]

# Predict sales categories for all locations
def predict_sales_categories(model, X, df, le):
    df['Predicted_Sales_Category'] = model.predict(X)
    df['Predicted_Sales_Category'] = df['Predicted_Sales_Category'].map({0: 'Low', 1: 'Medium', 2: 'High'})
    df['Decoded_Location'] = le.inverse_transform(df['Location_Encoded'])
    return df

import folium
from folium.plugins import MarkerCluster

def visualize_results(df):
    m = folium.Map(location=[20, 0], zoom_start=2)

    # Aggregate total sales per location
    sales_by_location = df.groupby(['Decoded_Location', 'Latitude', 'Longitude', 'Predicted_Sales_Category'])['Sales'].sum().reset_index()
    max_sales = sales_by_location['Sales'].max()

    # Define colors for different sales categories
    category_colors = {'High': 'red', 'Medium': 'orange', 'Low': 'blue'}

    # Add a marker cluster for better visualization
    marker_cluster = MarkerCluster().add_to(m)

    for _, row in sales_by_location.iterrows():
        lat, lon = row['Latitude'], row['Longitude']
        total_sales = row['Sales']
        location = row['Decoded_Location']
        category = row['Predicted_Sales_Category']

        # Set marker color based on sales category
        color = category_colors.get(category, 'gray')

        # Scale circle size based on total sales (normalized)
        radius = (total_sales / max_sales) * 50  # Normalize sizes

        # Add sales circle
        folium.CircleMarker(
            location=[lat, lon],
            radius=radius,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
            popup=f"Location: {location}\nSales: {total_sales}\nCategory: {category}",
              # Add the icon here
            icon=folium.Icon(color="blue", icon="map-marker", prefix='fa')
        ).add_to(marker_cluster)

        # Annotate country name on the map
        folium.Marker(
            location=[lat, lon],
            icon=folium.DivIcon(
                html=f"""
                <div style="font-size: 10px; color: black; background: rgba(255,255,255,0.6); padding: 2px; border-radius: 3px;">
                    {location}
                </div>
                """
            )
        ).add_to(m)

    # Render the map to an in-memory buffer
    f = io.BytesIO()  # Create a BytesIO object
    m.save(f, close_file=False)  # Save the map to the buffer
    html_string = f.getvalue().decode('utf-8')  # Get the HTML content as a string
    display(HTML(m._repr_html_()))  # Display using HTML in Jupyter
    # display(HTML(m._repr_html_()))

# Main execution
def main(file_path):
    df = load_data(file_path)
    X_scaled, y, df, le = preprocess_data(df)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    rf_model = train_random_forest(X_train, y_train)
    xgb_model = train_xgboost(X_train, y_train)
    kmeans_model = train_kmeans(X_scaled)

    best_model_name = evaluate_models(rf_model, xgb_model, kmeans_model, X_test, y_test, X_scaled)
    best_model = rf_model if best_model_name == 'Random Forest' else xgb_model if best_model_name == 'XGBoost' else kmeans_model

    df = predict_sales_categories(best_model, X_scaled, df, le)
    visualize_results(df)
    return best_model

# Run script
if __name__ == "__main__":
    file_path = "processed_sales_data.csv"
    best_model = main(file_path)


Geocoding locations...
Geocoding complete.
Sample data after geocoding:
   Location   Latitude   Longitude
0    BERLIN  52.510885   13.398937
1    LONDON  51.507446   -0.127765
2  NEW YORK  40.712728  -74.006015
3     PARIS  48.858890    2.320041
4     TOKYO  35.676860  139.763895


Parameters: { "use_label_encoder" } are not used.



Random Forest Accuracy: 1.0
XGBoost Accuracy: 1.0
K-Means Silhouette Score: 0.4422085293289516
Best Model: Random Forest
