# 5_Modeling: Shipping Cost Predictor

# Modeling Process

- Explore 4 regression models, 1 as baseline and the others more advanced
- Train test splits
- Apply cross-validation and hyperparameter tuning in at least 2 models
- Get the feature importance and see which feature is more relevant
- Apply cross validation for evaluation
- Draw a table comparing the performances of each model. Example:

# Model Comparison

In this section, we compare the performance of different models using various metrics. The models evaluated are Linear Regression, Random Forest, Support Vector Regression (SVR), K-Nearest Neighbors, and Gradient Boosting. The following metrics are used for comparison:

- **Mean Squared Error (MSE)**: The average of the squared differences between predicted and actual values. Large errors are penalized more heavily than small ones.
- **Root Mean Squared Error (RMSE)**: The square root of the MSE. It is expressed in the same units as the target, which makes it easier to interpret.
- **R^2 Score**: The proportion of the target's variance explained by the model. A value of 1 is a perfect fit, 0 is equivalent to always predicting the mean, and negative values are worse than predicting the mean.

Classification metrics such as Accuracy, Precision, Recall, F1 Score, and ROC-AUC are not used here because the target (shipping cost) is a continuous value.

# Modeling Output

The output of this stage will be pickle files for future integration with the application. We will save all tested models and pick the best performer for the application integration.

In [6]:
# Imports
import pandas as pd
import numpy as np
import pandas as pd
import warnings
import pickle
import os
import boto3
from io import StringIO, BytesIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
# Ignore data conversion warnings
warnings.simplefilter(action='ignore')

#### Loading Data from Feature Engineering

In [22]:
# Load variables from .env file, ignoring lines without '='
def load_env_variables(env_file='../.env'):
    """Load ``KEY=VALUE`` pairs from a .env file into ``os.environ``.

    The file is looked up at ``<cwd>/../<env_file>``; with the default
    argument this resolves to two directories above the working directory.
    An absolute ``env_file`` is used as-is.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Whitespace around keys and values is stripped. Only key names
    are echoed: the file may hold credentials, and notebook outputs are
    saved together with the notebook.

    Args:
        env_file: Path of the .env file, relative to the parent of the
            current working directory.

    Returns:
        None. A missing file is reported and otherwise ignored.
    """
    env_path = os.path.join(os.getcwd(), '..', env_file)

    print(f"Looking for .env file at: {env_path}")  # Debugging output

    if not os.path.exists(env_path):
        print(f".env file does not exist at: {env_path}")
        return

    with open(env_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            # Split on the first '=' only so values may themselves contain '='.
            key, value = (part.strip() for part in line.split('=', 1))
            if not key:
                continue  # "=value" carries no usable variable name
            os.environ[key] = value
            print(f"Loaded {key}")  # never echo the value: it may be a secret

# Load environment variables
load_env_variables()

Looking for .env file at: c:\repos\ai-logistics\notebooks\ShipCostPredictor\..\../.env
Loaded BUCKET_NAME_INBOUND=logimo-inbound
Loaded BUCKET_NAME_ALIGNED=logimo-aligned
Loaded BUCKET_NAME_OUTBOUND=logimo-outbound
Loaded PREFIX_KEY=ship_cost_predictor


In [23]:
# S3 client plus the bucket/prefix that locate the input file.
s3 = boto3.client('s3')
bucket_name = os.getenv('BUCKET_NAME_ALIGNED')
prefix = os.getenv('PREFIX_KEY')

# Download regression_model_X_pca.csv and decode its bytes as UTF-8 text.
obj = s3.get_object(
    Bucket=bucket_name,
    Key=f'{prefix}/regression_model_X_pca.csv',
)
csv_content = obj['Body'].read().decode('utf-8')

# Parse the CSV text into the feature DataFrame.
X_pca = pd.read_csv(StringIO(csv_content))

In [24]:
# Create an S3 client
s3 = boto3.client('s3')

# Specify the bucket name and prefix (folder path)
bucket_name = os.getenv('BUCKET_NAME_ALIGNED')
prefix = os.getenv('PREFIX_KEY')

# Fetch the content of the target CSV file from S3
obj = s3.get_object(Bucket=bucket_name,
                    Key=f'{prefix}/regression_model_Y_pca.csv')

# Read the content of the CSV file
csv_content = obj['Body'].read().decode('utf-8')

# Parse the CSV. squeeze("columns") turns a single-column frame into a Series,
# so estimators receive a 1-D target instead of a column vector; a frame with
# several columns is returned unchanged.
y = pd.read_csv(StringIO(csv_content)).squeeze("columns")

**Baseline Model**

In [29]:
# Disabled alternative: read the inputs from local CSV files instead of S3.
# NOTE(review): presumably the same data as the S3 objects loaded above — confirm or remove.
#X_pca=pd.read_csv('../../data/shipcost_predictor/outbound/supply_chain_X_pca.csv')
#y=pd.read_csv('../../data/shipcost_predictor/outbound/supply_chain_Y_pca.csv')

In [30]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Baseline: linear regression fitted on the training split.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Hold-out metrics for the baseline.
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, y_pred_lr)

for metric_label, metric_value in (('RMSE', lr_rmse), ('R2', lr_r2)):
    print(f'Linear Regression {metric_label}: {metric_value}')

Linear Regression RMSE: 30.034397689991362
Linear Regression R2: -0.030678180531177768


---

**Other Models**

In [32]:
# Seed shared by the stochastic estimators so that metrics are repeatable
# between runs (Random Forest and Gradient Boosting are randomized).
RANDOM_STATE = 42

# Models to try for the regression
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'SVR': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
}

# Make sure the output folder exists before the first pickle is written.
model_dir = '../../models/price_predictor'
os.makedirs(model_dir, exist_ok=True)

# Fit each model, score it on the hold-out split and persist it.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # same units as the target
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse,
        'R^2 Score': r2
    }

    # Save the model as a pickle file, e.g. "random_forest_model.pkl"
    model_path = f"{model_dir}/{model_name.replace(' ', '_').lower()}_model.pkl"
    with open(model_path, 'wb') as file:
        pickle.dump(model, file)

# Report the metrics of every model.
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Mean Squared Error: {metrics['Mean Squared Error']}")
    print(f"Root Mean Squared Error: {metrics['Root Mean Squared Error']}")
    print(f"R^2 Score: {metrics['R^2 Score']}\n")


Model: Linear Regression
Mean Squared Error: 902.0650446005585
Root Mean Squared Error: 30.034397689991362
R^2 Score: -0.030678180531177768

Model: Random Forest
Mean Squared Error: 893.4643285847767
Root Mean Squared Error: 29.890873667137544
R^2 Score: -0.02085120587178757

Model: SVR
Mean Squared Error: 877.3263591929172
Root Mean Squared Error: 29.619695460840195
R^2 Score: -0.002412343807650208

Model: K-Nearest Neighbors
Mean Squared Error: 1001.7053044245808
Root Mean Squared Error: 31.64972834677386
R^2 Score: -0.14452478429637772

Model: Gradient Boosting
Mean Squared Error: 905.0747560094007
Root Mean Squared Error: 30.084460374243058
R^2 Score: -0.03411700558859154



The table below summarizes the metrics printed above:

## Model Metrics

| **Model**            | **MSE** | **RMSE** | **R^2 Score** |
|----------------------|---------|----------|---------------|
| **Linear Regression**| 902.07  | 30.03    | -0.03         |
| **Random Forest**    | 893.46  | 29.89    | -0.02         |
| **SVR**              | 877.33  | 29.62    | -0.002        |
| **KNN**              | 1001.71 | 31.65    | -0.14         |
| **Gradient Boosting**| 905.07  | 30.08    | -0.03         |

### Interpretation of Results and Conclusion:

Looking at the results, all models seem to perform poorly in predicting shipping prices for the supply chain application. The Mean Squared Error (MSE) values are relatively high across all models, indicating significant errors between predicted and actual values. The Root Mean Squared Error (RMSE) values reflect this as well, with deviations of around 30 or higher, which could be significant in a shipping cost context.

Furthermore, the R^2 scores are all negative, indicating that the models are performing worse than a simple horizontal line at the mean of the data would. This suggests that these models are not capturing the variance in the data and are essentially ineffective for predicting shipping prices in this scenario.

Despite using a variety of algorithms, including Linear Regression, Random Forest, Support Vector Regression (SVR), K-Nearest Neighbors (KNN), and Gradient Boosting, the predictions are off by roughly 30 units (RMSE) from the real price for every model.

Possible next steps could involve:
1. **Feature Engineering**: Refining or adding features that might better capture the nuances of shipping costs.
2. **Model Tuning**: Adjusting hyperparameters or trying different algorithms that might perform better on this specific problem.
3. **Data Collection**: Ensuring that the dataset adequately represents the factors influencing shipping prices.
4. **Domain Expertise**: Consulting with domain experts to better understand the factors at play and refine the modeling approach accordingly.

In conclusion, further exploration and refinement are needed to develop a more accurate predictive model for shipping prices in this supply chain context.

In [39]:
# Uploading models to S3 bucket for classification
# NOTE(review): the folder and key prefix below point at *classification* artefacts, while the
# regression pickles in this notebook are written to ../../models/price_predictor — confirm intent.

# Create an S3 client
s3 = boto3.client('s3')

# Specify the directory containing the files
directory_path = '../../models/price_predictor/classification'
# Prefer the bucket configured in .env; fall back to the previously hard-coded name.
output_bucket_name = os.getenv('BUCKET_NAME_OUTBOUND', 'logimo-outbound')
# Kept separate from `prefix` (the PREFIX_KEY value used by the data-loading cells) so that
# re-running those cells after this one still reads from the right location.
models_prefix = 'ship_cost_predictor/classification_models_files'

uploaded_count = 0

# Iterate over all entries in the directory (sorted for a stable log order)
for filename in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, filename)

    # Ensure it's a file (not a directory)
    if not os.path.isfile(file_path):
        continue

    # Create the S3 key (path) for the file
    output_prefix = f'{models_prefix}/{filename}'

    # Hand the open file to S3 directly instead of copying it through memory buffers
    with open(file_path, 'rb') as f:
        s3.put_object(Bucket=output_bucket_name, Key=output_prefix, Body=f)

    uploaded_count += 1
    print(f"File {filename} saved to s3://{output_bucket_name}/{output_prefix}")

if uploaded_count:
    print("All files have been uploaded successfully.")
else:
    print(f"No files found in {directory_path}; nothing was uploaded.")


File logistic_regression_model.pkl saved to s3://logimo-outbound/ship_cost_predictor/classification_models_files/logistic_regression_model.pkl
File neural_network_model.h5 saved to s3://logimo-outbound/ship_cost_predictor/classification_models_files/neural_network_model.h5
File random_forest_model.pkl saved to s3://logimo-outbound/ship_cost_predictor/classification_models_files/random_forest_model.pkl
File scaler.pkl saved to s3://logimo-outbound/ship_cost_predictor/classification_models_files/scaler.pkl
File svm_model.pkl saved to s3://logimo-outbound/ship_cost_predictor/classification_models_files/svm_model.pkl
All files have been uploaded successfully.


In [2]:
%pip install sqlalchemy

Collecting sqlalchemy
  Downloading SQLAlchemy-2.0.30-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy)
  Downloading greenlet-3.0.3-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Downloading SQLAlchemy-2.0.30-cp311-cp311-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   - -------------------------------------- 0.1/2.1 MB 1.1 MB/s eta 0:00:02
   ------- -------------------------------- 0.4/2.1 MB 3.3 MB/s eta 0:00:01
   ---------------- ----------------------- 0.9/2.1 MB 5.4 MB/s eta 0:00:01
   -------------------------------- ------- 1.7/2.1 MB 8.3 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 8.3 MB/s eta 0:00:00
Downloading greenlet-3.0.3-cp311-cp311-win_amd64.whl (292 kB)
   ---------------------------------------- 0.0/292.8 kB ? eta -:--:--
   --------------------------------------- 292.8/292.8 kB 18.8 MB/s

In [4]:
%pip install pymysql

Collecting pymysql
  Downloading PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Downloading PyMySQL-1.1.1-py3-none-any.whl (44 kB)
   ---------------------------------------- 0.0/45.0 kB ? eta -:--:--
   --------- ------------------------------ 10.2/45.0 kB ? eta -:--:--
   ------------------------------------ --- 41.0/45.0 kB 495.5 kB/s eta 0:00:01
   ---------------------------------------- 45.0/45.0 kB 373.5 kB/s eta 0:00:00
Installing collected packages: pymysql
Successfully installed pymysql-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
import boto3
import pandas as pd
import pickle
import pymysql
from sqlalchemy import create_engine

In [8]:
def load_env_variables(env_file='../.env'):
    """Load ``KEY=VALUE`` pairs from a .env file into ``os.environ``.

    NOTE(review): this re-declares the helper already defined in the
    data-loading section of this notebook; keep a single definition.

    The file is looked up at ``<cwd>/../<env_file>``; with the default
    argument this resolves to two directories above the working directory.
    An absolute ``env_file`` is used as-is.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Whitespace around keys and values is stripped. Only key names
    are echoed: the file may hold credentials (this section reads AWS and
    RDS secrets from the environment), and notebook outputs are saved
    together with the notebook.

    Args:
        env_file: Path of the .env file, relative to the parent of the
            current working directory.

    Returns:
        None. A missing file is reported and otherwise ignored.
    """
    env_path = os.path.join(os.getcwd(), '..', env_file)

    print(f"Looking for .env file at: {env_path}")  # Debugging output

    if not os.path.exists(env_path):
        print(f".env file does not exist at: {env_path}")
        return

    with open(env_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            # Split on the first '=' only so values may themselves contain '='.
            key, value = (part.strip() for part in line.split('=', 1))
            if not key:
                continue  # "=value" carries no usable variable name
            os.environ[key] = value
            print(f"Loaded {key}")  # never echo the value: it may be a secret

# Load environment variables
load_env_variables()

Looking for .env file at: c:\repos\ai-logistics\notebooks\ShipCostPredictor\..\../.env
Loaded BUCKET_NAME_INBOUND=logimo-inbound
Loaded BUCKET_NAME_ALIGNED=logimo-aligned
Loaded BUCKET_NAME_OUTBOUND=logimo-outbound
Loaded PREFIX_KEY=ship_cost_predictor


In [17]:
import os
import boto3
import pandas as pd
import pickle
import pymysql
from sqlalchemy import create_engine

In [18]:
# AWS credentials and region, read from the environment (None when unset).
aws_access_key_id, aws_secret_access_key, region_name = (
    os.environ.get(var_name)
    for var_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION')
)

# Input CSV: bucket from the environment, object key still a placeholder.
csv_bucket_name = os.environ.get('BUCKET_NAME_INBOUND')
csv_file_key = 'path/to/your/csvfile.csv'

# Pickled model: bucket from the environment, object key still a placeholder.
model_bucket_name = os.environ.get('BUCKET_NAME_ALIGNED')
model_file_key = 'path/to/your/model.pkl'

In [19]:
# MySQL (RDS) connection settings and target table, read from the environment.
# Each value is a string, or None when the variable is not set.
(
    rds_host,
    rds_port,
    rds_dbname,
    rds_user,
    rds_password,
    table_name,
) = (
    os.environ.get(var_name)
    for var_name in (
        'RDS_HOST',
        'RDS_PORT',
        'RDS_DBNAME',
        'RDS_USER',
        'RDS_PASSWORD',
        'TABLE_NAME',
    )
)

In [20]:
# Build the S3 client from the credentials and region read from the environment.
s3_client_options = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'region_name': region_name,
}
s3 = boto3.client('s3', **s3_client_options)

In [21]:
# Specify the bucket name and prefix (folder path)
bucket_name = os.getenv('BUCKET_NAME_INBOUND')
prefix = os.getenv('PREFIX_KEY')

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# result page instead of reading only the first response.
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=f'{prefix}/data-ingest')

# Collect the keys of all CSV objects under the prefix. Pages without a
# 'Contents' entry (nothing matched) contribute no keys.
csv_files = [
    item['Key']
    for page in pages
    for item in page.get('Contents', [])
    if item['Key'].endswith('.csv')
]

In [22]:
# Display the CSV keys collected in the previous cell.
csv_files

[]

In [None]:
# Fetch the input object from S3 and let pandas parse the streamed body.
csv_obj = s3.get_object(
    Bucket=csv_bucket_name,
    Key=csv_file_key,
)
csv_data = csv_obj['Body']
df = pd.read_csv(csv_data)

In [None]:
# Load model from S3
from sqlalchemy.engine import URL  # local import: sqlalchemy is installed further up in this section

# SECURITY: unpickling executes arbitrary code embedded in the payload. Only
# load objects from a bucket whose writers are trusted.
model_obj = s3.get_object(Bucket=model_bucket_name, Key=model_file_key)
model_body = model_obj['Body'].read()
model = pickle.loads(model_body)

# Make predictions. The input frame is left untouched so this cell can be
# re-run: adding the column to `df` itself would feed 'prediction' back into
# model.predict on the next execution.
predictions = model.predict(df)
df_scored = df.assign(prediction=predictions)

# Build the connection URL from its parts so that special characters in the
# user name or password (e.g. '@', ':', '/') are escaped correctly.
connection_url = URL.create(
    drivername='mysql+pymysql',
    username=rds_user,
    password=rds_password,
    host=rds_host,
    port=int(rds_port) if rds_port else None,  # env values are strings
    database=rds_dbname,
)
engine = create_engine(connection_url)

# Write to MySQL database, releasing pooled connections even if the write fails
try:
    df_scored.to_sql(name=table_name, con=engine, if_exists='append', index=False)
finally:
    engine.dispose()

print('Data successfully written to the RDS MySQL instance.')

In [None]:
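# Entries expected in the .env file for the RDS connection
# (template only - replace the placeholder values; this is not Python code):
# RDS_HOST=your-rds-endpoint
# RDS_PORT=3306
# RDS_DBNAME=your-db-name
# RDS_USER=your-username
# RDS_PASSWORD=your-password
# TABLE_NAME=your-table-name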