# Imports

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.7/772.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162678 sha256=731e00b1b1d8d8c8a20d7b177c2fe5f948b7db12c9e3791a7e8c5ffaed3fe320
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scik

In [None]:
import pandas as pd
import surprise
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import Binarizer
from surprise import SVD, Dataset, KNNBasic, Reader, accuracy
from surprise.model_selection import GridSearchCV, KFold, cross_validate, train_test_split


# Read data

In [None]:
# Load the CSV export and discard its unnamed leading column
# (presumably an index saved along with the data -- confirm against the export step).
df = pd.read_csv("df2.csv").drop(columns="Unnamed: 0")
df.columns

Index(['MM.code', 'Short.description', 'Deleted.', 'Stock.Non.stock',
       'Material.type.description', 'ABC.indicator', 'Criticality',
       'Repairable.Not.Repairable', 'BOM.Linkage', 'Total.Installed.Qty',
       'Criticality.based.on.Location',
       'Installed.QTY.under.Critical.Equipment',
       'Plant.section.of.installed.equipment', 'PSGC.Descrp.1',
       'PSGC.Descrp.2', 'Discipline', 'Category', 'MM.Group',
       'Disciplines.for.Stock.changes', 'OSR', 'PDT', 'GR', 'Sap.lead.time',
       'AVG.historical.LT', 'MAX.historical.LT', 'MIN.historical.LT',
       'Last.PO.number', 'Last.PO.Date', 'PO.number', 'Manufacturer.number',
       'Vendor', 'OA', 'OA.start.date', 'OA.end.date', 'OA.validity',
       'Consumption..blank..2010.2021',
       'Consumed.under.High.critical.WO.share', 'Emergency', 'DISP.Qty.',
       'DISP.Value.USD.', 'Total.Disposal.Qty.', 'Final.Disposal.Value.USD.'],
      dtype='object')

# Create Vendor-Category Matrix

In [None]:
# One row per (Vendor, Category) pair holding the summed installed quantity.
df_grouped = (
    df.groupby(["Vendor", "Category"], as_index=False)["Total.Installed.Qty"]
    .sum()
)
df_grouped

Unnamed: 0,Vendor,Category,Total.Installed.Qty
0,0,0,41.0
1,0,Chemicals,118682.0
2,0,Civil,31597.0
3,0,Electrical,77950.0
4,0,Gaskets,6373.0
...,...,...,...
1186,498,Office Stationery,0.0
1187,499,Mechanical,0.0
1188,500,Mechanical,2.0
1189,501,Civil,0.0


# Collaborative Filtering: KNN Basic

In [None]:
# Work on an explicit copy: assigning a column into a column-subset of df_grouped
# triggers pandas' SettingWithCopyWarning.
surprise_df = df_grouped[['Vendor', 'Category', 'Total.Installed.Qty']].copy()

# Binarize the 'Total.Installed.Qty' column: values above 0.5 become 1, everything else 0.
# Binarizer expects a 2-D (n_samples, n_features) array, hence the reshape / ravel pair.
binarizer = Binarizer(threshold=0.5)
surprise_df['Total.Installed.Qty'] = binarizer.fit_transform(
    surprise_df['Total.Installed.Qty'].to_numpy().reshape(-1, 1)
).ravel()

# Create a Reader object specifying the scale of the ratings (binary 0/1 here)
reader = Reader(rating_scale=(0, 1))

# Load the (Vendor, Category, rating) frame into a Surprise dataset
data = Dataset.load_from_df(surprise_df, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7eeade37bc40>

In [None]:
# Display the vendor/category frame that is fed to Surprise
surprise_df

Unnamed: 0,Vendor,Category,Total.Installed.Qty
0,0,0,1.0
1,0,Chemicals,1.0
2,0,Civil,1.0
3,0,Electrical,1.0
4,0,Gaskets,1.0
...,...,...,...
1186,498,Office Stationery,0.0
1187,499,Mechanical,0.0
1188,500,Mechanical,1.0
1189,501,Civil,0.0


In [None]:
# Work on an explicit copy: assigning a column into a column-subset of df_grouped
# triggers pandas' SettingWithCopyWarning.
surprise_df = df_grouped[['Vendor', 'Category', 'Total.Installed.Qty']].copy()

# Binarize the 'Total.Installed.Qty' column: values above 0.5 become 1, everything else 0.
# Binarizer expects a 2-D (n_samples, n_features) array, hence the reshape / ravel pair.
binarizer = Binarizer(threshold=0.5)
surprise_df['Total.Installed.Qty'] = binarizer.fit_transform(
    surprise_df['Total.Installed.Qty'].to_numpy().reshape(-1, 1)
).ravel()

# Create a Reader object specifying the scale of the ratings (binary 0/1 here)
reader = Reader(rating_scale=(0, 1))

# Load the (Vendor, Category, rating) frame into a Surprise dataset
data = Dataset.load_from_df(surprise_df, reader)

# Hold out 20% of the (Vendor, Category) rows for testing; seeded for a repeatable split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Item-based KNN: user_based=False makes similarities be computed between
# categories (the "items") rather than between vendors (the "users")
model = KNNBasic(sim_options={'user_based': False})

# Fit the model on the training set
model.fit(trainset)

# Predict the held-out rows; evaluated further down in this cell
predictions = model.test(testset)

# Function to get top vendors for a given category
def get_top_vendors(category, n=5):
    """Rank the vendors listed under ``category`` by the model's estimated rating.

    Only vendors that have a row for ``category`` in the module-level
    ``surprise_df`` are scored; each one is passed to the module-level
    ``model``'s ``predict`` together with the category.

    Args:
        category: Value of the 'Category' column to rank vendors for.
        n: Maximum number of predictions to return.

    Returns:
        The first ``n`` prediction objects in descending order of their ``est``
        attribute; ties keep the order in which the vendors first appear in
        ``surprise_df``.
    """
    in_category = surprise_df['Category'] == category
    candidate_vendors = surprise_df.loc[in_category, 'Vendor'].unique()

    # Score every candidate vendor against the requested category
    scored = []
    for vendor_id in candidate_vendors:
        scored.append(model.predict(vendor_id, category))

    # Stable in-place sort, highest estimate first
    scored.sort(key=lambda prediction: prediction.est, reverse=True)
    return scored[:n]

# Demo: the three best-scoring vendors for the 'Mechanical' category
top_vendors_mechanical = get_top_vendors(category='Mechanical', n=3)
print(top_vendors_mechanical)

# Threshold the estimated ratings at 0.5 to obtain hard 0/1 predictions
binary_predictions = [int(pred.est > 0.5) for pred in predictions]

# Observed 0/1 ratings of the held-out rows
true_labels = [int(pred.r_ui) for pred in predictions]

# Surprise's accuracy helpers print each metric as they compute it
rmse, fcp, mae = (
    accuracy.rmse(predictions),
    accuracy.fcp(predictions),
    accuracy.mae(predictions),
)

# Classification-style metrics from scikit-learn on the thresholded predictions
precision = precision_score(y_true=true_labels, y_pred=binary_predictions)
recall = recall_score(y_true=true_labels, y_pred=binary_predictions)
f1 = f1_score(y_true=true_labels, y_pred=binary_predictions)

# Report the classification metrics, one per line
print(f"Precision: {precision}", f"Recall: {recall}", f"F1 Score: {f1}", sep="\n")


Computing the msd similarity matrix...
Done computing similarity matrix.
[Prediction(uid=0, iid='Mechanical', r_ui=None, est=1, details={'actual_k': 14, 'was_impossible': False}), Prediction(uid=1, iid='Mechanical', r_ui=None, est=1, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid=8, iid='Mechanical', r_ui=None, est=1, details={'actual_k': 2, 'was_impossible': False})]
RMSE: 0.5198
FCP:  0.3390
MAE:  0.4173
Precision: 0.6236559139784946
Recall: 0.7785234899328859
F1 Score: 0.6925373134328358


# Matrix Factorization: SVD

In [None]:
# Rebuild the binary rating frame from df_grouped so this cell is idempotent and
# does not assign into a slice (which triggers pandas' SettingWithCopyWarning).
surprise_df = df_grouped[['Vendor', 'Category', 'Total.Installed.Qty']].copy()

# Binarize the 'Total.Installed.Qty' column: values above 0.5 become 1, everything else 0.
# Binarizer expects a 2-D (n_samples, n_features) array, hence the reshape / ravel pair.
binarizer = Binarizer(threshold=0.5)
surprise_df['Total.Installed.Qty'] = binarizer.fit_transform(
    surprise_df['Total.Installed.Qty'].to_numpy().reshape(-1, 1)
).ravel()

# Create a Reader object specifying the scale of the ratings (binary 0/1 here)
reader = Reader(rating_scale=(0, 1))

# Load the (Vendor, Category, rating) frame into a Surprise dataset
data = Dataset.load_from_df(surprise_df, reader)

# Hold out 20% of the rows for testing; seeded for a repeatable split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# SVD with default hyper-parameters. random_state seeds the factor initialisation
# and epoch shuffling so the reported metrics are reproducible across runs.
model = SVD(random_state=42)

# Fit the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model using Surprise's accuracy metrics (each call prints its value)
rmse = accuracy.rmse(predictions)
fcp = accuracy.fcp(predictions)
mae = accuracy.mae(predictions)

# Convert estimated ratings to hard 0/1 predictions for classification metrics
binary_predictions = [1 if pred.est > 0.5 else 0 for pred in predictions]

# Extract true labels from the test set
true_labels = [int(pred.r_ui) for pred in predictions]

# Calculate precision, recall, and F1 score using scikit-learn
precision = precision_score(true_labels, binary_predictions)
recall = recall_score(true_labels, binary_predictions)
f1 = f1_score(true_labels, binary_predictions)

# Print the evaluation metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


RMSE: 0.4441
FCP:  0.6486
MAE:  0.3963
Precision: 0.7189189189189189
Recall: 0.8926174496644296
F1 Score: 0.7964071856287426


# Matrix Factorization: SVD Grid Search

In [None]:
# Rebuild the binary rating frame from df_grouped so this cell is idempotent and
# does not assign into a slice (which triggers pandas' SettingWithCopyWarning).
surprise_df = df_grouped[['Vendor', 'Category', 'Total.Installed.Qty']].copy()

# Binarize the 'Total.Installed.Qty' column: values above 0.5 become 1, everything else 0.
# Binarizer expects a 2-D (n_samples, n_features) array, hence the reshape / ravel pair.
binarizer = Binarizer(threshold=0.5)
surprise_df['Total.Installed.Qty'] = binarizer.fit_transform(
    surprise_df['Total.Installed.Qty'].to_numpy().reshape(-1, 1)
).ravel()

# Create a Reader object specifying the scale of the ratings (binary 0/1 here)
reader = Reader(rating_scale=(0, 1))

# Load the (Vendor, Category, rating) frame into a Surprise dataset
data = Dataset.load_from_df(surprise_df, reader)

# Hold out 20% of the rows for testing; seeded for a repeatable split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Hyper-parameters must be tuned on training rows only: running the grid search
# on `data` would let the held-out rows influence the choice and bias the test
# metrics. GridSearchCV needs a Dataset, so build one from the rows that are not
# in `testset` ((Vendor, Category) pairs are unique after the groupby).
held_out_pairs = {(uid, iid) for uid, iid, _ in testset}
is_train_row = [
    pair not in held_out_pairs
    for pair in zip(surprise_df['Vendor'], surprise_df['Category'])
]
train_data = Dataset.load_from_df(surprise_df.loc[is_train_row], reader)

# Parameter grid for the search; the single random_state value seeds every
# candidate SVD so the search outcome is reproducible.
param_grid = {
    'n_epochs': [5, 10, 20],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.2],
    'random_state': [42],
}

# Grid search with 3-fold cross-validation on seeded folds (training rows only)
grid_search = GridSearchCV(
    SVD, param_grid, measures=['rmse'], cv=KFold(n_splits=3, random_state=42)
)
grid_search.fit(train_data)

# Best parameter combination according to mean cross-validated RMSE
best_params = grid_search.best_params['rmse']

# Create the SVD model with the best parameters (includes the fixed random_state)
best_svd = SVD(**best_params)

# Fit the model on the training set
best_svd.fit(trainset)

# Make predictions on the test set
predictions = best_svd.test(testset)

# Evaluate the model using Surprise's accuracy metrics (each call prints its value)
rmse = accuracy.rmse(predictions)
fcp = accuracy.fcp(predictions)
mae = accuracy.mae(predictions)

# Convert estimated ratings to hard 0/1 predictions for classification metrics
binary_predictions = [1 if pred.est > 0.5 else 0 for pred in predictions]

# Extract true labels from the test set
true_labels = [int(pred.r_ui) for pred in predictions]

# Calculate precision, recall, and F1 score using scikit-learn
precision = precision_score(true_labels, binary_predictions)
recall = recall_score(true_labels, binary_predictions)
f1 = f1_score(true_labels, binary_predictions)

# Print the best parameters and evaluation metrics
print(f"Best Parameters: {best_params}")
print(f"RMSE with Best Parameters: {rmse}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


RMSE: 0.4377
FCP:  0.6486
MAE:  0.3972
Best Parameters: {'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.2}
RMSE with Best Parameters: 0.4376802192156245
Precision: 0.7228260869565217
Recall: 0.8926174496644296
F1 Score: 0.7987987987987988


In [None]:
from surprise.model_selection import cross_validate

# 5-fold cross-validation of the tuned SVD over the complete rating set;
# verbose=True prints the per-fold table of RMSE / MAE / FCP and timings.
results = cross_validate(
    algo=best_svd,
    data=data,
    measures=['RMSE', 'MAE', 'FCP'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE, FCP of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4399  0.4389  0.4225  0.4256  0.4440  0.4342  0.0085  
MAE (testset)     0.4018  0.3927  0.3829  0.3914  0.3935  0.3925  0.0060  
FCP (testset)     0.6981  0.6207  0.5676  0.4928  0.5588  0.5876  0.0686  
Fit time          0.02    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


In [None]:
# Rebuild the binary rating frame from df_grouped so this cell is idempotent and
# does not assign into a slice (which triggers pandas' SettingWithCopyWarning).
surprise_df = df_grouped[['Vendor', 'Category', 'Total.Installed.Qty']].copy()

# Binarize the 'Total.Installed.Qty' column: values above 0.5 become 1, everything else 0.
# Binarizer expects a 2-D (n_samples, n_features) array, hence the reshape / ravel pair.
binarizer = Binarizer(threshold=0.5)
surprise_df['Total.Installed.Qty'] = binarizer.fit_transform(
    surprise_df['Total.Installed.Qty'].to_numpy().reshape(-1, 1)
).ravel()

# Create a Reader object specifying the scale of the ratings (binary 0/1 here)
reader = Reader(rating_scale=(0, 1))

# Load the (Vendor, Category, rating) frame into a Surprise dataset
data = Dataset.load_from_df(surprise_df, reader)

# Hold out 20% of the rows for testing; seeded for a repeatable split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Hyper-parameters must be tuned on training rows only: running the grid search
# on `data` would let the held-out rows influence the choice and bias the test
# metrics. GridSearchCV needs a Dataset, so build one from the rows that are not
# in `testset` ((Vendor, Category) pairs are unique after the groupby).
held_out_pairs = {(uid, iid) for uid, iid, _ in testset}
is_train_row = [
    pair not in held_out_pairs
    for pair in zip(surprise_df['Vendor'], surprise_df['Category'])
]
train_data = Dataset.load_from_df(surprise_df.loc[is_train_row], reader)

# Parameter grid for the search; the single random_state value seeds every
# candidate SVD so the search outcome is reproducible.
param_grid = {
    'n_epochs': [5, 10, 20],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.2],
    'random_state': [42],
}

# Grid search with 6-fold cross-validation on seeded folds (training rows only)
grid_search = GridSearchCV(
    SVD, param_grid, measures=['rmse'], cv=KFold(n_splits=6, random_state=42)
)
grid_search.fit(train_data)

# Best parameter combination according to mean cross-validated RMSE
best_params = grid_search.best_params['rmse']

# Create the SVD model with the best parameters (includes the fixed random_state)
best_svd = SVD(**best_params)

# 5-fold cross-validation of the tuned model over the full rating set, on seeded folds
results = cross_validate(
    best_svd,
    data,
    measures=['RMSE', 'MAE', 'FCP'],
    cv=KFold(n_splits=5, random_state=42),
    verbose=True,
)

# Calculate precision, recall, and F1 score on the held-out test set
best_svd.fit(trainset)
predictions = best_svd.test(testset)

true_labels = [int(pred.r_ui) for pred in predictions]
binary_predictions = [1 if pred.est > 0.5 else 0 for pred in predictions]

precision = precision_score(true_labels, binary_predictions)
recall = recall_score(true_labels, binary_predictions)
f1 = f1_score(true_labels, binary_predictions)

# Print the best parameters and evaluation metrics
print(f"Best Parameters: {best_params}")
print(f"Cross-validated RMSE: {results['test_rmse'].mean()}")
print(f"Cross-validated MAE: {results['test_mae'].mean()}")
print(f"Cross-validated FCP: {results['test_fcp'].mean()}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Evaluating RMSE, MAE, FCP of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4392  0.4208  0.4352  0.4320  0.4484  0.4351  0.0090  
MAE (testset)     0.3892  0.3791  0.3902  0.3889  0.3974  0.3890  0.0058  
FCP (testset)     0.5410  0.5957  0.6522  0.5662  0.6190  0.5948  0.0390  
Fit time          0.02    0.05    0.02    0.02    0.02    0.03    0.01    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Best Parameters: {'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}
Cross-validated RMSE: 0.4351340749777419
Cross-validated MAE: 0.38896149581274697
Cross-validated FCP: 0.5948319730323277
Precision: 0.7182320441988951
Recall: 0.87248322147651
F1 Score: 0.7878787878787878
