# Fashion Recommendation

## Libraries

In [None]:
!pip install gensim==4.0.0
!pip install ml_metrics



In [None]:
import numpy as np
import warnings
import pandas as pd
import gc
from datetime import datetime, timedelta
import numpy as np
from tqdm import tqdm
import pickle
from gensim.test.utils import common_texts, get_tmpfile
from sklearn.decomposition import NMF
from time import perf_counter as pc
from gensim.models import word2vec
import ml_metrics as metrics
from gensim.models.callbacks import CallbackAny2Vec
from scipy.sparse import coo_matrix
warnings.filterwarnings("ignore")



In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


##Metric

The data we have is not a rating data. Instead, the User has information of the products purchased. In order to find out how well the recommendations work, it is necessary to see if the recommendations have been received by the user. Accuracy or MAP@K (Mean Average Precision at K) can be used for this.

In [None]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

## Read Data

In [None]:
transactions = pd.read_csv("https://storage.googleapis.com/gm-datascience-share/capstone/transactions.csv.gz", dtype={'article_id': str}, parse_dates=['t_dat'])
customers = pd.read_csv("https://storage.googleapis.com/gm-datascience-share/capstone/customers.csv.gz")
products = pd.read_csv("https://storage.googleapis.com/gm-datascience-share/capstone/articles.csv.gz", dtype={'article_id': str})

In [None]:
transactions['sales_channel_id']=transactions.sales_channel_id.astype('int8')
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format="%Y-%m-%d")
transactions['article_id']=transactions.article_id.astype('object')

### Add Season and Year Column to Transactions

In [None]:
seasons = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]
month_to_season = dict(zip(range(1,13), seasons))

In [None]:
transactions["season"] = transactions["t_dat"].dt.month.map(month_to_season) 
transactions["year"] = transactions["t_dat"].dt.year

In [None]:
transactions.loc[(transactions["season"]==1) & ((transactions["t_dat"].dt.month==1) | (transactions["t_dat"].dt.month==2)), "year"] -=1 

In [None]:
customers.loc[customers["fashion_news_frequency"] == "NONE", "fashion_news_frequency"] = np.nan
customers.loc[customers["fashion_news_frequency"] == "None", "fashion_news_frequency"] = np.nan
customers["fashion_news_frequency"].unique()

array([nan, 'Regularly', 'Monthly'], dtype=object)

### Train-Test Split

In [None]:
X_train = transactions[transactions["t_dat"] < transactions["t_dat"].max() - timedelta(days=14)]
X_test = transactions[transactions["t_dat"] >= transactions["t_dat"].max() - timedelta(days=7)]

## Part 1: Users who have not purchased anything

The given data is a data consisting of implicit feedbacks. Because of this, we have no information about how many points the user gave a product. We try to predict the products that the user will buy.
Here, an estimation is made for users who have not purchased a product before, based on the age of the user. A calculation is made on the products that people in the same age range buy the most in this year's season and the products they buy the most in the same season last year.

### Create Age Groups

In [None]:
customers['age']=customers.age.astype('float16')
bins= [15,25,35,45,55, 110]
labels = ['[15,25)','[25,35)','[35,45)','[45,55)','[55,110)']
customers['AgeGroup'] = pd.cut(customers['age'], bins=bins, labels=labels, right=False)

### Recommend Favorite products by Age Groups

In [None]:
def favouriteItems(df, interval, ageGroup=None):
  if ageGroup!=None:
    return df[df["AgeGroup"] == ageGroup].groupby("article_id").size().reset_index(name=f'counts{interval}').sort_values(by=[f'counts{interval}'], ascending=False).head(10)
  else:
    return df.groupby("article_id").size().reset_index(name=f'counts{interval}').sort_values(by=[f'counts{interval}'], ascending=False).head(50)

####Top Ten Item

In [None]:
def favouriteItemsTopTen(df, topn=10, ageGroup=None):
  # Last Month
  lastOneMonth = df[(df["t_dat"] >= df["t_dat"].max() - timedelta(days=30))]
  lastOneMonth = pd.merge(lastOneMonth, customers, on=["customer_id"], how="inner")
  last_monthFavourite= favouriteItems(lastOneMonth, "Month", ageGroup)
  
  #Last Year
  lastYear = df[(df["year"] == (df["t_dat"].max() - timedelta(days=365)).year) & (df["season"] == 4)]
  lastYear = pd.merge(lastYear, customers, on=["customer_id"], how="inner")
  last_year_favouite = favouriteItems(lastYear, "Year", ageGroup)

  top_ten = pd.merge(last_year_favouite, last_monthFavourite, on=["article_id"], how="outer").fillna(0)
  top_ten["sum_counts"] = (top_ten['countsYear'] * 0.2) + (top_ten['countsMonth'] * 0.8)
  
  del lastYear, lastOneMonth, last_monthFavourite, last_year_favouite
  gc.collect()
  return top_ten.sort_values(by=[f'sum_counts'], ascending=False).head(topn)["article_id"].values

## PART 2: Product recommendation for the user who has bought something before

### PART 2.1 Non-Negative Matrix Factorization Model

#### Create customer-item map for Coordinate Matrix

In [None]:
customersList = customers['customer_id'].unique().tolist()
productsList = products['article_id'].unique().tolist()

customerDict = dict(list(enumerate(customersList)))
productsDict = dict(list(enumerate(productsList)))

customer_map = {u: customerId for customerId, u in customerDict.items()}
item_map = {i: itemId for itemId, i in productsDict.items()}

X_train['user_id'] = X_train['customer_id'].map(customer_map)
X_train['item_id'] = X_train['article_id'].map(item_map)

#### Time Decay

In [None]:
X_train["timeDecayCount"] = np.exp(-(X_train["t_dat"].max() - X_train["t_dat"]).dt.days * 0.01)

#### Create Coordinate Matrix

In [None]:
row = X_train['user_id'].values
col = X_train['item_id'].values

data = np.array(X_train["timeDecayCount"])
coo = coo_matrix((data, (row, col)), shape=(len(customersList), len(productsList)))
#csr = coo.tocsr()

####Create Non-Negative Matrix Factorization Model

In [None]:
model = NMF(n_components=50, max_iter=100,init="random", random_state=0, tol=0.000001, verbose=True, alpha_W = 0.00001)
start_time = pc()
W = model.fit_transform(coo)
end_time = pc()
print("Used (secs): ", end_time - start_time)
print(model.n_iter_)

violation: 1.0
violation: 2.5030724866635228
violation: 3.250766200819901
violation: 1.3544848932672406
violation: 0.9507060573280439
violation: 0.6632617664355154
violation: 0.4647791791698198
violation: 0.324952277297675
violation: 0.23817753453032728
violation: 0.17642835176592037
violation: 0.13848454949697306
violation: 0.11527289183222028
violation: 0.09914880338257447
violation: 0.0870547836599462
violation: 0.07744765834340013
violation: 0.06984727660845368
violation: 0.06368827946659014
violation: 0.05953335685950895
violation: 0.056259454142109366
violation: 0.05303072705353694
violation: 0.04902655432721998
violation: 0.044139650205520575
violation: 0.039170572603060554
violation: 0.034640128104644556
violation: 0.03069849321466123
violation: 0.02713361965223357
violation: 0.02425849534199902
violation: 0.021780518348668662
violation: 0.019787167320517256
violation: 0.01814639027973071
violation: 0.016905084232846184
violation: 0.015943721230108822
violation: 0.0152183382709

In [None]:
H = model.components_

#### Save-Load Model

Save

In [None]:
import pickle
with open('/content/gdrive/My Drive/Colab Notebooks/WTimeDecayed','wb') as f:
    pickle.dump(W, f)

with open('/content/gdrive/My Drive/Colab Notebooks/HTimeDecayed','wb') as f:
    pickle.dump(H, f)

Load

In [None]:
with open('/content/gdrive/My Drive/Colab Notebooks/WTimeDecayed', 'rb') as f:
    W = pickle.load(f)

with open('/content/gdrive/My Drive/Colab Notebooks/HTimeDecayed', 'rb') as f:
    H = pickle.load(f)

### PART 2.2 Word2Vec Model

#### Create Sentences

In [None]:
sentences = []
for year in X_train["year"].unique():
  for season in X_train[X_train["year"] == year]["season"].unique():
    transaction_temp = X_train[(X_train["year"] == year) & ((X_train["season"] == season))][["customer_id", "article_id"]]
    #transaction_temp = X_train[(X_train["year"] == year)][["customer_id", "article_id"]]

    transaction_temp["article_id"]= transaction_temp["article_id"].values.astype(str)
    transaction_temp = transaction_temp.groupby("customer_id").agg({'article_id': ' '.join})
    for index, row in tqdm(transaction_temp.iterrows()):
      sentences.append(row["article_id"].split(" "))
    
    #sentences.append(transaction_temp.groupby("customer_id").agg({'article_id': ' '.join}).values)

Save sentences

In [None]:
with open("/content/gdrive/My Drive/Colab Notebooks/word2vec", "wb") as fp:   #Pickling
  pickle.dump(sentences, fp)

Load sentences

In [None]:
with open("/content/gdrive/My Drive/Colab Notebooks/word2vec", "rb") as fp:   # Unpickling
  sentences = pickle.load(fp)

#### Create Word2Vec Model

In [None]:
class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss))
        self.epoch += 1

model = word2vec.Word2Vec(sentences, vector_size=75, min_count=1,
                          window = 5, hs = 0,
                          negative = 10, # for negative sampling
                          alpha=0.001, min_alpha=0.00001,
                          epochs = 20, sg=0, workers=1, compute_loss=True, callbacks=[callback()])

#### Save-Load Model

Save Model

In [None]:
model.save("/content/gdrive/My Drive/Colab Notebooks/word2vec.model")

Load Model

In [None]:
model = word2vec.Word2Vec.load("/content/gdrive/My Drive/Colab Notebooks/word2vec.model")

## Recommendation

###ItemId to ArticleId

In [None]:
def itemIdToArticleId(X_train, item_idList):
  itemArr = []
  for itemid in item_idList:
    itemArr.append(X_train.loc[X_train["item_id"] == itemid, "article_id"].unique()[0])
  return itemArr

###Word2Vec Prediction

In [None]:
def predictSimilarItem(model, items, recommendedItems):
  #predictWord2VecItems = model.predict_output_word(context_words_list = items, topn=int(topn/2))
  predCount = 4 if len(items) == 1 else 2

  for item in items[-2:]:
    predictWord2VecItems = model.wv.most_similar(item, topn=predCount)
    for predictedItem in predictWord2VecItems:
      recommendedItems.append(predictedItem[0])
    del predictWord2VecItems
  del items, predCount
  return recommendedItems

### Recommendation Engine

In [None]:
test_user = X_test["customer_id"].unique()

averagePrecisionK = 0.0
for user in test_user[0:10]:
  recommendedItems = []
  topn = 10
  if user not in X_train["customer_id"].values:
    ageGroup = customers.loc[customers["customer_id"]==user, "AgeGroup"].values[0]
    print(f"Recommended Item ids for user {user}:")
    recommendedItems = favouriteItemsTopTen(X_train, topn, ageGroup)
    del ageGroup

  else:
    itemInTrainSet = X_test[X_test.article_id.isin(X_train['article_id'])]
    if user in itemInTrainSet["customer_id"].values:
      items = itemInTrainSet.loc[X_test["customer_id"]==user, "article_id"].values.tolist()
      recommendedItems = predictSimilarItem(model, items, recommendedItems)
      recommendedItems = list(set(recommendedItems))

      topn-=len(recommendedItems)
      
    #if user in X_train["customer_id"].values:
    user_i = X_train.loc[X_train["customer_id"] == user, "user_id"].values[0]
    item_pred = W[user_i, :] @ H[:, :]
    item_ids = item_pred.argsort()[::-1][:topn]

    for item in itemIdToArticleId(X_train, item_ids):
      recommendedItems.append(item)

    print(f"Recommended Item ids for user {user}:")
    print(recommendedItems)
    averagePrecisionK += metrics.mapk(X_test.loc[X_test["customer_id"] == user, "article_id"].values, recommendedItems, 10)
    del user_i, item_pred, item_ids, itemInTrainSet
  print(metrics.mapk(X_test.loc[X_test["customer_id"] == user, "article_id"].values, recommendedItems, 10))
  #print(mapk(X_test.loc[X_test["customer_id"] == user, "article_id"], recommendedItems, k=topn))
  gc.collect()

Recommended Item ids for user 000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318:
['868691001', '805000007', '866610001', '865914001', '599580055', '811925009', '776237020', '351484039', '823505001', '811927007']
0.10687830687830686
Recommended Item ids for user 00040239317e877c77ac6e79df42eb2633ad38fcac09fc0094e549180ddc201c:
['718076043', '687524018', '864043003', '502186003', '783346001', '749699002', '806388001', '749699001', '816563001', '817353008']
0.4544973544973545
Recommended Item ids for user 000749135ee9aa3a24c2316ea5ae4f495b39c1653c5612bb5b239f1b2a182a2a:
['800691007', '800691013', '706016001', '706016002', '706016038', '706016006', '706016015', '706016019', '554450001', '621381012']
0.36018518518518516
Recommended Item ids for user 0015f37f752a41a75c3be6f3f92deedc4c87d039f1758ec41f54f8c7f4729793:
['916926005', '920869003', '929673001', '906100002', '811925009', '751471001', '706016001', '866383006', '811927007', '823118001']
0.26488340192043897
Recommended I

In [None]:
averagePrecisionK/10

0.2961480991573584