In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Exploring the customer transactions dataset

In [3]:
df_one = pd.read_csv('consumer_transanctions.csv')

In [4]:
df_one.shape

(72312, 8)

In [5]:
df_one.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   event_timestamp       72312 non-null  int64 
 1   interaction_type      72312 non-null  object
 2   item_id               72312 non-null  int64 
 3   consumer_id           72312 non-null  int64 
 4   consumer_session_id   72312 non-null  object
 5   consumer_device_info  56918 non-null  object
 6   consumer_location     56907 non-null  object
 7   country               56918 non-null  object
dtypes: int64(3), object(5)
memory usage: 4.4+ MB


In [6]:
df_one.head()

Unnamed: 0,event_timestamp,interaction_type,item_id,consumer_id,consumer_session_id,consumer_device_info,consumer_location,country
0,1465413032,content_watched,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,content_watched,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,content_watched,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,content_followed,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,content_watched,-7820640624231356730,-445337111692715325,561148 1178424124714,,,


In [7]:
df_one['interaction_type'].value_counts()

content_watched         61086
content_liked            5745
content_saved            2463
content_commented_on     1611
content_followed         1407
Name: interaction_type, dtype: int64

In [8]:
# dropping columns that are not relevant for the model 
df_one = df_one.drop(['event_timestamp','consumer_session_id','consumer_device_info','consumer_location','country'], axis =1)

In [9]:
df_one.head()

Unnamed: 0,interaction_type,item_id,consumer_id
0,content_watched,-3499919498720038879,-8845298781299428018
1,content_watched,8890720798209849691,-1032019229384696495
2,content_watched,310515487419366995,-1130272294246983140
3,content_followed,310515487419366995,344280948527967603
4,content_watched,-7820640624231356730,-445337111692715325


In [10]:
# creating a function to convert the 'column interaction_type' to numeric values 

def interaction(x):
    if x =='content_liked':
        return 1
    elif x == 'content_saved':
        return 2
    elif x == 'content_commented_on':
        return 3
    elif x == 'content_followed':
        return 4
    else:
        return 5

In [11]:
df_one['interaction_type'] = df_one['interaction_type'].apply(lambda x:interaction(x))

In [12]:
df_one['interaction_type'].value_counts()

5    61086
1     5745
2     2463
3     1611
4     1407
Name: interaction_type, dtype: int64

In [13]:
df_one.head()

Unnamed: 0,interaction_type,item_id,consumer_id
0,5,-3499919498720038879,-8845298781299428018
1,5,8890720798209849691,-1032019229384696495
2,5,310515487419366995,-1130272294246983140
3,4,310515487419366995,344280948527967603
4,5,-7820640624231356730,-445337111692715325


## Exploring the Platform Content Dataframe

In [6]:
df_two = pd.read_csv('platform_content.csv')

In [7]:
df_two.shape

(3122, 13)

In [8]:
df_two.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3122 entries, 0 to 3121
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   event_timestamp       3122 non-null   int64 
 1   interaction_type      3122 non-null   object
 2   item_id               3122 non-null   int64 
 3   producer_id           3122 non-null   int64 
 4   producer_session_id   3122 non-null   int64 
 5   producer_device_info  680 non-null    object
 6   producer_location     680 non-null    object
 7   producer_country      680 non-null    object
 8   item_type             3122 non-null   object
 9   item_url              3122 non-null   object
 10  title                 3122 non-null   object
 11  text_description      3122 non-null   object
 12  language              3122 non-null   object
dtypes: int64(4), object(9)
memory usage: 317.2+ KB


In [9]:
df_two['language'].value_counts()

en    2264
pt     850
la       4
ja       2
es       2
Name: language, dtype: int64

In [10]:
df_two['interaction_type'].value_counts()

content_present       3047
content_pulled_out      75
Name: interaction_type, dtype: int64

In [11]:
# the column 'interaction_type' in df_two does not have any significance to our analysis 

In [12]:
df_two = df_two[['item_id', 'title','text_description','language']]

### Creating a dataframe with unique item ids

In [13]:
item_ids = df_two['item_id'].unique().tolist()

In [14]:
unique_item = pd.DataFrame(item_ids)

In [15]:
unique_item.head()

Unnamed: 0,0
0,-6451309518266745024
1,-4110354420726924665
2,-7292285110016212249
3,-6151852268067518688
4,2448026894306402386


In [16]:
unique_item.reset_index(inplace = True)

In [17]:
unique_item.head()

Unnamed: 0,index,0
0,0,-6451309518266745024
1,1,-4110354420726924665
2,2,-7292285110016212249
3,3,-6151852268067518688
4,4,2448026894306402386


In [18]:
unique_item.columns = ['item_no','item_id']

In [19]:
unique_item.head()

Unnamed: 0,item_no,item_id
0,0,-6451309518266745024
1,1,-4110354420726924665
2,2,-7292285110016212249
3,3,-6151852268067518688
4,4,2448026894306402386


In [20]:
unique_item.shape

(3057, 2)

In [21]:
# Merging the dataframes : unique_item and df_two

In [22]:
temp = pd.merge(unique_item, df_two, on='item_id', how='left')

In [24]:
temp.head(20)

Unnamed: 0,item_no,item_id,title,text_description,language
0,0,-6451309518266745024,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1,-4110354420726924665,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,2,-7292285110016212249,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,3,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,4,2448026894306402386,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,5,-2826566343807132236,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en
6,6,-2148899391355011268,Banks Need To Collaborate With Bitcoin and Fin...,It will take time until banks come around to t...,en
7,7,4119190424078847945,Blockchain Technology Could Put Bank Auditors ...,When most people think about computers and rob...,en
8,8,-7926018713416777892,Why Decentralized Conglomerates Will Scale Bet...,"Bitcoin.com spoke with the OpenLedger CEO, Ron...",en
9,9,3353902017498793780,The Rise And Growth of Ethereum Gets Mainstrea...,"Ethereum, considered by many to be the most pr...",en


In [25]:
temp.title[1]

"Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's"

In [27]:
temp.text_description[1]

'All of this work is still very early. The first full public version of the Ethereum software was recently released, and the system could face some of the same technical and legal problems that have tarnished Bitcoin. Many Bitcoin advocates say Ethereum will face more security problems than Bitcoin because of the greater complexity of the software. Thus far, Ethereum has faced much less testing, and many fewer attacks, than Bitcoin. The novel design of Ethereum may also invite intense scrutiny by authorities given that potentially fraudulent contracts, like the Ponzi schemes, can be written directly into the Ethereum system. But the sophisticated capabilities of the system have made it fascinating to some executives in corporate America. IBM said last year that it was experimenting with Ethereum as a way to control real world objects in the so-called Internet of things. Microsoft has been working on several projects that make it easier to use Ethereum on its computing cloud, Azure. "Et

In [32]:
temp.shape

(3122, 5)

### Creating a dataframe with unique consumer ids

In [33]:
con_ids = df_one['consumer_id'].unique().tolist()

In [34]:
unique_con = pd.DataFrame(con_ids)

In [35]:
unique_con.head()

Unnamed: 0,0
0,-8845298781299428018
1,-1032019229384696495
2,-1130272294246983140
3,344280948527967603
4,-445337111692715325


In [36]:
unique_con.reset_index(inplace = True)

In [37]:
unique_con.head()

Unnamed: 0,index,0
0,0,-8845298781299428018
1,1,-1032019229384696495
2,2,-1130272294246983140
3,3,344280948527967603
4,4,-445337111692715325


In [38]:
unique_con.columns = ['consumer_no','consumer_id']

In [39]:
unique_con.head()

Unnamed: 0,consumer_no,consumer_id
0,0,-8845298781299428018
1,1,-1032019229384696495
2,2,-1130272294246983140
3,3,344280948527967603
4,4,-445337111692715325


In [40]:
df_one.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   interaction_type  72312 non-null  int64
 1   item_id           72312 non-null  int64
 2   consumer_id       72312 non-null  int64
dtypes: int64(3)
memory usage: 1.7 MB


In [41]:
df_two.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3122 entries, 0 to 3121
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_id           3122 non-null   int64 
 1   title             3122 non-null   object
 2   text_description  3122 non-null   object
 3   language          3122 non-null   object
dtypes: int64(1), object(3)
memory usage: 97.7+ KB


In [42]:
# Merging the dataframes : temp and df_one
merged_temp = pd.merge(df_one,unique_con, on ='consumer_id', how ='left')

In [43]:
merged_temp.head()

Unnamed: 0,interaction_type,item_id,consumer_id,consumer_no
0,5,-3499919498720038879,-8845298781299428018,0
1,5,8890720798209849691,-1032019229384696495,1
2,5,310515487419366995,-1130272294246983140,2
3,4,310515487419366995,344280948527967603,3
4,5,-7820640624231356730,-445337111692715325,4


In [44]:
merged_temp.shape

(72312, 4)

In [45]:
merged_temp = merged_temp.drop('consumer_id', axis = 1)

In [46]:
merged_temp.head()

Unnamed: 0,interaction_type,item_id,consumer_no
0,5,-3499919498720038879,0
1,5,8890720798209849691,1
2,5,310515487419366995,2
3,4,310515487419366995,3
4,5,-7820640624231356730,4


In [47]:
# creating the final dataframe by merging the the dataframes : merged_temp and temp
merged = pd.merge(merged_temp,temp, on ='item_id', how ='left')

In [48]:
merged.head()

Unnamed: 0,interaction_type,item_id,consumer_no,item_no,title,text_description,language
0,5,-3499919498720038879,0,1247,Hiri wants to fix the workplace email problem,Hiri is the latest startup trying to fix email...,en
1,5,8890720798209849691,1,1236,Top 10 Intranet Trends of 2016,"Summary: Hero images, carousels, fat footers, ...",en
2,5,310515487419366995,2,1267,71 erros de português que precisam sumir dos s...,Escrever um e-mail não deveria ser uma coisa t...,pt
3,4,310515487419366995,3,1267,71 erros de português que precisam sumir dos s...,Escrever um e-mail não deveria ser uma coisa t...,pt
4,5,-7820640624231356730,4,1271,How This Googler Redesigned The Workweek,Jake Knapp has always been concerned about the...,en


In [49]:
merged = merged.drop('item_id', axis =1)

In [50]:
merged.head()

Unnamed: 0,interaction_type,consumer_no,item_no,title,text_description,language
0,5,0,1247,Hiri wants to fix the workplace email problem,Hiri is the latest startup trying to fix email...,en
1,5,1,1236,Top 10 Intranet Trends of 2016,"Summary: Hero images, carousels, fat footers, ...",en
2,5,2,1267,71 erros de português que precisam sumir dos s...,Escrever um e-mail não deveria ser uma coisa t...,pt
3,4,3,1267,71 erros de português que precisam sumir dos s...,Escrever um e-mail não deveria ser uma coisa t...,pt
4,5,4,1271,How This Googler Redesigned The Workweek,Jake Knapp has always been concerned about the...,en


In [51]:
merged.shape

(72680, 6)

### Checking Skewness in the Data

In [52]:
# checking for the duplicate data in the dataframe : 'merged'
merged.duplicated().sum()

21731

In [53]:
merged = merged.drop_duplicates()

In [54]:
merged.duplicated().sum()

0

In [55]:
merged.shape

(50949, 6)

In [56]:
merged['interaction_type'].value_counts()

5    40248
1     5673
2     2202
3     1427
4     1399
Name: interaction_type, dtype: int64

In [57]:
# checking the number of cosumers that have gone through different items
merged['item_no'].value_counts(0)

3031    345
1767    251
1628    224
1827    201
1854    190
       ... 
557       1
247       1
311       1
1665      1
2944      1
Name: item_no, Length: 2987, dtype: int64

In [58]:
# creating articles that have been read by at least 5 users 
merged = merged.groupby('item_no').filter(lambda x: len(x) > 4)

In [59]:
merged['item_no'].value_counts()

3031    345
1767    251
1628    224
1827    201
1854    190
       ... 
1597      5
1335      5
347       5
1709      5
1608      5
Name: item_no, Length: 2241, dtype: int64

In [60]:
merged.shape

(49142, 6)

In [61]:
merged.nunique()

interaction_type       5
consumer_no         1885
item_no             2241
title               2222
text_description    2230
language               4
dtype: int64

## Creating a Model Evaluator

In [62]:
# storing all the item numbers in 'all_items'
all_items = set(merged['item_no'])

In [63]:
# storing all the consumer numbers in 'all_consumers'
all_consumers = set(merged['consumer_no'])

In [64]:
# creating a function to find the items consumed by a specifc consumer
def consumed_items(no, df):
    consumed_items = df.loc[df.consumer_no == no , 'item_no'].unique()
    return set(consumed_items)

In [65]:
# creating a function to find the items not consumed by a specific consumer
def non_consumed_items(no,df):
    temp = consumed_items(no, df)
    non_consumed_items = all_items - temp
    return set(non_consumed_items)

In [66]:
# creating a function for evalutaion of model accuracy for each customer evaluation
def evaluation(no, df):
    temp = []
    metrics = {}
    one = consumed_items(no,df)
    two = non_consumed_items(no,df)
    remaining = two-one
    
    recall_top_five = 0
    recall_top_ten = 0
    
    for i in df['item_no']:
        if i in one:
            temp.append(i)
    
    top_five = list(temp[:6])
    top_ten = list(temp[:11])
    
    if len(one) != 0:
        recall_top_five = len(top_five) / len(one)
        recall_top_ten = len(top_ten) / len(one)

    metrics = {'recall_top_five' :round(recall_top_five,2) ,'recall_top_ten' : round(recall_top_ten,2) }
    
    return metrics 

In [67]:
# creating a function for evaluation of the recommendations 
def model_evaluation(no,df):
    
    five = 0
    ten = 0
    
    temp = []
    metrics = {}
    
    for item in df['item_no']:
        temp = list(df['consumer_no'])
        
    for consumer in temp:
        model = evaluation(consumer,df)
        five = five + model['recall_top_five']
        ten = ten + model['recall_top_ten']
        
    final_five = five / len(temp)
    final_ten = ten / len(temp)
    
    metrics = {'recall_top_five' :round(final_five,2) ,'recall_top_ten' : round(final_ten,2) }
    
    return metrics 

We are using recall for model evaluation as we are only concerned with the relevant results that should be appearing in our final output.

## Splitting in train and test data sets

In [68]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(merged, test_size = 0.3, random_state = 30)

In [69]:
print(train.shape)
print(test.shape)

(34399, 6)
(14743, 6)


## Content based Recommendation

In [70]:
#importing relevant libraries
from nltk.corpus import stopwords

In [71]:
stopwords_list = stopwords.words('english') + stopwords.words('portuguese')

In [72]:
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [73]:
# creating an array of words from the column text_description 
keyword_array = (train['text_description'] + train ['title']).to_numpy()

In [74]:
keyword_array

array(['Toda vez que vemos alguma biblioteca nova de carregamento de imagem, nos perguntamos: outra? Com certeza não foi diferente com o Fresco que, por um tempo, sofreu bastante "chacota" da comunidade pelo simples motivo: Picasso\'s API has 24 classes. . Fresco\'s API has 24 packages. Mas, antes de rir, vamos entender o motivo de tudo isso. Nascimento Nós, usuários do Facebook e Instagram, exigimos exibição rápida e eficiente de imagem ao abrirmos nossos aplicativos. O Fresco nasceu exatamente com essa função: exibir a imagem de maneira eficiente, não importando o device ou conexão. Mas, qual exatamente é o problema? Bem, as imagens geralmente são grandes e os devices " pequenos ". Precisamos exibir uma imagem com a resolução 2560 × 1600 , com 4mb de tamanho, em um device com resolução desconsiderável. Mesmo com todo o processamento de imagem disponível, nos parece bem caótico, não é? Isso significa, no universo Android, que um simples device de 480 x 800 pixels, pode ocupar 1.5mb da

In [75]:
# Creating a list of words from the keyword_array
keyword_list = []

for keyword in keyword_array:
    
    splitted_words = str(keyword).lower().split()
    
    keyword_list.append(splitted_words)

In [76]:
len(keyword_list)

34399

In [77]:
keyword_list_final = []

In [78]:
# Removing the stopwords

for word in keyword_list:
    if word not in stopwords_list:
        keyword_list_final.append(word)

### Creating Dictionary, Bag of Words, tfidf Model and Similarity Matrix

In [79]:
from gensim.corpora.dictionary import Dictionary

In [80]:
# creating a dictionary from the words list 
dictionary = Dictionary(keyword_list_final)

In [81]:
# creating the corpus
corpus = [dictionary.doc2bow(doc) for doc in keyword_list_final]

In [82]:
from gensim.models.tfidfmodel import TfidfModel

In [83]:
# creating a tfidf model of the corpus
tfidf = TfidfModel(corpus)

In [84]:
from gensim.similarities import MatrixSimilarity

In [85]:
# creating the similarity matrix to get the similarity between items
sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

In [86]:
print(sims)

MatrixSimilarity<34399 docs, 136078 features>


### For input item, looking for recommended items

In [87]:
train['title'].iloc[2]

'From E-Commerce to Web 3.0: Let the Bots Do the Shopping'

In [143]:
def item_recommendation(title):
    
    # getting the item row
    item = train.loc[train.title==title] 
    
    #splitting the keywords
    keywords = train['text_description'].iloc[11].split() 

    #setting the list of words to query_doc
    query_doc = keywords 
    
    # getting a bag of words from the query_doc
    query_doc_bow = dictionary.doc2bow(query_doc) 

    #converting the regular bag of words model to a tf-idf model
    query_doc_tfidf = tfidf[query_doc_bow] 

    # getting similarity values between input item and all other item
    similarity_array = sims[query_doc_tfidf] 

    # converting to a Series
    similarity_series = pd.Series(similarity_array.tolist(), index=train.title.values) 

    #getting the most similar items 
    similarity_output = similarity_series.sort_values(ascending=False)
    
    return similarity_output

In [89]:
content_output = item_recommendation('Setting Up HTTP(S) Load Balancing')

In [90]:
content_output.duplicated().sum()

32182

In [91]:
content_output = content_output.drop_duplicates()

In [92]:
content_output.head()

O homofóbico dentro de cada um                                                 0.949535
[Books] Biblioteca FIXE: Sugestões de Leitura                                  0.324373
SINAIS do FUTURO imediato: #1, internet das coisas                             0.309699
Por dentro do Nubank, conheça os segredos da fintech mais festejada do país    0.286833
A saga do jovem CEO da Dasa                                                    0.284552
dtype: float64

In [93]:
content_df = pd.DataFrame(content_output)

In [94]:
content_df.head()

Unnamed: 0,0
O homofóbico dentro de cada um,0.949535
[Books] Biblioteca FIXE: Sugestões de Leitura,0.324373
"SINAIS do FUTURO imediato: #1, internet das coisas",0.309699
"Por dentro do Nubank, conheça os segredos da fintech mais festejada do país",0.286833
A saga do jovem CEO da Dasa,0.284552


In [95]:
content_df.reset_index(inplace=True)

In [96]:
content_df.columns = ['title', 'score']

In [97]:
content_df.head()

Unnamed: 0,title,score
0,O homofóbico dentro de cada um,0.949535
1,[Books] Biblioteca FIXE: Sugestões de Leitura,0.324373
2,"SINAIS do FUTURO imediato: #1, internet das co...",0.309699
3,"Por dentro do Nubank, conheça os segredos da f...",0.286833
4,A saga do jovem CEO da Dasa,0.284552


In [98]:
content_df['content_score_normalized'] = (content_df['score']-min(content_df['score'])) / (max(content_df['score']) - min(content_df['score']))

In [99]:
content_df.head()

Unnamed: 0,title,score,content_score_normalized
0,O homofóbico dentro de cada um,0.949535,1.0
1,[Books] Biblioteca FIXE: Sugestões de Leitura,0.324373,0.341612
2,"SINAIS do FUTURO imediato: #1, internet das co...",0.309699,0.326158
3,"Por dentro do Nubank, conheça os segredos da f...",0.286833,0.302077
4,A saga do jovem CEO da Dasa,0.284552,0.299675


In [100]:
content_df.shape

(2217, 3)

## Collaborative Recommendation

### User item profile

In [101]:
data_matrix = train.pivot_table(
    index='item_no',
    columns='consumer_no',
    values='interaction_type'
).fillna(0)

In [102]:
data_matrix.head()

consumer_no,0,1,2,3,4,5,6,7,8,9,...,1881,1882,1883,1885,1886,1887,1890,1891,1893,1894
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
data_matrix.shape

(2240, 1779)

### Pairwise Distance

In [104]:
from sklearn.metrics.pairwise import pairwise_distances 

In [105]:
item_similarity = 1 - pairwise_distances(data_matrix, metric = 'cosine')

In [106]:
item_similarity

array([[1.        , 0.        , 0.11572751, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.2236068 , ..., 0.11439056, 0.        ,
        0.1490712 ],
       [0.11572751, 0.2236068 , 1.        , ..., 0.12789253, 0.19148936,
        0.16666667],
       ...,
       [0.        , 0.11439056, 0.12789253, ..., 1.        , 0.        ,
        0.17052337],
       [0.        , 0.        , 0.19148936, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.1490712 , 0.16666667, ..., 0.17052337, 0.        ,
        1.        ]])

In [107]:
item_similarity.shape

(2240, 2240)

### Getting similar items to recommend

In [108]:
item_index = 11

In [109]:
item_prediction = pd.DataFrame(item_similarity)

In [110]:
item_prediction.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2230,2231,2232,2233,2234,2235,2236,2237,2238,2239
0,1.0,0.0,0.115728,0.0,0.151812,0.0,0.0,0.0,0.135208,0.059984,...,0.015973,0.0,0.157485,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.223607,0.0,0.0,0.0,0.0,0.182574,0.266472,0.193167,...,0.092589,0.08498,0.0,0.128711,0.2,0.145172,0.0,0.114391,0.0,0.149071
2,0.115728,0.223607,1.0,0.0,0.327952,0.0,0.0,0.0,0.17525,0.561514,...,0.103517,0.095011,0.0,0.143904,0.223607,0.162307,0.0,0.127893,0.191489,0.166667
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.051383,0.0,0.101322,0.0,0.0,0.0,0.0,0.105804,0.0,0.0
4,0.151812,0.0,0.327952,0.0,1.0,0.0,0.0,0.0,0.0,0.169984,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
item_prediction.shape

(2240, 2240)

In [112]:
item_prediction.iloc[item_index]

0       0.000000
1       0.000000
2       0.000000
3       0.413646
4       0.000000
          ...   
2235    0.000000
2236    0.000000
2237    0.127893
2238    0.000000
2239    0.000000
Name: 11, Length: 2240, dtype: float64

In [113]:
item_recommendation = pd.DataFrame(item_prediction.iloc[item_index].sort_values(ascending=False))

In [114]:
item_recommendation.head()

Unnamed: 0,11
11,1.0
2154,0.707107
720,0.60833
562,0.593732
987,0.582113


In [115]:
item_recommendation.reset_index(inplace=True)

In [116]:
item_recommendation.columns = ['item_no','score']

In [117]:
item_recommendation.head(10)

Unnamed: 0,item_no,score
0,11,1.0
1,2154,0.707107
2,720,0.60833
3,562,0.593732
4,987,0.582113
5,461,0.509478
6,1484,0.50871
7,1380,0.5
8,1580,0.5
9,894,0.5


### Merging 'item_no' with item 'title'

In [118]:
merged_one = pd.merge(item_recommendation, merged,  on='item_no', how='left')

In [119]:
merged_one.head(10)

Unnamed: 0,item_no,score,interaction_type,consumer_no,title,text_description,language
0,11,1.0,5.0,6.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
1,11,1.0,5.0,72.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
2,11,1.0,5.0,43.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
3,11,1.0,5.0,75.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
4,11,1.0,5.0,78.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
5,11,1.0,5.0,1117.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
6,2154,0.707107,,,,,
7,720,0.60833,,,,,
8,562,0.593732,5.0,602.0,What exercise does to your bones,The force on a triple jumper's bones is 15 tim...,en
9,562,0.593732,1.0,602.0,What exercise does to your bones,The force on a triple jumper's bones is 15 tim...,en


In [120]:
merged_one = merged_one[['item_no','title','consumer_no','score']]

In [121]:
merged_one.head()

Unnamed: 0,item_no,title,consumer_no,score
0,11,Setting Up HTTP(S) Load Balancing,6.0,1.0
1,11,Setting Up HTTP(S) Load Balancing,72.0,1.0
2,11,Setting Up HTTP(S) Load Balancing,43.0,1.0
3,11,Setting Up HTTP(S) Load Balancing,75.0,1.0
4,11,Setting Up HTTP(S) Load Balancing,78.0,1.0


In [122]:
merged_one.duplicated().sum()

7348

In [123]:
merged_one = merged_one.drop_duplicates()

In [124]:
merged_one.shape

(27964, 4)

In [125]:
merged_one['collaborative_score_normalized'] = (merged_one['score']-min(merged_one['score']))/(max(merged_one['score'])-min(merged_one['score']))

In [126]:
merged_one.head()

Unnamed: 0,item_no,title,consumer_no,score,collaborative_score_normalized
0,11,Setting Up HTTP(S) Load Balancing,6.0,1.0,1.0
1,11,Setting Up HTTP(S) Load Balancing,72.0,1.0,1.0
2,11,Setting Up HTTP(S) Load Balancing,43.0,1.0,1.0
3,11,Setting Up HTTP(S) Load Balancing,75.0,1.0,1.0
4,11,Setting Up HTTP(S) Load Balancing,78.0,1.0,1.0


## Hybrid : Combining the two Models

In [127]:
hybrid = pd.merge(merged_one, content_df, on = 'title' , how = 'left')

In [128]:
hybrid.head(10)

Unnamed: 0,item_no,title,consumer_no,score_x,collaborative_score_normalized,score_y,content_score_normalized
0,11,Setting Up HTTP(S) Load Balancing,6.0,1.0,1.0,0.00037,0.00039
1,11,Setting Up HTTP(S) Load Balancing,72.0,1.0,1.0,0.00037,0.00039
2,11,Setting Up HTTP(S) Load Balancing,43.0,1.0,1.0,0.00037,0.00039
3,11,Setting Up HTTP(S) Load Balancing,75.0,1.0,1.0,0.00037,0.00039
4,11,Setting Up HTTP(S) Load Balancing,78.0,1.0,1.0,0.00037,0.00039
5,11,Setting Up HTTP(S) Load Balancing,1117.0,1.0,1.0,0.00037,0.00039
6,2154,,,0.707107,0.707107,,
7,720,,,0.60833,0.60833,,
8,562,What exercise does to your bones,602.0,0.593732,0.593732,0.000318,0.000335
9,562,What exercise does to your bones,6.0,0.593732,0.593732,0.000318,0.000335


In [129]:
hybrid.shape

(28241, 7)

In [130]:
hybrid['final_score'] = (hybrid['collaborative_score_normalized'] + hybrid['content_score_normalized'])/2

In [131]:
hybrid.head()

Unnamed: 0,item_no,title,consumer_no,score_x,collaborative_score_normalized,score_y,content_score_normalized,final_score
0,11,Setting Up HTTP(S) Load Balancing,6.0,1.0,1.0,0.00037,0.00039,0.500195
1,11,Setting Up HTTP(S) Load Balancing,72.0,1.0,1.0,0.00037,0.00039,0.500195
2,11,Setting Up HTTP(S) Load Balancing,43.0,1.0,1.0,0.00037,0.00039,0.500195
3,11,Setting Up HTTP(S) Load Balancing,75.0,1.0,1.0,0.00037,0.00039,0.500195
4,11,Setting Up HTTP(S) Load Balancing,78.0,1.0,1.0,0.00037,0.00039,0.500195


In [132]:
hybrid = hybrid[['item_no','title','consumer_no','final_score']]

In [133]:
hybrid.head()

Unnamed: 0,item_no,title,consumer_no,final_score
0,11,Setting Up HTTP(S) Load Balancing,6.0,0.500195
1,11,Setting Up HTTP(S) Load Balancing,72.0,0.500195
2,11,Setting Up HTTP(S) Load Balancing,43.0,0.500195
3,11,Setting Up HTTP(S) Load Balancing,75.0,0.500195
4,11,Setting Up HTTP(S) Load Balancing,78.0,0.500195


## Final Recommendations

In [134]:
final = hybrid[['item_no','title','final_score']]

In [135]:
final.duplicated().sum()

25981

In [136]:
final = final.drop_duplicates()

In [137]:
final = final.sort_values(by = 'final_score', ascending= False)

In [138]:
# These are the top 5 recommendations 
final.head(5)

Unnamed: 0,item_no,title,final_score
0,11,Setting Up HTTP(S) Load Balancing,0.500195
15522,1704,O homofóbico dentro de cada um,0.5
14,987,Nokia vai voltar ao mercado de smartphones,0.354475
143,21,O potencial do bitcoin na América Latina,0.297856
8,562,What exercise does to your bones,0.297034


In [139]:
# These are the top top recommendations
final.head(10)

Unnamed: 0,item_no,title,final_score
0,11,Setting Up HTTP(S) Load Balancing,0.500195
15522,1704,O homofóbico dentro de cada um,0.5
14,987,Nokia vai voltar ao mercado de smartphones,0.354475
143,21,O potencial do bitcoin na América Latina,0.297856
8,562,What exercise does to your bones,0.297034
18,461,"Finally, CSS In JavaScript! Meet CSSX - Smashi...",0.254928
33,1711,Land Rover's lead engineer explains autonomous...,0.239847
837,1218,TEDxSãoPaulo,0.239752
784,1669,Pequeno guia para entender as línguas de sinais,0.238943
236,338,Campanha do Dicionário Houaiss pretende mudar ...,0.237804


## Model Evaluation

### Model Evaluation on Train Set

In [140]:
# Checking the recommendation accuracies for the train set
model_evaluation(11,hybrid)

{'recall_top_five': 0.37, 'recall_top_ten': 0.68}

### Making recommendations on the test set

In [144]:
content_output_test = item_recommendation('The Broken Window Theory')

In [145]:
content_output_test= content_output_test.drop_duplicates()

In [146]:
content_df_test = pd.DataFrame(content_output_test)

In [147]:
content_df_test.reset_index(inplace=True)

In [148]:
content_df_test.columns = ['title', 'score']

In [149]:
content_df_test['content_score_normalized'] = (content_df_test['score']-min(content_df_test['score'])) / (max(content_df_test['score']) - min(content_df_test['score']))

In [150]:
content_df_test.head()

Unnamed: 0,title,score,content_score_normalized
0,O homofóbico dentro de cada um,0.949535,1.0
1,[Books] Biblioteca FIXE: Sugestões de Leitura,0.324373,0.341612
2,"SINAIS do FUTURO imediato: #1, internet das co...",0.309699,0.326158
3,"Por dentro do Nubank, conheça os segredos da f...",0.286833,0.302077
4,A saga do jovem CEO da Dasa,0.284552,0.299675


### Collaborative Recommendation

In [169]:
item_index_test = 1951

In [170]:
item_prediction.iloc[item_index_test]

0       0.094987
1       0.000000
2       0.205196
3       0.127317
4       0.269177
          ...   
2235    0.000000
2236    0.000000
2237    0.000000
2238    0.000000
2239    0.000000
Name: 1951, Length: 2240, dtype: float64

In [171]:
item_recommendation_test = pd.DataFrame(item_prediction.iloc[item_index].sort_values(ascending=False))

In [172]:
item_recommendation_test.reset_index(inplace=True)

In [173]:
item_recommendation_test.columns = ['item_no','score']

In [174]:
merged_test = pd.merge(item_recommendation_test, merged,  on='item_no', how='left')

In [175]:
merged_test.head()

Unnamed: 0,item_no,score,interaction_type,consumer_no,title,text_description,language
0,11,1.0,5.0,6.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
1,11,1.0,5.0,72.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
2,11,1.0,5.0,43.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
3,11,1.0,5.0,75.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en
4,11,1.0,5.0,78.0,Setting Up HTTP(S) Load Balancing,HTTP(S) load balancing provides global load ba...,en


In [176]:
merged_test = merged_test.drop(['text_description','language'], axis =1)

In [179]:
merged_test['collaborative_score_normalized'] = (merged_test['score']-min(merged_test['score']))/(max(merged_test['score'])-min(merged_test['score']))

In [180]:
merged_test.head()

Unnamed: 0,item_no,score,interaction_type,consumer_no,title,collaborative_score_normalized
0,11,1.0,5.0,6.0,Setting Up HTTP(S) Load Balancing,1.0
1,11,1.0,5.0,72.0,Setting Up HTTP(S) Load Balancing,1.0
2,11,1.0,5.0,43.0,Setting Up HTTP(S) Load Balancing,1.0
3,11,1.0,5.0,75.0,Setting Up HTTP(S) Load Balancing,1.0
4,11,1.0,5.0,78.0,Setting Up HTTP(S) Load Balancing,1.0


In [181]:
hybrid_test = pd.merge(merged_test, content_df_test, on = 'title' , how = 'left')

In [182]:
hybrid_test['final_score'] = (hybrid_test['collaborative_score_normalized'] + hybrid_test['content_score_normalized'])/2

In [183]:
hybrid_test = hybrid_test[['item_no','title','consumer_no','final_score']]

### Model Evaluation on Test Set

In [186]:
# Checking the recommendation accuracies for item_no 1951 from the test set
model_evaluation(1951,hybrid_test)

{'recall_top_five': 0.35, 'recall_top_ten': 0.63}