In [1]:
import pandas as pd
import numpy as np

In [2]:
# loading the dataset from csv
raw = pd.read_csv(
    "../dataset/anonymous-msweb.data", 
    names=['attribute', 'ID', 'ignore', 'title', 'url'], 
    engine="python", 
    skiprows=7
)
raw.head()

Unnamed: 0,attribute,ID,ignore,title,url
0,A,1287,1,International AutoRoute,/autoroute
1,A,1288,1,library,/library
2,A,1289,1,Master Chef Product Information,/masterchef
3,A,1297,1,Central America,/centroam
4,A,1215,1,For Developers Only Info,/developer


In [3]:
# Filtering the data needed and printing the type of data present
data = raw[['attribute', 'ID', 'title', 'url']]
data.groupby(by='attribute').count()

Unnamed: 0_level_0,ID,title,url
attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,294,294,294
C,32711,0,0
V,98654,0,0


In [4]:
# Printing all the attribute lines
attributes = data[data['attribute'] == 'A']
attributes = attributes[['ID', 'title', 'url']]
attributes.head()

Unnamed: 0,ID,title,url
0,1287,International AutoRoute,/autoroute
1,1288,library,/library
2,1289,Master Chef Product Information,/masterchef
3,1297,Central America,/centroam
4,1215,For Developers Only Info,/developer


In [5]:
# Printing the websites and the users
websites = data[data['attribute'] == 'V']
users = data[data['attribute'] == 'C']
print("Websites")
print(websites.head())
print("\nUsers")
print(users.head())

Websites
    attribute    ID title   url
295         V  1000  None  None
296         V  1001  None  None
297         V  1002  None  None
299         V  1001  None  None
300         V  1003  None  None

Users
    attribute     ID title   url
294         C  10001  None  None
298         C  10002  None  None
301         C  10003  None  None
305         C  10004  None  None
307         C  10005  None  None


In [6]:
# Count the website visits
website_visits = websites[['ID', 'attribute']].groupby(by='ID').count()
website_visits.columns=['visits']
website_visits.sort_values(by='visits', ascending=False)
website_visits = pd.merge(website_visits, attributes, left_index=True, right_on="ID")
website_visits.sort_values(by='visits', ascending=False).head()

Unnamed: 0,visits,ID,title,url
57,10836,1008,Free Downloads,/msdownload
283,9383,1034,Internet Explorer,/ie
17,8463,1004,Microsoft.com Search,/search
287,5330,1018,isapi,/isapi
212,5108,1017,Products,/products


In [7]:
# Print the websites with 100 visits or more which will be called as popular_websites
popular_websites = website_visits[website_visits['visits'] >= 100]
print(popular_websites.shape)
popular_websites.head()

(104, 4)


Unnamed: 0,visits,ID,title,url
268,912,1000,regwiz,/regwiz
78,4451,1001,Support Desktop,/support
217,749,1002,End User Produced View,/athome
30,2968,1003,Knowledge Base,/kb
17,8463,1004,Microsoft.com Search,/search


In [8]:
# user_webvisits is the user + web visit data
user_webvisits = raw[raw['attribute'] != 'A']
print(user_webvisits.shape)
user_webvisits = user_webvisits[['attribute', 'ID']]
webvisits = []
user = ''
for row in user_webvisits.values:
    if row[0] == 'C':
        user = row[1]
    elif row[0] == 'V':
        webvisits.append([user, row[1]])
        
webvisits = pd.DataFrame(webvisits, columns=['user', 'ID'])
webvisits.head()

(131365, 5)


Unnamed: 0,user,ID
0,10001,1000
1,10001,1001
2,10001,1002
3,10002,1001
4,10002,1003


In [9]:
# Merging the user, webvisits and URL data
attributes = pd.merge(attributes, popular_websites, on="ID")
attributes = attributes[['ID', 'visits', 'title_x', 'url_x']]
attributes.columns = ['ID', 'visits', 'title', 'url']
attributes.sort_values(by='ID').head()
data = pd.merge(webvisits, attributes, on="ID")
data['values'] = 1
data.sort_values(by='user').head()

Unnamed: 0,user,ID,visits,title,url,values
0,10001,1000,912,regwiz,/regwiz,1
912,10001,1001,4451,Support Desktop,/support,1
5363,10001,1002,749,End User Produced View,/athome,1
913,10002,1001,4451,Support Desktop,/support,1
6112,10002,1003,2968,Knowledge Base,/kb,1


# Item based collaborative filtering

In [10]:
# Forming the item utility matrix
item_utility_matrix = data.pivot(index='user', columns='url', values='values').fillna(value=0)
item_utility_matrix.head()

url,/access,/accessdev,/activeplatform,/activex,/athome,/australia,/automap,/backoffice,/brasil,/canada,...,/visualc,/visualj,/vstudio,/win32dev,/windows,/windows95,/windowsce,/windowssupport,/word,/workshop
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Recommending similar items to URL '/athome'
item = item_utility_matrix['/athome']
similarItems = item_utility_matrix.corrwith(item)
similarItems.sort_values(ascending=False).head()

url
/athome            1.000000
/support           0.076860
/windowssupport    0.067837
/moneyzone         0.056557
/windows           0.050309
dtype: float64

# User based collaborative filtering

In [12]:
# Forming the user utility matrix
user_utility_matrix = item_utility_matrix.T
user_utility_matrix.head()

user,10001,10002,10003,10005,10006,10007,10008,10009,10010,10011,...,42702,42703,42704,42705,42706,42707,42708,42709,42710,42711
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/access,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/accessdev,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/activeplatform,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/activex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/athome,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Similar users to user 10011
u = 10011
user = user_utility_matrix[u]
similarUsers = user_utility_matrix.corrwith(user, axis='index').dropna().sort_values(ascending=False)
similarUsers.drop(u, inplace=True)
similarUsers.head()

user
21000    0.74000
16662    0.70014
17563    0.70014
37130    0.70014
21724    0.70014
dtype: float64

In [14]:
# Similar users recommended URLs
df = pd.DataFrame(similarUsers)
users = pd.DataFrame(user_utility_matrix[df.iloc[0].name])
users.columns=['visited']
users = users[users['visited'] == 1]
recommended_websites = pd.merge(users, popular_websites, left_index=True, right_on="url")
recommended_websites = recommended_websites.sort_values(by='visits', ascending=False)
recommended_websites.head()

Unnamed: 0,visited,visits,ID,title,url
287,1.0,5330,1018,isapi,/isapi
212,1.0,5108,1017,Products,/products
78,1.0,4451,1001,Support Desktop,/support
138,1.0,287,1016,MS Excel,/excel


In [15]:
# Recommending the website to user which he or she has not visited
website_urls = recommended_websites['url']
visited_sites = pd.DataFrame(user[user == 1])
print('Similar users urls for recommendation:', list(website_urls))
print('URLs visited by user:', list(visited_sites.index.values))
print('Recommended websites:', list(set(website_urls).difference(set(visited_sites.index.values))))


Similar users urls for recommendation: ['/isapi', '/products', '/support', '/excel']
URLs visited by user: ['/excel', '/isapi', '/mspowerpoint', '/products']
Recommended websites: ['/support']


In [16]:
# Recommend a user without history with the top visited websites
new_user_recommendations = popular_websites.sort_values(by='visits', ascending=False).head()
list(new_user_recommendations['url'])

['/msdownload', '/ie', '/search', '/isapi', '/products']