In [288]:
import numpy as np
import pandas as pd
import xlrd
pd.options.mode.chained_assignment = None  # default='warn'
#pd.set_option('display.max_rows',6)

regUsr_dat = pd.read_excel("2018 summary.xlsx", sheet_name=0)
unregUsr_dat = pd.read_excel("2018 summary.xlsx", sheet_name=1)
articles = pd.read_excel("2018 summary.xlsx", sheet_name="Articles")

# Introduction

The dataset comes from our third party content service provider, it contains the contact information of the user that have read our website articles, as well as their activities on our website.  

In this report, I will summarize the user activities and articles read, in order to identify the main areas of interest of the users, as well as the performance of our articles. 

# User Activities Dataset

In [289]:
# registered user dataset info
regUsr_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557 entries, 0 to 556
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Ind_Title               447 non-null    object        
 1   Forename                547 non-null    object        
 2   Surname                 547 non-null    object        
 3   Email                   547 non-null    object        
 4   Company                 556 non-null    object        
 5   Industry                557 non-null    object        
 6   Country                 557 non-null    object        
 7   Job Role                557 non-null    object        
 8   Date Of Activity        557 non-null    datetime64[ns]
 9   Activity Type           557 non-null    object        
 10  Article Title           556 non-null    object        
 11  Article Primary Author  556 non-null    object        
 12  Primary Topic           556 non-null    object    

In [290]:
# select columns of interest
cols = ["Article Title", "Activity Type", "Date Of Activity", "Article Primary Author"]
regUsr_dat = regUsr_dat[cols]
regUsr_dat.head(1)

Unnamed: 0,Article Title,Activity Type,Date Of Activity,Article Primary Author
0,A Successful Trade Dress Protection Case Throu...,Article Read from Email Subscription,2018-02-26,Dan Chen


In [291]:
# unregistered user dataset info
unregUsr_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10402 entries, 0 to 10401
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Search Term         0 non-null      float64       
 1   Channel             3253 non-null   object        
 2   Article Title       10402 non-null  object        
 3   Activity Type       10402 non-null  object        
 4   Primary Author      10349 non-null  object        
 5   Date User Searched  10402 non-null  datetime64[ns]
 6   Primary Topic       10349 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 569.0+ KB


In [292]:
# select the same columns for unregistered users
unregUsr_dat.rename(columns={"Date User Searched": "Date Of Activity",
                             "Primary Author": "Article Primary Author"}, inplace=True)
unregUsr_dat = unregUsr_dat[cols]

In [293]:
# check if columns of both dataset match now
unregUsr_dat.columns == regUsr_dat.columns

array([ True,  True,  True,  True])

In [294]:
# Top 10 articles read by registered users
regUsr_dat["Article Title"].value_counts()[:10].to_frame()

Unnamed: 0,Article Title
A Successful Trade Dress Protection Case Through Copyright And Unfair Competition Dispute Litigation By Unitalen,90
Revision Of Anti-Unfair Competition Law Of China,88
E-Commerce Law of China Will Come into Force in January 2019,62
Fast Growth of China IP License Trade in the First Half of 2018,35
"Two Unitalen Cases Selected among ""The 50 Typical IP Cases"" by China Supreme Court",35
"Chinese ""Lafite"" Mark Recognized as an Unregistered Well-known Mark by Shanghai IP Court",27
"Contributor Page View from: Unitalen Obtains Recognition Of Well-Known Mark For ""SISLEY希思黎"" Through Litigation",22
SIPO Changes English Translation to CNIPA,20
BSA: China Found with the Biggest Fall in Piracy Rate,18
TRAB: Simplified Document Requirement for Trademark Refusal Review,17


In [295]:
# Top 10 articles read by unregistered users
unregUsr_dat["Article Title"].value_counts()[:10].to_frame()

Unnamed: 0,Article Title
"Two Unitalen Cases Selected among ""The 50 Typical IP Cases"" by China Supreme Court",1394
"Chinese ""Lafite"" Mark Recognized as an Unregistered Well-known Mark by Shanghai IP Court",1048
SIPO Changes English Translation to CNIPA,584
BSA: China Found with the Biggest Fall in Piracy Rate,564
Obtaining a Cambodian Patent by Registration of Chinese Patent in Cambodia,531
A Successful Trade Dress Protection Case Through Copyright And Unfair Competition Dispute Litigation By Unitalen,449
Unitalen Helped Client Achieved Settlement in Patent Infringement Suit,447
Unitalen Client In Two-Dimensional Code Patent Invalidation Administrative Litigation,443
Revision Of Anti-Unfair Competition Law Of China,438
E-Commerce Law of China Will Come into Force in January 2019,420


# Combine User Activities

In [296]:
# combine registered and unregistered users activities
df = pd.concat([regUsr_dat, unregUsr_dat], ignore_index=True)
df.head(1)

Unnamed: 0,Article Title,Activity Type,Date Of Activity,Article Primary Author
0,A Successful Trade Dress Protection Case Throu...,Article Read from Email Subscription,2018-02-26,Dan Chen


# Articles dataset

In [297]:
# articles dataset info
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Article Title      32 non-null     object
 1   Article Type       32 non-null     object
 2   Matter of Concern  32 non-null     object
dtypes: object(3)
memory usage: 896.0+ bytes


In [298]:
articles['Article Title'] = articles['Article Title'].str.strip()
df['Article Title'] = df['Article Title'].str.strip()

In [301]:
# check how many unique articles found in user activities
len(df['Article Title'].unique())

109

It's wierd there are much more unique articles in user activities dataset than the articles we have. Let's explore why.

In [303]:
# show article titles that are not found in `aritles` dataset
uniqueArticles = pd.Series(df['Article Title'].unique())
uniqueArticles[~uniqueArticles.isin(articles['Article Title'])].to_list() # limit to first 10 results

['Contributor Page View from: Unitalen Obtains Recognition Of Well-Known Mark For "SISLEY希思黎" Through Litigation',
 nan,
 'Author Page View Unitalen Attorneys At Law',
 'Redirect to your Website from: Chinese "Lafite" Mark Recognized as an Unregistered Well-known Mark by Shanghai IP Court',
 'Contributor Page View from: Chinese "Lafite" Mark Recognized as an Unregistered Well-known Mark by Shanghai IP Court',
 'Press Release View from: BSA: China Found with the Biggest Fall in Piracy Rate',
 'Contributor Page View from: Progress of SIPO Restructuring - Trademark Certificates Stamped with SIPO Seal from June 8th',
 'Contributor Page View from: Unitalen Won Plate Solar Collector Patent Infringement Litigation',
 'Author Page View Lei Zhao',
 'Contributor Page View from: WIPO Report: China‘s Innovation Capacity Continued To Improve',
 'Author Own Biography',
 'Author Own Biography from: Revision Of Anti-Unfair Competition Law Of China',
 'Contributor Page View from: China‘s Invention Pate

Now it's clear that the problem arises from activity types information being prefixed to the article titles of these records. Anyway, in this chapter, we only need to summarize user activities, so long the article titles are not missing due to data error, we don't need to worry about this at all.

# Group User Activities into Categories

In [304]:
# all unique 'activity type' after removing white space and using upper case
df["Activity Type"] = df["Activity Type"].str.strip().str.upper()
sorted(list(df["Activity Type"].unique()))

['ARTICLE',
 'ARTICLE ADDED TO POCKET',
 'ARTICLE READ FROM EMAIL SUBSCRIPTION',
 'ARTICLE VIEW FROM WEBSITE',
 'AUTHOR BIO',
 'AUTHOR PAGE VIEW',
 'CONTACT DETAILS REQUEST',
 'CONTRIBUTOR PAGE VIEW',
 'CONTRIBUTOR PAGE VIEW FROM ARTICLE',
 'FIRM WEBLINK',
 'FIRM WEBLINK FROM ARTICLE',
 'FORWARDED ARTICLE TO COLLEAGUE',
 'GOOGLE SEARCH ON AUTHOR',
 'GOOGLE SEARCH ON CONTRIBUTOR',
 'IN-ARTICLE CLICK-THROUGH TO YOUR WEBSITE',
 'PRESS RELEASE VIEW',
 'PRESS RELEASE VIEW FROM ARTICLE',
 'PRINTED ARTICLE',
 'REDIRECT TO YOUR WEBSITE FROM ARTICLE',
 'REFERRED BY COLLEAGUE TO THIS ARTICLE',
 'SEARCH OF YOUR DATA',
 'SOCIALMEDIA ARTICLE SHARE',
 'VISIT TO YOUR EVENT INFO PAGE FROM ARITCLE',
 'YOUR HOME PAGE']

In [305]:
# define category groups for activities
category = {'ARTICLE ADDED TO POCKET':'对文章表现出兴趣',
       'FORWARDED ARTICLE TO COLLEAGUE':'对文章表现出兴趣',
       'SOCIALMEDIA ARTICLE SHARE':'对文章表现出兴趣',
       'CONTACT DETAILS REQUEST':'对集佳或作者表现出兴趣',
       'PRINTED ARTICLE':'对集佳或作者表现出兴趣',
       'GOOGLE SEARCH ON AUTHOR':'对集佳或作者表现出兴趣',
       'SEARCH OF YOUR DATA':'对集佳或作者表现出兴趣',
       'GOOGLE SEARCH ON AUTHOR':'对集佳或作者表现出兴趣',
       'ARTICLE':'文章阅读',
       'REFERRED BY COLLEAGUE TO THIS ARTICLE':'文章阅读',
       'ARTICLE VIEW FROM WEBSITE':'文章阅读',
       'ARTICLE READ FROM EMAIL SUBSCRIPTION':'文章阅读',
       'AUTHOR BIO':'对集佳或作者表现出兴趣',
       'AUTHOR PAGE VIEW':'对集佳或作者表现出兴趣',
       'PRESS RELEASE VIEW':'查看我们更多文章',
       'REDIRECT TO YOUR WEBSITE FROM ARTICLE':'对集佳或作者表现出兴趣',
       'YOUR HOME PAGE':'对集佳或作者表现出兴趣',
       'VISIT TO YOUR EVENT INFO PAGE FROM ARITCLE':'对集佳或作者表现出兴趣',
       'CONTRIBUTOR PAGE VIEW':'对集佳或作者表现出兴趣',
       'FIRM WEBLINK FROM ARTICLE':'对集佳或作者表现出兴趣'
}

In [306]:
# match category group by activity type for one record
category.get(df["Activity Type"][0])

'文章阅读'

In [307]:
# match category groups by activity types for all records
result = [category.get(x) for x in df["Activity Type"]]
df["category"] = result
df.head(1)

Unnamed: 0,Article Title,Activity Type,Date Of Activity,Article Primary Author,category
0,A Successful Trade Dress Protection Case Throu...,ARTICLE READ FROM EMAIL SUBSCRIPTION,2018-02-26,Dan Chen,文章阅读


# Counts by Activity, Category Group

In [308]:
df["category"].value_counts()

文章阅读           10446
对集佳或作者表现出兴趣      400
对文章表现出兴趣           6
查看我们更多文章           1
Name: category, dtype: int64

In [309]:
count_df = df.groupby(['category', 'Activity Type']).size().to_frame()
count_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0
category,Activity Type,Unnamed: 2_level_1
对文章表现出兴趣,ARTICLE ADDED TO POCKET,2
对文章表现出兴趣,FORWARDED ARTICLE TO COLLEAGUE,3
对文章表现出兴趣,SOCIALMEDIA ARTICLE SHARE,1
对集佳或作者表现出兴趣,AUTHOR BIO,18
对集佳或作者表现出兴趣,AUTHOR PAGE VIEW,31
对集佳或作者表现出兴趣,CONTACT DETAILS REQUEST,2
对集佳或作者表现出兴趣,CONTRIBUTOR PAGE VIEW,5
对集佳或作者表现出兴趣,FIRM WEBLINK FROM ARTICLE,58
对集佳或作者表现出兴趣,GOOGLE SEARCH ON AUTHOR,4
对集佳或作者表现出兴趣,PRINTED ARTICLE,49


# Counts by Topic, Articles Types 

In [310]:
# Inner join user activities and articles datasets on `Article Title`
# only include records that have matching values in both datasets
# because this time, I'd exclude the activites without article titles
df = df.merge(articles, how='inner', on='Article Title')
df.rename(columns={'Matter of Concern':'Topic'}, inplace=True)
df.head(1)

Unnamed: 0,Article Title,Activity Type,Date Of Activity,Article Primary Author,category,Article Type,Topic
0,A Successful Trade Dress Protection Case Throu...,ARTICLE READ FROM EMAIL SUBSCRIPTION,2018-02-26,Dan Chen,文章阅读,Unitalen Cases,Trade Dress


In [311]:
df["Article Type"].value_counts().to_frame()

Unnamed: 0,Article Type
IP News,4338
Unitalen Cases,4286
Unitalen,1484
Attorney Article,390


In [314]:
groupby_cols = ['Article Type', 'Topic']

# display pivot table
pivot_tb = pd.pivot_table(df, values=['Article Title'], index=groupby_cols, 
                          aggfunc={'Article Title':['nunique', 'count']})
pivot_tb.columns = ['Clicks', 'Number of Articles']
pivot_tb

Unnamed: 0_level_0,Unnamed: 1_level_0,Clicks,Number of Articles
Article Type,Topic,Unnamed: 2_level_1,Unnamed: 3_level_1
Attorney Article,Joint Ventures,289,1
Attorney Article,Trademark Law,101,1
IP News,Anti-Unfair Competition Law,526,1
IP News,China IP System & Policy,1750,7
IP News,China-US,190,1
IP News,E-Commerce Law,482,1
IP News,Invention Patent,22,1
IP News,Patent Application,545,1
IP News,Piracy,582,1
IP News,Trademark Case,41,1
