# Cleanup
The purpose of this notebook is to perform various checks and cleanup steps on our refined dataset of papers and also on the authors dataset. The result is two cleaned datasets that are saved in the `processed` subdirectory.

In [None]:
import swifter

import os
import json
import time
import re
from datetime import datetime

from collections import defaultdict

import string

import matplotlib.pyplot as plt

import random
import requests
from itertools import chain
from more_itertools import sliced

import pandas as pd
from matplotlib.pylab import plt
import numpy as np

from glob import glob, iglob
from pathlib import Path
                         
from loguru import logger
from IPython.display import display, clear_output

from multiprocessing import Pool

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import words
# nltk.download('words')
from nltk.stem import WordNetLemmatizer

import enchant

import seaborn as sns

sns.set_context('paper')

!pwd


# Setup

## Datasets and files

In [None]:
# Input datasets: the refined papers file and the raw authors file.
papers_dataset = '../data/processed/2100_refined_recsys_papers.feather'
authors_dataset = '../data/raw/2000_recsys_authors.feather'

# Output paths for the cleaned datasets written at the end of the notebook.
clean_papers_dataset = '../data/processed/2200_recsys_papers_cleaned.feather'
clean_authors_dataset = '../data/processed/2200_recsys_authors_cleaned.feather'



## Load the main papers/authors

In [None]:
# Load the papers dataset and show its shape together with the sum of the
# `is_recsys_paper` column.
papers_df = pd.read_feather(papers_dataset)
papers_df.shape, papers_df['is_recsys_paper'].sum()

In [None]:
# Load the authors dataset and show its shape.
authors_df = pd.read_feather(authors_dataset)
authors_df.shape

In [None]:
# Duplicates check: keep a single row per paperId (the first occurrence).
papers_df = papers_df.drop_duplicates(subset=['paperId'])
papers_df

# Check for (and repair) missing author papers
One of the issues I have noticed is that the authors dataframe does not have a complete account of publications. There are some papers that can be found in the papers dataframe with a given author that are missing from that author's paper list. We can repair this (at least in part).

In [None]:
# Build a Series, indexed by authorId, holding the unique paperIds whose
# `authors` list mentions that author (derived from the papers dataframe).
papers_and_authors_df = (
    papers_df
    .set_index('paperId')['authors']
    .explode().dropna().reset_index()
    .rename(columns={'authors': 'authorId'})
    .drop_duplicates()
    .groupby('authorId')['paperId']
    .apply(lambda g: np.unique(g.values))
)


# Attach the derived lists to the authors dataframe, aligned on authorId.
authors_df = authors_df.set_index('authorId')
authors_df['alt_papers'] = papers_and_authors_df
# Any value that is not an ndarray (e.g. the missing value left for an author
# with no derived papers) is replaced by an empty list.
authors_df['alt_papers'] = authors_df['alt_papers'].map(
    lambda papers: papers if type(papers) is np.ndarray else []
)

# Restore authorId as a regular column.
authors_df = authors_df.reset_index('authorId')

authors_df

In [None]:

# Combine each author's original `papers` list with the derived `alt_papers`
# list; np.unique returns the sorted, de-duplicated ids of the two.
authors_df['updated_papers'] = authors_df.swifter.apply(
    lambda author: np.unique(np.concatenate([author['papers'], author['alt_papers']])),
    axis=1
)

# Number of papers in the combined list.
authors_df['num_papers'] = authors_df['updated_papers'].map(len)

authors_df

In [None]:
# Compare totals: the `paperCount` column, the original `papers` lists and the
# combined `updated_papers` lists.
authors_df['paperCount'].sum(), authors_df['papers'].map(len).sum(), authors_df['num_papers'].sum()

# Check (and repair) the author lists of papers
A slightly different issue is apparent for papers in that some papers list duplicate authors. We follow the same approach as above to resolve this.

In [None]:
# Build a Series, indexed by paperId, holding the unique authorIds whose
# original `papers` list mentions that paper (derived from the authors dataframe).
authors_and_papers_df = (
    authors_df
    .set_index('authorId')['papers']
    .explode().dropna().reset_index()
    .rename(columns={'papers': 'paperId'})
    .drop_duplicates()
    .groupby('paperId')['authorId']
    .apply(lambda g: np.unique(g.values))
)

# Attach the derived lists to the papers dataframe, aligned on paperId.
papers_df = papers_df.set_index('paperId')
papers_df['alt_authors'] = authors_and_papers_df
# Any value that is not an ndarray (e.g. the missing value left for a paper
# with no derived authors) is replaced by an empty list.
papers_df['alt_authors'] = papers_df['alt_authors'].swifter.apply(
    lambda authors: authors if type(authors) is np.ndarray else list()
)

# Restore paperId as a regular column.
papers_df = papers_df.reset_index('paperId')

papers_df

In [None]:
def _merge_author_ids(paper):
    """Return the paper's author ids without duplicates, in original order.

    Args:
        paper: A row of ``papers_df`` providing the ``authors`` and
            ``alt_authors`` id sequences.

    Returns:
        list: The ids from ``authors`` in their original order with repeats
        removed, followed by any ids that only appear in ``alt_authors``.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates
    # while preserving the order in which the ids were first seen.
    merged = dict.fromkeys(paper['authors'])
    merged.update(dict.fromkeys(paper['alt_authors']))
    return list(merged)


# Filtering the original list by membership in the union keeps every entry
# (repeats included) and discards ids that only occur in `alt_authors`, so the
# merge is done explicitly instead.
papers_df['updated_authors'] = papers_df.swifter.apply(_merge_author_ids, axis=1)

# Number of distinct authors per paper after the repair.
papers_df['num_authors'] = papers_df['updated_authors'].map(len)

papers_df.filter(like='authors')

In [None]:
# Compare totals: the `authorCount` column, the original `authors` lists and
# the repaired `updated_authors` lists.
# There are some duplicate authors in the original lists; the intent is that
# these are removed, so the repaired total should be lower.
# NOTE(review): verify this against the displayed totals.
papers_df['authorCount'].sum(), papers_df['authors'].map(len).sum(), papers_df['num_authors'].sum()

# Add author names to papers for convenience
Normally we will be dealing with author ids but it is useful to have the corresponding names from the authors dataset in the main papers dataset.

In [None]:
# Build the authorId -> name lookup once as a plain dict. A dict lookup is far
# cheaper than a DataFrame row lookup per author id, and it always yields a
# single name even if an authorId happens to be repeated in authors_df.
author_name_by_id = dict(zip(authors_df['authorId'], authors_df['name']))
all_author_ids = set(author_name_by_id)

# Names are listed in the same order as the ids; ids that are not present in
# the authors dataset are skipped.
papers_df['author_names'] = papers_df['updated_authors'].swifter.apply(
    lambda author_ids: [
        author_name_by_id[author_id]
        for author_id in author_ids
        if author_id in author_name_by_id
    ]
)

# Filter on 'author' (not 'authors') so the new `author_names` column is shown.
papers_df.filter(like='author')

# Clean Venues
This is an attempt to clean up some of the messiness that is the venues column. We produce a `clean_venue` column which includes a simplified venue text which facilitates additional matches. It doesn't have a massive effect but it is straightforward to do and means that >90% of our papers can be associated with a venue that has >15 papers. This means that <10% of papers are associated with a venue with fewer papers. This could be due to inconsistencies with the way venues are coded or they could just be very small or once-off venues.

## Simplify venue text

In [None]:
def remove_punctuation(text):
    """Strip punctuation characters from ``text``.

    The removed set is the ASCII punctuation characters except the hyphen
    (which is kept), plus the typographic apostrophe.

    Args:
        text (str): The input string.

    Returns:
        str: ``text`` with the unwanted characters deleted.
    """
    unwanted = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~' + "’"

    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, unwanted))

    return text.translate(deletions)

def remove_four_digit_years(text):
    """Delete standalone four-digit numbers (such as years) from ``text``.

    Args:
        text (str): The input string.

    Returns:
        str: The text without the four-digit numbers, with runs of whitespace
        collapsed to a single space and the ends trimmed.
    """
    # Word boundaries ensure only numbers of exactly four digits are matched.
    without_years = re.compile(r'\b\d{4}\b').sub('', text)

    # Tidy up the gaps left behind by the removed numbers.
    return re.compile(r'\s+').sub(' ', without_years).strip()

def remove_ordinals(text):
    """
    Strip ordinal numbers such as "1st", "22nd" or "10th" from a string.

    Only lowercase suffixes (st, nd, rd, th) are recognised.

    Args:
    text (str): The input string.

    Returns:
    str: The string without ordinals, with runs of whitespace collapsed to a
    single space and the ends trimmed.
    """
    ordinal = re.compile(r'\b\d+(?:st|nd|rd|th)\b')
    whitespace = re.compile(r'\s+')

    without_ordinals = ordinal.sub('', text)

    # Tidy up the gaps left behind by the removed ordinals.
    return whitespace.sub(' ', without_ordinals).strip()


STOPWORDS = list(STOPWORDS) + ['proceedings']

def remove_stopwords(text, stopwords=STOPWORDS):
    """Drop stopwords, pure-digit tokens and very short tokens from ``text``.

    Args:
        text (str): Whitespace-separated text.
        stopwords (iterable of str): Words to remove. Defaults to the
            module-level ``STOPWORDS`` list.

    Returns:
        str: The remaining words (longer than two characters, not all digits
        and not in ``stopwords``) joined by single spaces.
    """
    # Honour the caller-supplied stopwords (the parameter used to be ignored in
    # favour of the global) and use a set for constant-time membership tests.
    excluded = set(stopwords)

    return ' '.join(
        word
        for word in text.split()
        if word not in excluded and not word.isdigit() and len(word) > 2
    )


def lemmatize(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Args:
        text (str): The input string.

    Returns:
        str: The lemmatized words joined by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = map(wordnet_lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)


def clean_venue(venue):
    """Produce a simplified venue string that is easier to match.

    The venue is lower-cased, then punctuation, ordinals, four-digit years and
    stopwords are removed, the remaining words are lemmatized and every
    occurrence of 'international ' is dropped.

    Args:
        venue (str): The raw venue text; may be empty or missing.

    Returns:
        str or float: The simplified venue, or ``np.nan`` when the venue is
        missing (an empty string or a non-string value such as None/NaN).
    """
    # Treat None/NaN the same as an empty venue instead of failing on .lower().
    if not isinstance(venue, str) or venue == '':
        return np.nan

    venue = venue.lower()

    venue = remove_punctuation(venue)

    venue = remove_ordinals(venue)

    venue = remove_four_digit_years(venue)

    venue = remove_stopwords(venue)

    venue = lemmatize(venue)

    venue = venue.replace('international ', '')

    return venue


papers_df['clean_venue'] = papers_df['venue'].swifter.apply(clean_venue)
papers_df

## Swap venue titles for abbreviations for popular venues
This is useful later when we want to produce graphs with venues on the axis. It avoids very long text strings.

In [None]:
def fix_venue(df, from_venue, to_venue):
    """Swap one exact ``clean_venue`` value for another.

    Args:
        df (pd.DataFrame): Dataframe with a ``clean_venue`` column.
        from_venue (str): The venue text to look for (exact match).
        to_venue (str): The replacement text.

    Returns:
        np.ndarray: The ``clean_venue`` values with every ``from_venue`` entry
        replaced by ``to_venue``; ``df`` itself is not modified.
    """
    current = df['clean_venue']
    is_match = current == from_venue

    return np.where(is_match, to_venue, current)

# (from_venue, to_venue) pairs. Each `from_venue` is an exact `clean_venue`
# value (i.e. the text after clean_venue() has been applied) and `to_venue` is
# the short label that replaces it.
venue_swaps = [

    ('annual acm sigir conference research development information retrieval', 'sigir'),
    ('conference information knowledge management', 'cikm'),
    ('user modeling adaptation personalization', 'umap'),
    ('knowledge discovery data mining', 'kdd'),
    ('web search data mining', 'wsdm'),
    ('aaai conference artificial intelligence', 'aaai'),
    ('joint conference artificial intelligence', 'ijcai'),
    ('user modeling user-adapted interaction', 'umuai'),
    ('ieee transaction knowledge data engineering', 'tkde'),
    ('conference intelligent user interface', 'iui'),
    ('european conference information retrieval', 'ecir'),
    ('industrial conference data mining', 'icdm'),
    ('florida research society', 'flairs'),
    ('conference web information system technology', 'wist'),
    ('ieee joint conference neural network', 'ijcnn'),
    ('applied intelligence boston', 'applied intelligence'),
    ('ieee conference big data big data', 'ieee big data'),
    ('ieee conference big data', 'ieee big data'),
    ('web conference', 'web'),
    ('journal research applied science engineering technology', 'ijraset'),
    ('conference electronic commerce web technology', 'ecweb'),
    ('acm symposium applied computing', 'sac'),
    ('expert system application', 'expert systems with applications'),
    ('knowledge-based system', 'kbs'),
    ('italian information retrieval workshop', 'iir'),
    ('journal physic conference series', 'jpcs'),
    ('multimedia tool application', 'multimedia tools and applications'),

    ('neural information processing system', 'nips'),
    ('conference machine learning', 'icml'),
    ('annual meeting association computational linguistics', 'acl'),
    ('plo one', 'plos one'),
    ('conference human factor computing system', 'sigchi'),
    ('conference empirical method natural language processing', 'emnlp'),
    ('ieee conference acoustic speech signal processing', 'icassp'),
    ('social science research network', 'ssrn'),
    ('computer vision pattern recognition', 'cvpr'),
    ('concurrency computation', 'ccpe'),
    # Systems, Man and Cybernetics is abbreviated SMC (was mistyped as 'scm').
    ('ieee conference system man cybernetics', 'smc'),
    ('global communication conference', 'globecom'),
    ('neural computing application print', 'neural computing and applications'),
    ('ieee transaction vehicular technology', 'trans vehicular technology'),
    ('ieee internet thing journal', 'iotj'),
    ('italian national conference sensor', 'sensors'),
    ('conference learning representation', 'iclr'),
    ('ieee conference data engineering', 'icde'),

    # Several spellings of the same venue are collapsed onto one label.
    ('chi extended abstract', 'sigchi'),
    ('chi conference companion', 'sigchi'),
    ('sigchi conference human factor computing system', 'sigchi'),
    ('extended abstract chi conference human factor computing system', 'sigchi'),
    ('chi extended abstract human factor computing system', 'sigchi'),
    ('chi conference human factor computing system', 'sigchi'),

    ('communication acm', 'cacm'),
    ('acm cacm', 'cacm'),

    ('acm trans interact intell syst', 'transaction interactive intelligent system'),

    # Both spellings must map to the same label (the first was mistyped as 'toism').
    ('acm trans inf syst', 'tois'),
    ('transaction information system', 'tois'),

    ('acm trans internet techn', 'toit'),
    ('transaction internet technology', 'toit'),

    ('acm trans multim comput commun appl', 'tomm'),

    ('acm transaction web', 'tweb'),

    ('web information system engineering', 'wise'),

    ('siam data mining', 'sdm'),

    ('acm transaction knowledge discovery data', 'tkdd'),
    ('transaction knowledge discovery data', 'tkdd'),

    ('acm transaction recommender system', 'tors'),
    ('transaction recommender system', 'tors'),

    ('acm conference recommender system', 'acm recsys'),
    ('conference recommender system', 'acm recsys'),

]

# Apply the swaps one at a time, in list order.
for venue_swap in venue_swaps:
    papers_df['clean_venue'] = fix_venue(papers_df, *venue_swap)

papers_df

# Combine/Update Citations
Replace the citations of papers with missing citations with the recently scraped citations.

## Produce an `updated_citations` column
This column stores the best estimate of the citations we have for a paper. Either it is the original set of citations that were produced in the original crawl of papers or it is based on the citations that were later separately scraped because the original citations were missing or grossly incomplete.

Note, we cannot treat the original citation count as true as the updated citations often exceed it.

In [None]:
# Boolean masks marking the rows that have an original and/or a scraped
# citation list.
with_orig_citations = papers_df['citations'].notnull()
with_scraped_citations = papers_df['scraped_citations'].notnull()


# If we have original and scraped citations then get the union.
# np.union1d returns the sorted, unique ids found in either list.
papers_df.loc[with_orig_citations & with_scraped_citations, 'updated_citations'] = (
    papers_df[with_orig_citations & with_scraped_citations]
    .swifter.apply(lambda row: list(np.union1d(row['citations'], row['scraped_citations'])), axis=1)
)

# Rows with only one of the two sources take that source's list as-is.
papers_df.loc[with_orig_citations & ~with_scraped_citations, 'updated_citations'] = papers_df[with_orig_citations & ~with_scraped_citations]['citations']
papers_df.loc[~with_orig_citations & with_scraped_citations, 'updated_citations'] = papers_df[~with_orig_citations & with_scraped_citations]['scraped_citations']

# NOTE(review): len() is applied to every row here, which assumes no row has a
# missing `citations` value and that every row received an `updated_citations`
# list above — confirm against the data.
papers_df['orig_citation_count'] = papers_df['citations'].map(len)
papers_df['updated_citation_count'] = papers_df['updated_citations'].map(len)

# Compare totals: the `citationCount` column, the original lists and the
# updated lists.
papers_df[['citationCount', 'orig_citation_count', 'updated_citation_count']].sum()

## Get the years associated with each citation
That is, get the year of the citing paper and add as a new column. Each value in this new column will be a year and the sequence of years will correspond to the sequence of citations in the main citations column.

In [None]:
# A copy of the papers dataframe indexed by paperId, used by
# get_citation_years to look up publication years by paper id.
papers_df_by_paper_id = papers_df.set_index('paperId')

def is_number(string):
    """
    Tell whether a string is an unsigned integer or decimal number.

    Accepted forms are one or more digits, optionally followed by a decimal
    point and one or more digits. Signs, exponents and a leading or trailing
    decimal point are not accepted.

    Args:
    string (str): The input string.

    Returns:
    bool: True if the whole string is such a number, False otherwise.
    """
    # fullmatch returns None unless the entire string matches the pattern.
    return re.fullmatch(r'^\d+(\.\d+)?$', string) is not None


def get_citation_years(paper_ids):
    """Look up the publication years of the given papers.

    Years are read from the ``year`` column of the module-level
    ``papers_df_by_paper_id`` dataframe.

    Args:
        paper_ids (sequence of str): Paper ids to look up.

    Returns:
        An empty list when ``paper_ids`` is empty; otherwise an array with the
        years of the papers, in the order of ``paper_ids``, with missing years
        (unknown ids or papers without a year) removed.
    """
    # len() rather than truthiness: paper_ids may be a numpy array.
    if len(paper_ids) == 0:
        return []

    # Select the year column before reindexing so that only that one column is
    # aligned to the requested ids, instead of every column of the dataframe.
    years = papers_df_by_paper_id['year'].reindex(paper_ids).values

    # Remove any missing years; there will be a few but usually a very small fraction only.
    return years[~np.isnan(years)]


papers_df['citation_years'] = papers_df['updated_citations'].swifter.apply(get_citation_years)

papers_df

# Add Publication Years to the Authors DF
Similar to above for citations but it is the publication year of the author publications in the authors dataset.

## Add publication years

In [None]:
# Reuse get_citation_years to look up the year of each paper in the author's
# updated paper list (papers without a known year are dropped).
authors_df['publication_years'] = authors_df['updated_papers'].swifter.apply(get_citation_years)
authors_df.head()

## Mark the RecSys Papers for authors

In [None]:
# Set of ids of the papers whose `is_recsys_paper` flag is set, for fast
# membership tests; display how many there are.
recsys_paper_ids = set(papers_df[papers_df['is_recsys_paper']]['paperId'].unique())
len(recsys_paper_ids)


In [None]:
# Keep only those of the author's papers that are flagged as RecSys papers.
authors_df['recsys_publications'] = authors_df['updated_papers'].swifter.apply(
    lambda papers: [paper for paper in papers if paper in recsys_paper_ids]
)

# Number of RecSys papers per author.
authors_df['num_recsys_publications'] = authors_df['recsys_publications'].map(len)

# Publication years of the author's RecSys papers (missing years are dropped).
authors_df['recsys_publication_years'] = authors_df['recsys_publications'].swifter.apply(get_citation_years)

authors_df.head()

# Mark the English Papers
Not all papers are in English. Here is a simple approach to estimating whether a paper is written in English based on its title/abstract.

In [None]:
def remove_punctuation(text):
    """Strip every ASCII punctuation character from ``text``.

    Unlike the venue-cleaning variant defined earlier in this notebook, this
    version uses ``string.punctuation`` unchanged, so hyphens are removed too
    and the typographic apostrophe is left in place.

    Args:
        text (str): The input string.

    Returns:
        str: ``text`` with the punctuation characters deleted.
    """
    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))

    return text.translate(deletions)
    

def get_english_words(text, d=enchant.Dict("en")):
    """Split ``text`` into words and pick out the ones considered English.

    Hyphens are replaced by spaces, punctuation is removed and the text is
    lower-cased before splitting on whitespace. A word counts as English when
    it is in a small whitelist of domain terms or the dictionary accepts it.

    Args:
        text (str): The text to analyse.
        d: Dictionary object providing ``check(word)``. The default is created
            once, when the function is defined, and shared by all calls.

    Returns:
        tuple: ``(english_words, clean_words)`` where ``clean_words`` is the
        list of all cleaned words and ``english_words`` the accepted subset.
    """
    ok_words = {'recsys', 'recommender', 'movielens', 'grouplens', 'dnn', 'cnn', 'ann', 'cbr', 'ml', 'ai'}

    clean_words = remove_punctuation(text.replace('-', ' ')).lower().split()

    # `or` short-circuits, so whitelisted words skip the dictionary lookup.
    english_words = [word for word in clean_words if word in ok_words or d.check(word)]

    return english_words, clean_words

def frac_english_words(text):
    """Return the fraction of the words in ``text`` that are English.

    Args:
        text (str): The text to analyse.

    Returns:
        The number of English words divided by the number of cleaned words,
        or 0 when the text contains no words.
    """
    english_words, clean_words = get_english_words(text)

    # Guard against dividing by zero for texts without any words.
    if not clean_words:
        return 0

    return len(english_words)/len(clean_words)


# A title/abstract counts as English when at least this fraction of its words
# is recognised as English.
min_frac_english_words = 0.66

# Boolean Series: does the (lower-cased) title reach the threshold?
has_english_title = (
    papers_df['title']
    .map(lambda text: text.lower())
    .swifter.apply(frac_english_words)>=min_frac_english_words
)

# Boolean Series: does the (lower-cased) `text` column reach the threshold?
has_english_abstract = (
    papers_df['text']
    .map(lambda text: text.lower())
    .swifter.apply(frac_english_words)>=min_frac_english_words
)



# Store both flags on the papers dataframe.
papers_df['has_english_title'] = has_english_title
papers_df['has_english_abstract'] = has_english_abstract

# Fraction of papers flagged as having an English title / abstract.
has_english_title.mean(), has_english_abstract.mean()

# Generate Understandable Tokens
More tokenization experiments. We tokenize and stem the titles and abstracts without removing stopwords. We can do this later if needed but, for example, BERT doesn't want that. Also, we reverse/invert the stemmed tokens to restore a plausible word (based on mapping frequency) so that we can generate wordclouds and topics out of full words.

In [None]:
# stop_words = set(stopwords.words('english'))

# Three stemmers are instantiated in turn, each assignment replacing the
# previous one, so the Porter stemmer is the one that stays bound to `stemmer`.
stemmer = nltk.SnowballStemmer(language='english', ignore_stopwords=True)
stemmer = nltk.LancasterStemmer()
stemmer = nltk.PorterStemmer()

# Lemmatizer applied to each word before stemming (see create_token).
lemmatizer = nltk.wordnet.WordNetLemmatizer()

def extract_tokens(text):
    """Convert ``text`` into a list of tokens, one per word.

    Hyphens become spaces, the text is lower-cased and stripped of
    punctuation, and each remaining word is passed through ``create_token``.

    Args:
        text (str): The text to tokenize.

    Returns:
        list: The tokens, in word order.
    """
    normalised = text.replace('-', ' ').lower()
    words = remove_punctuation(normalised).split()

    return [create_token(word) for word in words]


# Maps each token to a dict of {word: number of times the word produced it}.
token_map = {}

def create_token(word):
    """Lemmatize and stem ``word`` and record which word produced the token.

    Each call increments the count of ``word`` under its token in the
    module-level ``token_map``.

    Args:
        word (str): The word to convert.

    Returns:
        str: The token (the stemmed lemma of ``word``).
    """
    token = stemmer.stem(lemmatizer.lemmatize(word))

    # Fetch the counts recorded for this token, starting a new record for a
    # token that has not been seen before.
    word_counts = token_map.setdefault(token, {})

    # Count this occurrence of the word.
    word_counts[word] = word_counts.get(word, 0) + 1

    return token

def reverse_tokens(tokens):
    """Map each token back to the word that most often produced it.

    Counts are read from the module-level ``token_map``; when several words
    share the highest count, the one recorded first is used.

    Args:
        tokens (list): Tokens previously returned by ``create_token``.

    Returns:
        list: One word per token, in the same order.
    """
    def most_frequent_word(token):
        word_counts = token_map[token]
        return max(word_counts, key=lambda word: word_counts[word])

    return [most_frequent_word(token) for token in tokens]

extract_tokens('Narrative Editing of Web Contexts on Online Community System with Avatar-like Agents')

In [None]:
# Tokenize the titles and keep a comma-separated string version of the tokens.
papers_df['title_tokens'] = papers_df['title'].swifter.apply(extract_tokens)
papers_df['title_tokens_as_string'] = papers_df['title_tokens'].swifter.apply(lambda tokens: ', '.join(tokens))

# Map the tokens back to full words (most frequent source word per token).
papers_df['reversed_title_tokens'] = papers_df['title_tokens'].swifter.apply(reverse_tokens)
papers_df['reversed_title_tokens_as_string'] = papers_df['reversed_title_tokens'].swifter.apply(lambda tokens: ', '.join(tokens))

papers_df.head()

In [None]:
# Tokenize the `text` column and keep a comma-separated string version of the tokens.
papers_df['text_tokens'] = papers_df['text'].swifter.apply(extract_tokens)
papers_df['text_tokens_as_string'] = papers_df['text_tokens'].swifter.apply(lambda tokens: ', '.join(tokens))

# Map the tokens back to full words (most frequent source word per token).
papers_df['reversed_text_tokens'] = papers_df['text_tokens'].swifter.apply(reverse_tokens)
papers_df['reversed_text_tokens_as_string'] = papers_df['reversed_text_tokens'].swifter.apply(lambda tokens: ', '.join(tokens))

# Show the title next to its tokens and reversed tokens for the first 20 papers.
papers_df[['title', 'title_tokens_as_string', 'reversed_title_tokens_as_string']].head(20).values

# Save the Clean Datasets
The final cleaned datasets. In the end we have 58,800 core RS papers within the larger dataset of papers.

In [None]:
# Write the cleaned papers and authors dataframes to their output files.
papers_df.to_feather(clean_papers_dataset)
authors_df.to_feather(clean_authors_dataset)

# Summary: shape and output path of each dataset, followed by the sums of the
# `is_recsys_paper` and `is_core_recsys_paper` columns.
(
    papers_df.shape, clean_papers_dataset, 
    authors_df.shape, clean_authors_dataset,

    papers_df['is_recsys_paper'].sum(), papers_df['is_core_recsys_paper'].sum()
)