In [1]:
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')

In [2]:
import pandas as pd

df = pd.DataFrame({
    'doc_id': list(map(lambda filename: '/'.join(filename.split('/')[-2:]), newsgroups_train.filenames)),
    'topic': list(map(lambda topic_id: newsgroups_train.target_names[topic_id], newsgroups_train.target)),
    'text': newsgroups_train.data,
})

df = df.set_index('doc_id')

df

Unnamed: 0_level_0,topic,text
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
rec.autos/102994,rec.autos,From: lerxst@wam.umd.edu (where's my thing)\nS...
comp.sys.mac.hardware/51861,comp.sys.mac.hardware,From: guykuo@carson.u.washington.edu (Guy Kuo)...
comp.sys.mac.hardware/51879,comp.sys.mac.hardware,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
comp.graphics/38242,comp.graphics,From: jgreen@amber (Joe Green)\nSubject: Re: W...
sci.space/60880,sci.space,From: jcm@head-cfa.harvard.edu (Jonathan McDow...
talk.politics.guns/54525,talk.politics.guns,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...
sci.med/58080,sci.med,From: bmdelane@quads.uchicago.edu (brian manni...
comp.sys.ibm.pc.hardware/60249,comp.sys.ibm.pc.hardware,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...
comp.os.ms-windows.misc/10008,comp.os.ms-windows.misc,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...
comp.sys.mac.hardware/50502,comp.sys.mac.hardware,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...


In [3]:
from nltk.tokenize import TweetTokenizer
tweet_tok = TweetTokenizer()

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def clean_text(text):
    # TODO: remove metadata
    # TODO: Rake
    return [
        stemmer.stem(word.lower())
        for word in tweet_tok.tokenize(text)
        if word.isalpha() and not word in stop_words
    ]

In [4]:
sample = list(df['text'].head(2))
list(map(clean_text, sample))

[['from',
  'thing',
  'subject',
  'what',
  'car',
  'organ',
  'univers',
  'maryland',
  'colleg',
  'park',
  'line',
  'i',
  'wonder',
  'anyon',
  'could',
  'enlighten',
  'car',
  'i',
  'saw',
  'day',
  'it',
  'door',
  'sport',
  'car',
  'look',
  'late',
  'earli',
  'it',
  'call',
  'bricklin',
  'the',
  'door',
  'realli',
  'small',
  'in',
  'addit',
  'front',
  'bumper',
  'separ',
  'rest',
  'bodi',
  'thi',
  'i',
  'know',
  'if',
  'anyon',
  'tellm',
  'model',
  'name',
  'engin',
  'spec',
  'year',
  'product',
  'car',
  'made',
  'histori',
  'whatev',
  'info',
  'funki',
  'look',
  'car',
  'pleas',
  'thank',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['from',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summari',
  'final',
  'call',
  'si',
  'clock',
  'report',
  'keyword',
  'si',
  'acceler',
  'clock',
  'upgrad',
  'd',
  'shelley',
  'organ',
  'univers',
  'washington',
  'line',
  'a',
  

In [5]:
df['text_clean'] = df['text'].apply(clean_text).apply(lambda x: ' '.join(x))

In [6]:
df

Unnamed: 0_level_0,topic,text,text_clean
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rec.autos/102994,rec.autos,From: lerxst@wam.umd.edu (where's my thing)\nS...,from thing subject what car organ univers mary...
comp.sys.mac.hardware/51861,comp.sys.mac.hardware,From: guykuo@carson.u.washington.edu (Guy Kuo)...,from guy kuo subject si clock poll final call ...
comp.sys.mac.hardware/51879,comp.sys.mac.hardware,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,from thoma e willi subject pb question organ p...
comp.graphics/38242,comp.graphics,From: jgreen@amber (Joe Green)\nSubject: Re: W...,from jgreen joe green subject re weitek organ ...
sci.space/60880,sci.space,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,from jonathan mcdowel subject re shuttl launch...
talk.politics.guns/54525,talk.politics.guns,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,from foxvog dougla subject re reword second am...
sci.med/58080,sci.med,From: bmdelane@quads.uchicago.edu (brian manni...,from brian man delaney subject brain tumor tre...
comp.sys.ibm.pc.hardware/60249,comp.sys.ibm.pc.hardware,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...,from grubb subject re ide vs scsi organ new me...
comp.os.ms-windows.misc/10008,comp.os.ms-windows.misc,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...,from subject win icon help pleas organ univers...
comp.sys.mac.hardware/50502,comp.sys.mac.hardware,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...,from stan kerr subject re sigma design doubl d...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_clean'])

print(X.shape)
print(vectorizer.get_feature_names())

(11314, 56567)


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(X, X)

In [9]:
import numpy as np

def recommend_articles(doc_id, top_n=10, similarity_matrix=similarity_matrix):
    indexed_doc_ids = pd.Series(df.index)

    matched_row_number = indexed_doc_ids[indexed_doc_ids == doc_id].index[0]

    matched_row = pd.Series(similarity_matrix[matched_row_number])

    sorted_recommendations = matched_row.sort_values(ascending=False)

    top_n_recommendations = sorted_recommendations[1:(top_n + 1)]
    
    return pd.DataFrame({
        'doc_id': indexed_doc_ids[top_n_recommendations.index].values,
        'similarity': top_n_recommendations.values
    }).set_index('doc_id')

In [10]:
recommend_articles('comp.sys.mac.hardware/51861', top_n=20)

Unnamed: 0_level_0,similarity
doc_id,Unnamed: 1_level_1
comp.sys.mac.hardware/51695,0.648966
comp.sys.mac.hardware/51674,0.440609
comp.sys.mac.hardware/51560,0.433631
comp.sys.mac.hardware/51920,0.374815
comp.sys.mac.hardware/51642,0.273637
comp.sys.mac.hardware/51906,0.269982
comp.sys.mac.hardware/51708,0.247467
comp.sys.mac.hardware/51747,0.214782
comp.sys.mac.hardware/51745,0.195968
comp.sys.mac.hardware/50551,0.189666
