In [2]:
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')

In [3]:
import pandas as pd

df = pd.DataFrame({
    'doc_id': list(map(lambda filename: '/'.join(filename.split('/')[-2:]), newsgroups_train.filenames)),
    'topic': list(map(lambda topic_id: newsgroups_train.target_names[topic_id], newsgroups_train.target)),
    'text': newsgroups_train.data,
})

df = df.set_index('doc_id')

df

Unnamed: 0_level_0,topic,text
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
rec.autos/102994,rec.autos,From: lerxst@wam.umd.edu (where's my thing)\nS...
comp.sys.mac.hardware/51861,comp.sys.mac.hardware,From: guykuo@carson.u.washington.edu (Guy Kuo)...
comp.sys.mac.hardware/51879,comp.sys.mac.hardware,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
comp.graphics/38242,comp.graphics,From: jgreen@amber (Joe Green)\nSubject: Re: W...
sci.space/60880,sci.space,From: jcm@head-cfa.harvard.edu (Jonathan McDow...
talk.politics.guns/54525,talk.politics.guns,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...
sci.med/58080,sci.med,From: bmdelane@quads.uchicago.edu (brian manni...
comp.sys.ibm.pc.hardware/60249,comp.sys.ibm.pc.hardware,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...
comp.os.ms-windows.misc/10008,comp.os.ms-windows.misc,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...
comp.sys.mac.hardware/50502,comp.sys.mac.hardware,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...


In [4]:
from nltk.tokenize import TweetTokenizer
tweet_tok = TweetTokenizer()

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def clean_text(text):
    # TODO: remove metadata
    # TODO: Rake
    return [
        word.lower()
        for word in tweet_tok.tokenize(text)
        if word.isalpha() and not word in stop_words
    ]

In [5]:
sample = list(df['text'].head(2))
list(map(clean_text, sample))

[['from',
  'thing',
  'subject',
  'what',
  'car',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  'i',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'i',
  'saw',
  'day',
  'it',
  'door',
  'sports',
  'car',
  'looked',
  'late',
  'early',
  'it',
  'called',
  'bricklin',
  'the',
  'doors',
  'really',
  'small',
  'in',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'this',
  'i',
  'know',
  'if',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['from',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summary',
  'final',
  'call',
  'si',
  'clock',
  'reports',
  'keywords',
  'si',
  'acceleration',
  'clock',
  'upgrade',
  'd',
  'shelley',
  'orga

In [6]:
df['text_clean'] = df['text'].apply(clean_text).apply(lambda x: ' '.join(x))

In [7]:
df

Unnamed: 0_level_0,topic,text,text_clean
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rec.autos/102994,rec.autos,From: lerxst@wam.umd.edu (where's my thing)\nS...,from thing subject what car organization unive...
comp.sys.mac.hardware/51861,comp.sys.mac.hardware,From: guykuo@carson.u.washington.edu (Guy Kuo)...,from guy kuo subject si clock poll final call ...
comp.sys.mac.hardware/51879,comp.sys.mac.hardware,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,from thomas e willis subject pb questions orga...
comp.graphics/38242,comp.graphics,From: jgreen@amber (Joe Green)\nSubject: Re: W...,from jgreen joe green subject re weitek organi...
sci.space/60880,sci.space,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,from jonathan mcdowell subject re shuttle laun...
talk.politics.guns/54525,talk.politics.guns,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,from foxvog douglas subject re rewording secon...
sci.med/58080,sci.med,From: bmdelane@quads.uchicago.edu (brian manni...,from brian manning delaney subject brain tumor...
comp.sys.ibm.pc.hardware/60249,comp.sys.ibm.pc.hardware,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...,from grubb subject re ide vs scsi organization...
comp.os.ms-windows.misc/10008,comp.os.ms-windows.misc,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...,from subject win icon help please organization...
comp.sys.mac.hardware/50502,comp.sys.mac.hardware,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...,from stan kerr subject re sigma designs double...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_clean'])

print(X.shape)
print(vectorizer.get_feature_names())

(11314, 74955)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(X, X)

In [37]:
import numpy as np

def recommend_articles(doc_id, top_n=10, similarity_matrix=similarity_matrix):
    indexed_doc_ids = pd.Series(df.index)

    matched_row_number = indexed_doc_ids[indexed_doc_ids == doc_id].index[0]

    matched_row = pd.Series(similarity_matrix[matched_row_number])

    sorted_recommendations = matched_row.sort_values(ascending=False)

    top_n_recommendations = sorted_recommendations[1:(top_n + 1)]
    
    return pd.DataFrame({
        'doc_id': indexed_doc_ids[top_n_recommendations.index].values,
        'similarity': top_n_recommendations.values
    }).set_index('doc_id')

In [38]:
recommend_articles('comp.sys.mac.hardware/51861', top_n=20)

6399    0.559158
1270    0.352043
4693    0.337164
9130    0.336613
9921    0.229659
2116    0.195919
8265    0.195876
5097    0.184398
9270    0.176560
5509    0.169636
5541    0.169029
2459    0.142756
6772    0.134662
685     0.131760
704     0.130924
450     0.125063
7246    0.123891
3035    0.122970
7284    0.122918
1501    0.121888
dtype: float64


Unnamed: 0_level_0,similarity
doc_id,Unnamed: 1_level_1
comp.sys.mac.hardware/51695,0.559158
comp.sys.mac.hardware/51560,0.352043
comp.sys.mac.hardware/51920,0.337164
comp.sys.mac.hardware/51674,0.336613
comp.sys.mac.hardware/51708,0.229659
comp.sys.mac.hardware/51747,0.195919
comp.sys.mac.hardware/51906,0.195876
comp.sys.mac.hardware/51745,0.184398
comp.sys.mac.hardware/51642,0.17656
comp.sys.mac.hardware/51895,0.169636
