### Imports

In [1]:
import numpy as np
import pandas as pd
import re
import string

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.utils import simple_preprocess

from nltk.stem.porter import PorterStemmer

### Load Raw Dataset

In [2]:
# Read the raw Medium export into a DataFrame and preview its first rows
medium = pd.read_csv('Medium_AggregatedData.csv')
medium.head()

Unnamed: 0,audioVersionDurationSec,codeBlock,codeBlockCount,collectionId,createdDate,createdDatetime,firstPublishedDate,firstPublishedDatetime,imageCount,isSubscriptionLocked,...,slug,name,postCount,author,bio,userId,userName,usersFollowedByCount,usersFollowedCount,scrappedDate
0,0,,0.0,638f418c8464,2018-09-18,2018-09-18 20:55:34,2018-09-18,2018-09-18 20:57:03,1,False,...,blockchain,Blockchain,265164.0,Anar Babaev,,f1ad85af0169,babaevanar,450.0,404.0,20181104
1,0,,0.0,638f418c8464,2018-09-18,2018-09-18 20:55:34,2018-09-18,2018-09-18 20:57:03,1,False,...,samsung,Samsung,5708.0,Anar Babaev,,f1ad85af0169,babaevanar,450.0,404.0,20181104
2,0,,0.0,638f418c8464,2018-09-18,2018-09-18 20:55:34,2018-09-18,2018-09-18 20:57:03,1,False,...,it,It,3720.0,Anar Babaev,,f1ad85af0169,babaevanar,450.0,404.0,20181104
3,0,,0.0,,2018-01-07,2018-01-07 17:04:37,2018-01-07,2018-01-07 17:06:29,13,False,...,technology,Technology,166125.0,George Sykes,,93b9e94f08ca,tasty231,6.0,22.0,20181104
4,0,,0.0,,2018-01-07,2018-01-07 17:04:37,2018-01-07,2018-01-07 17:06:29,13,False,...,robotics,Robotics,9103.0,George Sykes,,93b9e94f08ca,tasty231,6.0,22.0,20181104


### Remove Duplicates
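There is a lot of redundant information in this dataset: as the preview above shows, the same article is repeated on several rows that differ only in their tag. Collapse these into one row per article, keeping all of its tags as a list.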

In [3]:
medium = medium[medium['language'] == 'en']         # English only
medium = medium[medium['totalClapCount'] >= 25]     # Posts with at least 25 claps (>= 25)

def findTags(title):
    '''
    Collects the tag names recorded for an article title.

    Selects every row of the module-level `medium` frame whose 'title'
    equals `title` and returns their 'tag_name' values as a list
    (an empty list when no row matches).
    '''
    matching_tags = medium.loc[medium['title'] == title, 'tag_name']
    return list(matching_tags.values)

titles = medium['title'].unique()                   # Get all the titles

# Gather the tags of every title in a single grouped pass instead of
# re-filtering the whole frame once per title (which scales quadratically
# with the number of rows). sort=False keeps the titles in order of first
# appearance; rows with a null title form no group, which matches the previous
# outcome because a null title never compares equal to anything.
tags_by_title = medium.groupby('title', sort = False)['tag_name'].apply(list)

# Dictionary to store tags: parallel lists of titles and their tag lists
tag_dict = {'title': list(tags_by_title.index), 'tags': list(tags_by_title)}

tag_df = pd.DataFrame(tag_dict)                     # Dictionary to dataframe

# Now that tag data is extracted the duplicate rows can be dropped
medium = medium.drop_duplicates(subset = 'title', keep = 'first')

def addTags(title):
    '''
    Looks up the tag list stored for `title` in the module-level `tag_df`.

    Returns the 'tags' entry of the first `tag_df` row whose 'title' equals
    `title`, or NaN when no row matches (for example a null title).
    '''
    matches = tag_df.loc[tag_df['title'] == title, 'tags']
    try:
        return matches.iloc[0]
    except IndexError:
        # No matching row: assume no tags. Only this case is caught so that
        # genuine problems (e.g. a missing column) are not silently hidden.
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
        return np.nan

# Attach each article's tag list as a new column
medium['allTags'] = medium['title'].apply(addTags)

# Restrict the frame to the columns used in this project
keep_cols = ['title', 'url', 'allTags', 'readingTime', 'author', 'text']
medium = medium[keep_cols]

# Discard the rows without a title, then renumber the index from zero
null_title = medium[medium['title'].isna()].index
medium = medium.drop(index = null_title).reset_index(drop = True)

print(medium.shape)
medium.head()

(24576, 6)


Unnamed: 0,title,url,allTags,readingTime,author,text
0,"Private Business, Government and Blockchain",https://medium.com/s/story/private-business-go...,"[Blockchain, Samsung, It]",0.958491,Anar Babaev,"Private Business, Government and Blockchain\n\..."
1,Can a robot love us better than another human ...,https://medium.com/s/story/can-a-robot-love-us...,"[Robotics, Meditation, Therapy, Artificial Int...",0.65283,Stewart Alsop,Can a robot love us better than another human ...
2,"2017 Big Data, AI and IOT Use Cases",https://medium.com/s/story/2017-big-data-ai-an...,"[Artificial Intelligence, Data Science, Big Da...",7.055031,Melody Ucros,"2017 Big Data, AI and IOT Use Cases\nAn Active..."
3,The Meta Model and Meta Meta-Model of Deep Lea...,https://medium.com/s/story/the-meta-model-and-...,"[Machine Learning, Deep Learning, Artificial I...",5.684906,Carlos E. Perez,The Meta Model and Meta Meta-Model of Deep Lea...
4,Don’t trust “Do you trust this computer”,https://medium.com/s/story/dont-trust-do-you-t...,"[Artificial Intelligence, Ethics, Elon Musk, D...",2.739623,Virginia Dignum,Don’t trust “Do you trust this computer”\nfrom...


### Basic Text Cleaning

In [11]:
# Patterns are compiled once here instead of on every call. Raw strings keep
# the expressions identical while avoiding the invalid-escape warnings that
# '\w', '\/', '\-' and '\.' trigger inside ordinary string literals.
_LINK_RE = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+')
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))

def clean_text(text):
    '''
    Eliminates links, tokens containing digits, and punctuation.
    Returns lower case text with newlines turned into spaces.

    text : str, the raw article text.

    Notes:
    - The URL scheme is optional in the link pattern, so any run of
      word/URL characters with an interior dot (e.g. 'e.g.' or '3.14')
      is removed as well.
    - Only the ASCII characters in string.punctuation are replaced;
      other symbols (such as typographic quotes) are left in place.
    '''
    # Remove links (deleted outright, not replaced by a space)
    text = _LINK_RE.sub('', text)
    # Remove every token that contains at least one digit
    text = _DIGIT_TOKEN_RE.sub(' ', text)
    # Lowercase, then turn each punctuation character into a space
    text = _PUNCT_RE.sub(' ', text.lower())
    # Remove newline characters
    text = text.replace('\n', ' ')

    return text

# Run the cleaner over every article body
medium['text'] = medium['text'].map(clean_text)

### Remove Stop Words

In [12]:
# gensim's STOPWORDS extended with extra words chosen for this corpus
stop_list = STOPWORDS.union({
    'data', 'ai', 'learning', 'machine', 'intelligence', 'artificial',
    'model', 'input', 'output', 'train', 'training', 'trained',
    'time', 'like', 'use', 'new', 'need', 'way', 'based', 'want', 'know',
    'learn', 'things', 'lot', 'can',
    "it's", "don't", "let's", 'it', 'we', 'don', 'you', 'ce', 'hasn',
    'sa', 'do', 'som',
})

# Remove stopwords
def remove_stopwords(text, stop_words=None):
    '''
    Drops stop words and very short tokens from a space-separated string.

    text       : str, tokens separated by single spaces.
    stop_words : optional collection of words to remove; when omitted the
                 module-level `stop_list` is used.

    Returns the surviving tokens joined by single spaces. Tokens of two
    characters or fewer are always removed, which also discards the empty
    strings produced by consecutive spaces.
    '''
    if stop_words is None:
        stop_words = stop_list
    # The local name no longer shadows the clean_text() function
    kept_words = [word for word in text.split(' ')
                  if len(word) > 2 and word not in stop_words]
    return ' '.join(kept_words)

# Filter stop words and very short tokens out of every article
medium['text'] = medium['text'].map(remove_stopwords)


### Stemming

In [13]:
# A single Porter stemmer instance shared by every call below
stemmer = PorterStemmer()

def stem_text(text):
    '''
    Stems each space-separated token of `text` with the shared `stemmer`
    and joins the results back together with single spaces.
    '''
    return ' '.join(stemmer.stem(token) for token in text.split(' '))

# Replace every article body with its stemmed form
medium['text'] = medium['text'].apply(stem_text)


### Save Current State

Stemming takes forever, so let's save our progress so far.

In [15]:
# index = False keeps the row index out of the file, so reloading it with
# pd.read_csv does not produce a spurious 'Unnamed: 0' column
medium.to_csv('pre-processed.csv', index = False)

In [None]:
# medium = pd.read_csv('pre-processed.csv')

### Apply TFIDF and SVD

In [16]:
# stop_list is a frozenset; recent scikit-learn releases validate stop_words
# and only accept 'english', a list or None, so pass a (sorted, deterministic) list.
# NOTE(review): the text was stemmed above but these stop words are not, so
# they may not match the stemmed tokens - confirm this is intended.
vectorizer = TfidfVectorizer(stop_words = sorted(stop_list), ngram_range = (1,1))
doc_word = vectorizer.fit_transform(medium['text'])

# Reduce the document-term matrix to 8 components; the random state is
# pinned so the decomposition is reproducible between runs
svd = TruncatedSVD(n_components = 8, random_state = 42)
docs_svd = svd.fit_transform(doc_word)


  sorted(inconsistent))


### Function to Display Topics

In [17]:
def display_topics(model, feature_names, no_top_words, no_top_topics, topic_names=None):
    '''
    Prints the highest-weighted features of the leading topics of a model.

    model         : object exposing `components_`, iterated as one weight
                    array per topic.
    feature_names : sequence mapping a feature position to its name.
    no_top_words  : number of features listed for each topic.
    no_top_topics : number of topics printed, starting from the first.
    topic_names   : optional sequence of labels. A topic whose label is
                    missing, empty or beyond the end of the sequence is
                    printed with its 1-based number instead.
    '''
    for ix, topic in enumerate(model.components_):
        if ix == no_top_topics:
            break
        # Guard the lookup so a label list shorter than the number of
        # topics falls back to numbering instead of raising IndexError
        label = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", (ix + 1))
        # Feature positions ordered from largest to smallest weight
        top_positions = topic.argsort()[::-1][:no_top_words]
        print(", ".join([feature_names[i] for i in top_positions]))

# get_feature_names() is gone from recent scikit-learn releases; use its
# replacement when it exists and fall back to the old name otherwise
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
display_topics(svd, feature_names, 15, 8)


Topic  1
human, network, imag, technolog, work, user, algorithm, predict, peopl, compani, product, busi, deep, custom, develop

Topic  2
imag, layer, network, neural, function, dataset, featur, weight, convolut, vector, valu, gradient, deep, predict, paramet

Topic  3
chatbot, bot, user, custom, convers, app, messag, messeng, chat, servic, text, word, voic, assist, interact

Topic  4
imag, network, layer, neural, human, convolut, deep, chatbot, robot, neuron, technolog, cnn, brain, gan, architectur

Topic  5
imag, blockchain, tensorflow, file, python, project, api, cloud, instal, token, app, platform, code, team, googl

Topic  6
blockchain, market, valu, token, custom, layer, network, function, predict, gradient, busi, price, platform, compani, decentr

Topic  7
scienc, network, chatbot, neural, deep, scientist, layer, cours, python, neuron, gradient, skill, team, function, program

Topic  8
word, vector, blockchain, text, token, sentenc, languag, embed, network, document, nlp, neural

### Try NMF

In [20]:
# Factorise the TF-IDF matrix into 8 topics. The random state is pinned so
# repeated runs give the same factorisation - later cells label the topic
# columns purely by position.
nmf = NMF(n_components = 8, random_state = 42)
docs_nmf = nmf.fit_transform(doc_word)

# get_feature_names() is gone from recent scikit-learn releases; use its
# replacement when it exists and fall back to the old name otherwise
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
display_topics(nmf, feature_names, 15, 8)


Topic  1
human, robot, technolog, peopl, machin, world, think, futur, brain, job, car, autom, design, game, live

Topic  2
valu, predict, variabl, featur, regress, function, algorithm, linear, set, test, dataset, paramet, gradient, tree, distribut

Topic  3
chatbot, bot, custom, user, convers, messag, chat, servic, messeng, busi, assist, app, interact, voic, answer

Topic  4
network, layer, imag, neural, deep, convolut, neuron, weight, cnn, function, architectur, loss, gener, gan, gradient

Topic  5
file, tensorflow, imag, python, code, instal, api, run, notebook, googl, librari, creat, app, dataset, gpu

Topic  6
blockchain, market, technolog, compani, busi, custom, product, platform, servic, token, develop, industri, user, invest, team

Topic  7
scienc, scientist, cours, work, skill, team, job, peopl, project, engin, busi, analyt, program, deep, start

Topic  8
word, vector, text, sentenc, embed, languag, document, sentiment, nlp, corpu, sequenc, token, context, topic, matrix


### Try LDA

In [21]:
# Tokenise every article, build the vocabulary, prune it with
# filter_extremes, and convert each article to a bag-of-words vector
tokenized_docs = medium['text'].apply(simple_preprocess)
dictionary = corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# workers = 4 was chosen to use all four cores of the author's CPU
lda = models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=8, passes=10, workers=4)

lda.print_topics()

[(0,
  '0.034*"imag" + 0.024*"network" + 0.018*"layer" + 0.012*"neural" + 0.008*"deep" + 0.007*"dataset" + 0.007*"convolut" + 0.007*"detect" + 0.007*"function" + 0.007*"object"'),
 (1,
  '0.010*"user" + 0.010*"code" + 0.008*"python" + 0.007*"run" + 0.007*"file" + 0.007*"app" + 0.006*"api" + 0.006*"build" + 0.005*"project" + 0.005*"tool"'),
 (2,
  '0.008*"custom" + 0.008*"user" + 0.008*"human" + 0.007*"product" + 0.007*"chatbot" + 0.007*"scienc" + 0.007*"peopl" + 0.007*"busi" + 0.006*"design" + 0.006*"experi"'),
 (3,
  '0.009*"human" + 0.008*"peopl" + 0.006*"world" + 0.005*"think" + 0.005*"year" + 0.005*"robot" + 0.004*"technolog" + 0.004*"don" + 0.004*"day" + 0.003*"car"'),
 (4,
  '0.014*"valu" + 0.011*"function" + 0.011*"predict" + 0.010*"algorithm" + 0.007*"problem" + 0.007*"probabl" + 0.006*"distribut" + 0.006*"optim" + 0.006*"state" + 0.006*"result"'),
 (5,
  '0.013*"featur" + 0.013*"word" + 0.010*"valu" + 0.008*"dataset" + 0.008*"number" + 0.007*"vector" + 0.007*"predict" + 0.006*

### Save NMF Topics
Concatenate the NMF topic weights with the article metadata, and remove any article whose topic weights are all zero.

In [32]:
# Metadata columns carried over from `medium`, and one label per NMF component.
# NOTE: the labels are assigned by position, so they assume the NMF
# components come out in this order.
meta_cols = ['title', 'url', 'allTags', 'readingTime', 'author']
topic_cols = ['Tech', 'Modeling', 'Chatbots', 'Deep Learning', 'Coding', 'Business',
              'Careers', 'NLP']

# Define column names for dataframe
column_names = meta_cols + topic_cols + ['sum']

# Turn our docs_nmf array into a data frame. It reuses medium's index so the
# concat below lines rows up one-to-one even if that index is not 0..n-1.
doc_topic_df = pd.DataFrame(data = docs_nmf, index = medium.index)

# Create topic sum for each article. Later remove all articles with sum 0.
topic_sum = doc_topic_df.sum(axis = 1)

# Merge all of our article metadata and name columns
doc_topic_df = pd.concat([medium[meta_cols], doc_topic_df, topic_sum], axis = 1)
doc_topic_df.columns = column_names

# Remove articles with topic sum = 0, drop the sum column and reset the index.
# Chaining returns a fresh frame instead of modifying a filtered slice in place.
doc_topic_df = (doc_topic_df[doc_topic_df['sum'] != 0]
                .drop(columns = 'sum')
                .reset_index(drop = True))

# Save
doc_topic_df.to_csv('tfidf_nmf_8topics.csv', index = False)
doc_topic_df.head()

In [3]:
# doc_topic_df = pd.read_csv('tfidf_nmf_8topics.csv')

### Recommendation Engine

In [73]:
# Topic columns of doc_topic_df, in the order used for preference vectors
topic_names = [
    'Tech', 'Modeling', 'Chatbots', 'Deep Learning',
    'Coding', 'Business', 'Careers', 'NLP',
]
# One row of topic weights per article, plus the Euclidean norm of each row
topic_array = np.array(doc_topic_df.loc[:, topic_names])
norms = np.linalg.norm(topic_array, axis = 1)

def compute_dists(top_vec, topic_array):
    '''
    Returns the cosine similarity of top_vec with every row of topic_array.

    top_vec     : 1-D array-like of topic weights.
    topic_array : 2-D array-like with one row per article and one column
                  per topic (same length as top_vec).

    Returns a 1-D float array with one value per row; larger means more
    similar. Rows (or a top_vec) with zero norm get a similarity of 0
    instead of NaN.
    '''
    topic_array = np.asarray(topic_array, dtype = float)
    top_vec = np.asarray(top_vec, dtype = float)

    dots = np.matmul(topic_array, top_vec)
    # Norms are taken from the array that was actually passed in, not from
    # a module-level variable, so any article matrix can be scored
    denominators = np.linalg.norm(top_vec) * np.linalg.norm(topic_array, axis = 1)

    co_dists = np.zeros_like(dots)
    # Only divide where the denominator is non-zero; the rest stays 0
    np.divide(dots, denominators, out = co_dists, where = denominators != 0)
    return co_dists

def produce_rec(top_vec, topic_array, doc_topic_df, rand = 15):
    '''
    Produces a recommendation based on cosine similarity.

    top_vec      : 1-D array-like of user weights, one per topic.
    topic_array  : 2-D array of article topic weights, one row per article.
    doc_topic_df : DataFrame whose rows are in the same order as topic_array.
    rand         : controls the level of randomness in the output
                   recommendation (0 disables it).

    Returns the row of doc_topic_df for the best-scoring article.
    '''
    top_vec = np.asarray(top_vec, dtype = float)

    # Add a bit of randomness to top_vec. The noise has as many entries as
    # top_vec rather than a fixed 8, and an all-zero vector is scaled by 1
    # to avoid dividing by zero.
    scale = np.linalg.norm(top_vec)
    if scale == 0:
        scale = 1.0
    noise = np.random.rand(*top_vec.shape) / scale * rand

    co_dists = compute_dists(top_vec + noise, topic_array)
    # argmax is a row position in topic_array, so select by position
    return doc_topic_df.iloc[np.argmax(co_dists)]

### Test Against User Input

In [78]:
# Interest weight for each topic, listed in the same order as topic_names
tech, modeling, chatbots, deep = 5, 5, 0, 0
coding, business, careers, nlp = 0, 5, 0, 0

top_vec = np.array([tech, modeling, chatbots, deep, coding, business, careers, nlp])

# Ask for one recommendation and display it
rec = produce_rec(top_vec, topic_array, doc_topic_df)
rec

title            Can Brain Activity Predict Consumers’ Preferen...
url              https://medium.com/s/story/can-brain-activity-...
allTags          [Neuroscience, Behavioral Economics, Consumer ...
readingTime                                                4.90503
author                                                 Looxid Labs
Tech                                                     0.0171501
Modeling                                                 0.0239323
Chatbots                                                         0
Deep Learning                                           0.00719324
Coding                                                           0
Business                                                 0.0227948
Careers                                                          0
NLP                                                     0.00131324
Name: 12370, dtype: object