In [1]:
######################### Import #########################
import os
import re
import string
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [2]:
######################### Data #########################
os.chdir('/Users/biancaorozco/Desktop/Metis/project4/medium/data/')
df = pd.read_csv('Medium_Clean.csv').sample(250000)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
######################### Functions #########################
def space(string):
    
    return string + ' '


def add_space(series):
    new_series = pd.Series([])
    for string in series:
        new_string = pd.Series([space(string)])
        new_series = new_series.append(new_string)

    return new_series      
     
    
# Preprocessing Text
def preprocessing(Series):
    new_series = pd.Series([])
    alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

    new_series = Series.map(alphanumeric).map(punc_lower)
    
    return new_series


def top_related_articles(search,fitted_pipeline):
    searchS = pd.Series(search)
    prepS = preprocessing(searchS)
    
    # Cluster where new search is located
    cluster = fitted_pipeline.predict(prepS)[0]
        
    # Series of documents within the same cluster as our search
    related_docs = docs[fitted_pipeline.named_steps['km'].labels_ == cluster]
    
    # Series of the top 3 related docs
    popular_docs = claps[related_docs.index].sort_values()[-3:]
    
    # Printing the original Title, Subtitle, and Claps for the top 3 related docs
    for index in popular_docs.index:
        print(data.iloc[index, 0:3], '\n')
        
    return



In [4]:
######################### Cleaning Data #########################
# Reverse Engineer Dummy Variables
categories = df.iloc[:,13:]
x = categories.stack()
tags = pd.Series(pd.Categorical(x[x!=0].index.get_level_values(1)))

# Only want 4 columns for my new dataset
newdf = df.loc[:, ['Title', 'Subtitle', 'Claps']]

# Tag for each article
newdf['Tags'] = tags

# Claps from floats to integers
newdf['Claps'] = newdf['Claps'].astype(int)

data = newdf.dropna()
data = data.reset_index(drop=True)

# Adding a space to the end of each title; preparing to add subtitle 
title_series = add_space(data['Title'])

# Need to reset indices of both in order to replace Title column
title_series = title_series.reset_index(drop=True)
data.reset_index(drop=True)

# Replace old titles with new titles+space
data['Title'] = title_series

# Join Title + Subtitle and save to a new 'Text' column
data["Text"] = data["Title"].map(str) + data["Subtitle"]

In [5]:
######################### Features #########################
docs = preprocessing(data['Text'])
claps = data['Claps']

In [6]:
######################### Pipeline #########################
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), 
                              binary=True, 
                              stop_words='english', 
                              min_df = 3, 
                              max_df = 0.75)), 
    ('lsa', TruncatedSVD(4)), 
    ('km', KMeans(n_clusters=4, 
                  init='k-means++', 
                  random_state=33))
])

pipeline.fit_transform(docs)
pipeline.predict(docs) # Just to see what cluster they were assigned

array([1, 1, 1, ..., 1, 1, 3], dtype=int32)

**Note:**  
After 2 example runs, it no longer gives new article results.  
I have to restart kernal. Not sure why. 

In [11]:
######################### Search Example #########################
example_search = "Can Neural Networks Develop Attention? Google Thinks They Can."
print('User Input:', example_search)
print('Inertia:', pipeline.named_steps['km'].inertia_, '\n')
top_related_articles(example_search, pipeline)

User Input: Can Neural Networks Develop Attention? Google Thinks They Can.
Inertia: 100.2806429526275 

Title             Why Fasting Is The Best Way To Lose Weight 
Subtitle    5 Scientifically-Backed Reasons Fasting Helps ...
Claps                                                   17300
Name: 29863, dtype: object 

Title       Why Isnt Agile Working? 
Subtitle           A couple drawings
Claps                          18200
Name: 38832, dtype: object 

Title       An Open Letter to the FCC: 
Subtitle    Dear FCC Chairman Ajit Pai:
Claps                             51000
Name: 13329, dtype: object 



In [8]:
# Just taking a closer look
print(data.iloc[24166, 0], '\n')

Hi! Its me again Im just wondering about how we can save the education system  



In [9]:
# Other examples
example_search1 = "Trump's Trying to Get Away With Something Even Nixon Couldn't"
example_search2 = "Is Artificial Intelligence Really Going to Put You Out of a Job??"