In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# 7. Text Recommender: TF-IDF

In [3]:
%store -r df
%store -r headline_and_desc
%store -r category
%store -r sample_text
%store -r sample_pred_label

In [4]:
df = pd.concat([df, headline_and_desc], axis=1)
df = df.rename(columns={0: 'headline_and_desc'})
df.shape

(200853, 7)

In [5]:
df = df.loc[headline_and_desc != '']
df.shape

(200832, 7)

In [6]:
df.head()

Unnamed: 0,category,headline,authors,short_description,date,new_category,headline_and_desc
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,She left her husband. He killed their children...,2018-05-26,NEWS,mass shootings texas week tv leave husband kil...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,Of course it has a song.,2018-05-26,ENTERTAINMENT,smith joins diplo nicky jam world cup official...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,ENTERTAINMENT,hugh grant marrie time age actor longtime girl...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,The actor gives Dems an ass-kicking for not fi...,2018-05-26,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,"The ""Dietland"" actress said using the bags is ...",2018-05-26,ENTERTAINMENT,julianna margulies use donald trump poop bag p...


In [7]:
# tfidf vectorize the corpus

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

In [8]:
# 7.1. TF-IDF: headline_and_desc 

In [9]:
# define a function to compute the euclidean similarity

def headline_and_desc_tfidf_recommender(df, news, sample_text, num_similar_items):
    tfidf = TfidfVectorizer(min_df=0)
    tfidf.fit(news)
    news_vec = tfidf.transform(news)
    sample_vec = tfidf.transform(sample_text)

    distance = pairwise_distances(news_vec, sample_vec)
    indices = np.argsort(distance.ravel())[0:num_similar_items]
    news_list = pd.DataFrame({
        'headline': df['headline'][indices],
        'euclidean_similarity': distance.ravel()[indices]
    })
    return news_list

In [15]:
sample_text.values

array(['market soar despite election uncertainty stock market record big post election leap decade despite continue uncertainty candidate win race'],
      dtype=object)

In [10]:
headline_and_desc_tfidf_recommender(df, df['headline_and_desc'], sample_text, 10)

Unnamed: 0,headline,euclidean_similarity
56944,5 Ways Clutter Hurts Your Health (and How to F...,1.0
94402,This City Could Become The Next Detroit,1.0
121224,Florida Woman Bitten By Shark While Inner Tubing,1.0
43711,This Map Shows All The Bars Nearest To Your Po...,1.131386
103979,Lights Go On -- Part XXXIX -- YOU,1.135358
6112,Everything You Should Know About The Stock Market,1.141893
24634,Cuba's First 5-Star Luxury Hotel Opens Despite...,1.172244
95908,Is It Dementia Or Normal Aging? Now There's A ...,1.173689
34220,Donald Trump Is Taking Credit For A Meaningles...,1.177265
187835,Bill Maher Compares Dan Quayle To Sarah Palin ...,1.195311


In [11]:
# 7.2. TF-IDF: headline_and_desc and category

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# define a function to compute the euclidean similarity

def text_and_category_tfidf_recommender(df, news, sample_text, category, sample_category, num_similar_items, w1, w2):
    # tfidf vectorize text
    tfidf = TfidfVectorizer(min_df=0)
    tfidf.fit(news)
    news_vec = tfidf.transform(news)
    sample_vec = tfidf.transform(sample_text)

    # one-hot-encode the category
    onehotencoder = OneHotEncoder()

    category = category.values.reshape(-1,1)
    sample_category = sample_category.reshape(-1,1)

    onehotencoder.fit(category)
    category_onehot = onehotencoder.transform(category)
    sample_category_onehot = onehotencoder.transform(sample_category)

    # compute the euclidean similarity
    news_distance = pairwise_distances(news_vec, sample_vec)
    category_distance = pairwise_distances(category_onehot, sample_category_onehot)
    weighted_distance = (w1 * news_distance + w2 * category_distance)/float(w1+w2)
    indices = np.argsort(weighted_distance.ravel())[0:num_similar_items]
    news_list = pd.DataFrame({
        'headline': df['headline'][indices],
        'category': df['new_category'][indices],
        'euclidean_similarity': weighted_distance.ravel()[indices]
    })
    return news_list

In [16]:
sample_text.values

array(['market soar despite election uncertainty stock market record big post election leap decade despite continue uncertainty candidate win race'],
      dtype=object)

In [17]:
sample_pred_label

array(['NEWS'], dtype=object)

In [14]:
text_and_category_tfidf_recommender(df, df['headline_and_desc'], sample_text, df['new_category'], sample_pred_label, 10, 0.2, 0.8)

Unnamed: 0,headline,category,euclidean_similarity
95908,Is It Dementia Or Normal Aging? Now There's A ...,LIFE,0.234738
187835,Bill Maher Compares Dan Quayle To Sarah Palin ...,ENTERTAINMENT,0.239062
70301,How Earth Changed In 2015: The View From Far A...,NEWS,0.241127
114796,How The Kochs Are Organizing Against Your Loca...,POLITICS,0.241211
166974,SNL's 'Biden Bash' Is The Inauguration Party W...,ENTERTAINMENT,0.244679
66897,This Baby Goat Yogi Is Not Kidding Around,LIFE,0.247419
187837,Pennsylvania Public Defenders Rebel Against Cr...,NEWS,0.248248
42099,"With President Trump As Top Role Model, Teache...",LIFE,0.248633
100052,CD Review: 'The Great Escape' by The Great Escape,ENTERTAINMENT,0.249056
24985,Britain's Snap Election Takes A Surprising Tur...,NEWS,0.249777
