# NLP Project Pt. 6: Building a Recommendation System

In [1]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth',200)
doc_topic_df = pd.read_pickle('NMF_doc_topic_df3.pkl')
doc_topic_df.head()

Unnamed: 0,0,1,2,3,4,5
0,0.047,0.064,0.011,0.023,0.088,0.093
1,0.044,0.022,0.012,0.259,0.0,0.013
2,0.131,0.018,0.0,0.0,0.001,0.026
3,0.113,0.025,0.039,0.0,0.013,0.02
4,0.011,0.01,0.0,0.0,0.017,0.151


In [2]:
corpus_df = pd.read_pickle('corpus_df_style3.pkl')
corpus_df.reset_index(inplace=True, drop=True)
corpus_df.head()

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT,WORDCOUNT,TEXT_ENTS,POS_COUNT,NOUN_SCORE,VERB_SCORE,ADJ_SCORE,ADV_SCORE,AVG_SENT_LENGTH,LEX_RICH,COMBO_GROUP
0,https://www.newyorker.com/magazine/2022/02/14/annunciation,ANNUNCIATION,LAUREN GROFF,"February 7, 2022","['Some nights, in my dreams, I find myself running through those hills above Palo Alto again. It is always just before dawn, and as I run I smell the sun-crisped fields, the sage, the eucalyptus. ...",9373,"[Palo Alto, Bay, Mountain View, New England, San Francisco, Chinatown, San Francisco, Redwood City, Mountain View, Titania, Germany, Feuerzangenbowle, Redwood City, Mountain View, New York, Caribb...","[1705, 1451, 586, 614]",0.391414,0.333104,0.134527,0.140955,25.38992,0.232551,5
1,https://www.newyorker.com/magazine/2022/02/07/once-removed,ONCE REMOVED,ALEXANDER MACLEOD,"January 31, 2022","['She did not want to visit the old lady.', 'Amy studied the stroller, then the bags, then her boyfriend and the baby. She checked her phone: 11:26a.m. It was time to go. Ninety degrees, ninety-pe...",7778,"[Inverness County, Nova Scotia, Ontario, Toronto, Montreal, Cape Breton, Turkey, Niagara Falls, Montreal]","[1231, 1266, 455, 682]",0.338745,0.348376,0.125206,0.187672,15.590643,0.223521,2
2,https://www.newyorker.com/magazine/2022/01/31/long-distance,LONG DISTANCE,AYSEGUL SAVAS,"January 24, 2022","['Lea changed the sheets when she got up. She’d bought flowers the previous day, tulips that she’d put on the dresser. There were carnations on the kitchen table, in a squat glass vase. She though...",4866,"[California, Rome, Rome, Rome, California, Rome, Trastevere, the Ponte Sublicio, Everest, Rome, Rome, Rome, California, Ostiense, Rome, San Pietro, Vincoli, Rome, Rome, London, California]","[763, 903, 284, 328]",0.334943,0.3964,0.124671,0.143986,15.984227,0.273125,2
3,https://www.newyorker.com/magazine/2022/01/24/whats-the-deal-hummingbird,"WHAT'S THE DEAL, HUMMINGBIRD?",ARTHUR KRYSTAL,"January 17, 2022","['On or around May 5th of 2020, he just stopped. He stopped exercising, stopped walking, stopped reading, stopped planning. He ate, drank, washed, and paid the bills, but that was it. He was seven...",3469,"[Prospect Park, Kentucky, Tanglewood, East, Provence, Montpellier, Nice, Brooklyn, West Orange, New Jersey, New Orleans, San Francisco, New York, Corfu, America, America, America, America, America...","[609, 603, 204, 187]",0.379913,0.37617,0.127261,0.116656,17.959799,0.379558,1
4,https://www.newyorker.com/magazine/2022/01/17/fireworks,FIREWORKS,GRAHAM SWIFT,"January 10, 2022","['It was late October, 1962. Russian missiles were being shipped to Cuba. Kennedy was having words with Khrushchev. The world might be coming to an end.', 'It was a common remark: “Cheer up, it’s ...",2687,"[Cuba, Harpers]","[407, 424, 147, 202]",0.344915,0.359322,0.124576,0.171186,15.189189,0.311994,0


In [3]:
#function to return recommendations based on cosine similarity of topics
def NMF_cosine_similarity_recommender(df1, story_ix, df2, k):
    from sklearn.metrics.pairwise import cosine_similarity
    
    #generate compatibility score for each story pair
    match_holder = []
    story1 = df1.loc[story_ix].to_numpy()
    for i in range(len(df1)):
        if i != story_ix:
            story2 = df1.loc[i].to_numpy()
            story_compatibility = cosine_similarity(story1.reshape(1,-1), story2.reshape(1, -1))
            match_holder.append([i, story_compatibility])
    
    #select the 5 highest scores for each story
    matches = []
    stories = []
    
    for value in match_holder:
        story_num = value[0]
        score = value[1][0][0]
        matches.append(score)
        stories.append(story_num)

    matches_array = np.array(matches)
        
    part = np.argpartition(matches, -k)[-k:]
    indices = part[np.argsort((-matches_array)[part])]
    for ii in indices:
        print(df2.loc[story_ix].TITLE, df2.loc[stories[ii]].TITLE, df2.loc[stories[ii]].AUTHOR, matches_array[ii])
        

NMF_cosine_similarity_recommender(doc_topic_df, story_ix=0, df2=corpus_df, k=5)

ANNUNCIATION MARRIAGE QUARANTINE KATE WALBERT 0.9589821700462728
ANNUNCIATION THE VALETUDINARIAN JOSHUA FERRIS 0.9469192436384342
ANNUNCIATION WES AMERIGO'S GIANT FEAR DAVID SCHICKLER 0.9398567521246144
ANNUNCIATION BALLOONS THOMAS MCGUANE 0.9375361273443206
ANNUNCIATION THE MIDNIGHT ZONE LAUREN GROFF 0.9217798446607437


In [6]:
#function to return recommendations based on topic, style, and author
#story_ix = index # of story for which to make recommendations
def recommender(doc_topic_df, story_ix, story_df, num_recs, topicweight, styleweight, authorweight):
    from sklearn.metrics.pairwise import cosine_similarity
    
    recs = pd.DataFrame(columns=['TITLE','AUTHOR','URL',
                                 'TOPIC_SIMILARITY','STYLE_MATCH','AUTHOR_MATCH','REC_SCORE'])
    
    ## part 1: topic similarity
    #generate compatibility score for each story pair
    topic_match_holder = []
    story1 = doc_topic_df.loc[story_ix].to_numpy()
    for i in range(len(doc_topic_df)):
        if i != story_ix:
            story2 = doc_topic_df.loc[i].to_numpy()
            story_compatibility = cosine_similarity(story1.reshape(1,-1), story2.reshape(1, -1))
            topic_match_holder.append([i, story_compatibility])
    
    topic_matches = []
    topic_stories = []
    
    for tvalue in topic_match_holder:
        tstory_num = tvalue[0]
        tscore = tvalue[1][0][0]
        topic_matches.append(tscore)
        topic_stories.append(tstory_num)

    matches = np.array(topic_matches)
    
    indices = np.argsort((-matches))
    for ii in indices:
        if ii != story_ix:
            if (story_df.loc[ii].COMBO_GROUP == story_df.loc[story_ix].COMBO_GROUP) & \
            (story_df.loc[ii].AUTHOR == story_df.loc[story_ix].AUTHOR):
                recs.loc[ii] = [story_df.loc[ii].TITLE, story_df.loc[ii].AUTHOR, story_df.loc[ii].URL,
                            matches[ii], 1, 1, 0]
            elif (story_df.loc[ii].COMBO_GROUP == story_df.loc[story_ix].COMBO_GROUP):
                recs.loc[ii] = [story_df.loc[ii].TITLE, story_df.loc[ii].AUTHOR, story_df.loc[ii].URL,
                            matches[ii], 1, 0, 0]
            elif (story_df.loc[ii].AUTHOR == story_df.loc[story_ix].AUTHOR):
                recs.loc[ii] = [story_df.loc[ii].TITLE, story_df.loc[ii].AUTHOR, story_df.loc[ii].URL,
                            matches[ii], 0, 1, 0]
            else:
                recs.loc[ii] = [story_df.loc[ii].TITLE, story_df.loc[ii].AUTHOR, story_df.loc[ii].URL,
                            matches[ii], 0, 0, 0]
            
            
    #putting it all together
    recs['REC_SCORE'] = recs['TOPIC_SIMILARITY']*topicweight + recs['STYLE_MATCH']*styleweight + recs['AUTHOR_MATCH']*authorweight
    print(story_ix, story_df.loc[story_ix].TITLE, story_df.loc[story_ix].AUTHOR)
    return recs.sort_values(by=['REC_SCORE'], ascending=False)[0:num_recs]

In [7]:
recommender(doc_topic_df, story_ix=0, story_df=corpus_df, num_recs=10, topicweight=1, styleweight=1, authorweight=2)

0 ANNUNCIATION LAUREN GROFF


Unnamed: 0,TITLE,AUTHOR,URL,TOPIC_SIMILARITY,STYLE_MATCH,AUTHOR_MATCH,REC_SCORE
320,GHOSTS AND EMPTIES,LAUREN GROFF,https://www.newyorker.com/magazine/2015/07/20/ghosts-and-empties,0.563008,1,1,3.563008
134,BRAWLER,LAUREN GROFF,https://www.newyorker.com/magazine/2019/05/13/brawler,0.52196,1,1,3.52196
253,FLOWER HUNTERS,LAUREN GROFF,https://www.newyorker.com/magazine/2016/11/21/flower-hunters,0.780347,0,1,2.780347
216,DOGS GO WOLF,LAUREN GROFF,https://www.newyorker.com/magazine/2017/08/28/dogs-go-wolf,0.568793,0,1,2.568793
279,THE MIDNIGHT ZONE,LAUREN GROFF,https://www.newyorker.com/magazine/2016/05/23/the-midnight-zone-by-lauren-groff,0.552941,0,1,2.552941
172,UNDER THE WAVE,LAUREN GROFF,https://www.newyorker.com/magazine/2018/07/09/under-the-wave,0.534794,0,1,2.534794
51,THE WIND,LAUREN GROFF,https://www.newyorker.com/magazine/2021/02/01/the-wind,0.503612,0,1,2.503612
528,ABOVE AND BELOW,LAUREN GROFF,https://www.newyorker.com/magazine/2011/06/13/above-and-below,0.165649,0,1,2.165649
604,WAR DANCES,SHERMAN ALEXIE,https://www.newyorker.com/magazine/2009/08/10/war-dances,0.946919,1,0,1.946919
319,SILK BROCADE,TESSA HADLEY,https://www.newyorker.com/magazine/2015/07/27/silk-brocade,0.918279,1,0,1.918279


In [8]:
recommender(doc_topic_df, story_ix=22, story_df=corpus_df, num_recs=5, topicweight=1, styleweight=0.5, authorweight=0)

22 THE MOM OF BOLD ACTION GEORGE SAUNDERS


Unnamed: 0,TITLE,AUTHOR,URL,TOPIC_SIMILARITY,STYLE_MATCH,AUTHOR_MATCH,REC_SCORE
511,THE STAIN,TESSA HADLEY,https://www.newyorker.com/magazine/2011/11/07/the-stain,0.998557,1,0,1.498557
187,THE INTERMEDIATE CLASS,SAM ALLINGHAM,https://www.newyorker.com/magazine/2018/04/02/the-intermediate-class,0.958432,1,0,1.458432
717,FAITH,WILLIAM TREVOR,https://www.newyorker.com/magazine/2007/06/04/faith-5,0.934244,1,0,1.434244
353,REVEREND,TIM PARKS,https://www.newyorker.com/magazine/2014/12/08/reverend,0.91763,1,0,1.41763
715,ROY SPIVEY,MIRANDA JULY,https://www.newyorker.com/magazine/2007/06/11/roy-spivey,0.909554,1,0,1.409554


In [9]:
recommender(doc_topic_df, story_ix=30, story_df=corpus_df, num_recs=5, topicweight=1, styleweight=1, authorweight=1)

30 UNREAD MESSAGES SALLY ROONEY


Unnamed: 0,TITLE,AUTHOR,URL,TOPIC_SIMILARITY,STYLE_MATCH,AUTHOR_MATCH,REC_SCORE
917,OF MYSTERY THERE IS NO END,LEONARD MICHAELS,https://www.newyorker.com/magazine/2002/04/08/of-mystery-there-is-no-end,0.998907,1,0,1.998907
244,QUARANTINE,ALIX OHLIN,https://www.newyorker.com/magazine/2017/01/30/quarantine-by-alex-ohlin,0.998128,1,0,1.998128
691,THE VISITOR,MARISA SILVER,https://www.newyorker.com/magazine/2007/12/03/the-visitor-7,0.997703,1,0,1.997703
925,NACHMAN FROM LOS ANGELES,LEONARD MICHAELS,https://www.newyorker.com/magazine/2001/11/12/nachman-from-los-angeles,0.989505,1,0,1.989505
700,THE INSUFFERABLE GAUCHO,ROBERTO BOLANO,https://www.newyorker.com/magazine/2007/10/01/the-insufferable-gaucho,0.979926,1,0,1.979926
