# Content-Based Recommendation

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords


## Data Preprocessing

In [4]:
# Absolute path of the input CSV (despite its name, DATA_DIR points at a file).
DATA_DIR = os.path.abspath(os.path.join('.', 'datasets', 'output', 'data_2.csv'))

# First CSV column becomes the index; dtype=object asks pandas not to infer
# numeric types for the data columns.
df = pd.read_csv(DATA_DIR, dtype=object, index_col=0)

# Preview the first rows.
df.head()


Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Interview,"﻿विडम्बना, ओलीको सत्ता सवारी पनि सिंहदरबारमा फ..."
1,Interview,﻿नेपाली कांग्रेसको सभापतिमा रामचन्द्र पौडेलको ...
2,Interview,﻿\nदुर्भाग्य नै भन्नुपर्छ हाम्रो प्राथमिकता पे...
3,Interview,"﻿नयनराज पाण्डे, लेखक\nविमोचन हुनै लागेको तपाईं..."
4,Interview,"﻿- भवन भट्ट, उपाध्यक्ष, गैर आवासीय नेपाली संघ\..."


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5475, 2)

In [6]:
# Remove rows containing missing values. This mutates df in place, and dropna
# keeps the original index labels (no reset), so labels and row positions
# diverge if any row is actually removed.
df.dropna(inplace=True) # drop rows with invalid values

# Expose the index labels as a regular 'id' column.
df['id'] = df.index
df

Unnamed: 0_level_0,class,text,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Interview,"﻿विडम्बना, ओलीको सत्ता सवारी पनि सिंहदरबारमा फ...",0
1,Interview,﻿नेपाली कांग्रेसको सभापतिमा रामचन्द्र पौडेलको ...,1
2,Interview,﻿\nदुर्भाग्य नै भन्नुपर्छ हाम्रो प्राथमिकता पे...,2
3,Interview,"﻿नयनराज पाण्डे, लेखक\nविमोचन हुनै लागेको तपाईं...",3
4,Interview,"﻿- भवन भट्ट, उपाध्यक्ष, गैर आवासीय नेपाली संघ\...",4
...,...,...,...
5470,Entertainment,﻿उनले ‘टिमवर्क’ र ‘लिडरसिप’ फिल्ममेकिङको महत्व...,5470
5471,Entertainment,रुपा बनिन् हेरिटेज क्वीन नेपाल\n\n\nराजधानीको ...,5471
5472,Entertainment,﻿प्रकाश र किरणको अर्थ एउटै भए पनि कालो पोथी का...,5472
5473,Entertainment,दानापुर तालमा पर्यटक लोभिँदै\n\nअहिले पर्यटक ल...,5473


In [7]:
# Dimensions after dropping missing rows and adding the 'id' column.
df.shape

(5475, 3)

## Building the TF-IDF similarity matrix

In [8]:
# Nepali stop words from NLTK (the NLTK 'stopwords' corpus must be available locally).
nepali_stop_words = set(stopwords.words('nepali'))

# TF-IDF vectorizer over space-separated tokens.
# - The tokenizer splits on single spaces only, so other whitespace (e.g. the
#   "\n" characters present in the articles) stays inside tokens.
# - sublinear_tf=True applies 1 + log(tf) scaling to term frequencies.
# - Terms appearing in more than half of the documents (max_df=0.5) or in
#   fewer than 10 documents (min_df=10) are discarded.
# - stop_words is passed as a list: newer scikit-learn releases validate this
#   parameter and reject a set. Sorting only makes the order deterministic.
tf = TfidfVectorizer(tokenizer=lambda x: x.split(" "),
                     sublinear_tf=True,
                     encoding='utf-8',
                     decode_error='replace',
                     max_df=0.5,
                     min_df=10,
                     stop_words=sorted(nepali_stop_words))

# One TF-IDF row per row of df, in df's row order.
tfidf_matrix = tf.fit_transform(df['text'])

# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity. The result is a dense
# (n_documents, n_documents) array.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Filled by the neighbour-ranking cell: article id -> list of (score, id, class).
results = {}



In [9]:
# Shape of the pairwise similarity matrix.
cosine_similarities.shape

(5475, 5475)

In [None]:
# For every article, store its nearest neighbours in `results`, keyed by the
# article's id, as (score, id, class) tuples ordered from most to least similar.

# Size of the top slice taken from each similarity row; the article itself is
# removed from it afterwards, leaving at most N_CANDIDATES - 1 neighbours.
N_CANDIDATES = 99

# Plain Python lists addressed by row position. Row `pos` of cosine_similarities
# corresponds to the pos-th row of df (the TF-IDF matrix was fitted on df['text']
# in row order), so every lookup below is positional and does not depend on the
# values of df's index labels.
article_ids = df['id'].tolist()
article_classes = df['class'].tolist()

for pos, article_id in enumerate(article_ids):
    scores = cosine_similarities[pos]

    # argsort is ascending; the reversed slice yields the N_CANDIDATES
    # highest-scoring row positions, best first.
    similar_indices = scores.argsort()[:-(N_CANDIDATES + 1):-1]

    # Drop the article itself explicitly instead of assuming it is ranked
    # first: duplicated texts tie with it, and an all-zero TF-IDF row scores 0
    # against itself.
    neighbours = [i for i in similar_indices if i != pos][:N_CANDIDATES - 1]

    results[article_id] = [
        (float(scores[i]), int(article_ids[i]), str(article_classes[i]))
        for i in neighbours
    ]

print('Completed!')

In [None]:
# Shape of the most recently computed array of top-ranked row indices.
similar_indices.shape

## Displaying Results

In [None]:


def item(id):
    """Return the text and class of the article whose index label is `id`.

    `id` is converted to `str` and then to `int` before the lookup, so both
    integers and numeric strings are accepted.

    Returns a dict with the keys 'text' and 'class', read from the module-level
    DataFrame `df`.
    """
    label = int(str(id))
    record = df.loc[label]
    return {'text': record['text'], 'class': record['class']}

def recommender(item_id, num):
    """Print the `num` most similar news items for the article `item_id`.

    Reads the precomputed neighbour lists from the module-level `results` dict
    (keyed by integer article id; each entry is a list of (score, id, class)
    tuples ordered from most to least similar) and uses `item()` to fetch the
    article texts.

    Parameters
    ----------
    item_id : value convertible to int
        Id of the article the recommendations are based on.
    num : int
        Maximum number of recommendations to print.

    Raises
    ------
    KeyError
        If `int(item_id)` has no entry in `results`.
    """
    source = item(item_id)  # look the source article up once

    print("\n")
    print("**Recommendation System**")
    print("The given news " + "'" + source['text'][:100] + "..." + "'" + " is of Category: " + str(source['class'][:100]))

    # The stored neighbour lists are ordered best match first, i.e. by descending similarity.
    print("\nThe " + str(num) + " most similar news in descending order of similarity is given below: ")

    recs = results[int(item_id)][:num]
    for index, rec in enumerate(recs):
        score, rec_id, rec_class = rec
        print("\n")
        print(str(index+1) + ") " + "Category: " + str(rec_class) + "\n\n" + "News: " + "'" + item(rec_id)['text'][:200] + "..." + "'" +" (score:" + str(score) + ")")

# Take the news item the user is currently interacting with (id 120) and show 10 recommended news items
recommender(item_id=120, num=10)

In [None]:
import joblib

# Persist the precomputed neighbour lists. Despite the file name, the object
# written is the `results` dict, not the dense similarity matrix.
joblib.dump(value=results, filename='similarity_matrix.joblib')