## Import Libraries

In [2]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px

## Import Data

### News

In [None]:
# Column names for news.tsv; they are passed to read_csv explicitly via
# names= together with header=None.
columns = [
    'News ID',
    'Category',
    'SubCategory',
    'Title',
    'Abstract',
    'URL',
    'Title Entities',
    'Abstract Entites',
]
news_df = pd.read_csv(
    './MINDsmall_train/news.tsv',
    sep='\t',
    header=None,
    names=columns,
)
news_df.head()

In [None]:
# Print a summary of the news frame (columns, non-null counts, dtypes)
news_df.info()

### Behavior

In [None]:
# Column names for behaviors.tsv; they are passed to read_csv explicitly via
# names= together with header=None.
columns = [
    'Impression ID',
    'User ID',
    'Time',
    'History',
    'Impressions',
]
behaviors_df = pd.read_csv(
    './MINDsmall_train/behaviors.tsv',
    sep='\t',
    header=None,
    names=columns,
)
behaviors_df.head()

## Experiments

### Limiting to Title, Category, Subcategory, and Abstract

In [None]:
# Keep only the text features used to build the recommender.
# .copy() makes df an independent frame, so the in-place edits in later cells
# (dropna, new columns, drop) never act on a slice of news_df and do not
# raise SettingWithCopyWarning.
df = news_df[['Title', 'Category', 'SubCategory', 'Abstract']].copy()
df.head()

### Drop rows with missing abstracts

In [None]:
# Inspect non-null counts per column before dropping incomplete rows
df.info()

In [None]:
# Drop every row that has a missing value in any of the selected columns
# (no column is dropped here). Reassigning instead of using inplace=True
# avoids mutating a slice of news_df and the SettingWithCopyWarning that
# comes with it. The original index labels are kept.
df = df.dropna()

df.info()

### Generating Bag of Words

In [None]:
def extract_keywords(text):
    """Return the RAKE key words found in ``text`` as a list of strings."""
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(text)

    # get_word_degrees() returns a dictionary with key words as keys and
    # their scores as values; only the words themselves are kept
    return list(r.get_word_degrees().keys())


# Build the Keywords column with apply() rather than assigning to the rows
# yielded by iterrows(): those rows may be copies, in which case the
# assignment is silently lost and Keywords stays empty.
df = df.assign(Keywords=df['Abstract'].apply(extract_keywords))

# dropping the Abstract column, which is no longer needed
df = df.drop(columns=['Abstract'])

df.head()

In [None]:
def _bag_of_words(row):
    """Join a row's category, subcategory and keywords into one string."""
    tokens = [row['Category'], row['SubCategory']] + row['Keywords']
    return ' '.join(tokens)


df['Bag of Words'] = df.apply(_bag_of_words, axis=1)

# Category, SubCategory and Keywords are now folded into 'Bag of Words',
# so drop all three in one call
df.drop(columns=['Category', 'SubCategory', 'Keywords'], inplace=True)

df.head()

### Applying Count Vectorizer

In [None]:
# Turn every bag of words into a row of token counts
count = CountVectorizer()
count_matrix = count.fit_transform(df['Bag of Words'])

# Pairwise cosine similarity between all rows; with a single argument
# scikit-learn compares the matrix against itself
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# defining the function that takes in a news title
# as input and returns the top 10 recommended articles
def recommendations(title, cosine_sim = cosine_sim):
    """Return the ``df`` index labels of the 10 articles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value of the 'Title' column to look up. If several rows share
        the title, the first one is used.
    cosine_sim : array-like
        Square similarity matrix whose i-th row/column corresponds to the
        i-th row (by position) of ``df``.

    Returns
    -------
    list
        Up to 10 index labels of ``df``, most similar first, suitable for
        ``df.loc[...]``. The queried article itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # cosine_sim is addressed by row position, while df.index holds labels
    # that stop being 0..n-1 once rows are dropped, so the title has to be
    # resolved to a position rather than a label.
    positions = (df['Title'] == title).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise IndexError('no article with title {!r}'.format(title))
    idx = positions[0]

    # similarity of every article to the queried one, keyed by row position
    score_series = pd.Series(cosine_sim[idx])

    # remove the queried article explicitly instead of assuming it sorts
    # first: duplicate articles tie with it at the maximum score
    ranked = score_series.drop(idx).sort_values(ascending = False)

    # positions of the 10 most similar articles
    top_10_positions = ranked.index[:10].to_numpy()

    # translate positions back into df index labels
    return df.index[top_10_positions].tolist()

## Testing

In [None]:
# Look up the recommendations for one article and show their titles
indices = recommendations('50 Worst Habits For Belly Fat')

df.loc[indices, 'Title']

In [None]:
# Concatenate every 'sports' headline into one string: a leading space,
# then each title followed by a space
sports_titles = news_df.loc[news_df['Category'] == 'sports', 'Title']
text = ' ' + ''.join(title + ' ' for title in sports_titles)

# Build the word cloud and render it without axes
wordcloud = WordCloud().generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

plt.show()

In [None]:
# Concatenate every 'news' headline into one string: a leading space,
# then each title followed by a space
news_titles = news_df.loc[news_df['Category'] == 'news', 'Title']
text = ' ' + ''.join(title + ' ' for title in news_titles)

# Build the word cloud and render it without axes
wordcloud = WordCloud().generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

plt.show()

In [None]:
# Number of articles per (Category, SubCategory) pair
c = news_df[['Category', 'SubCategory']].value_counts()

# Collect the (Category, SubCategory) pairs from the index of the counts
# into one array with a row per pair
index = np.array([np.array(pair) for pair in c.index])

In [None]:
# Tabulate the counts for plotting.
# NOTE: this rebinds `df`, which recommendations() reads as a global, so
# the recommender cells must be re-run before calling it after this cell.
df = pd.DataFrame({
    'Category': index[:, 0],
    'Sub Category': index[:, 1],
    'Values': c.values,
})

# One bar per category, coloured by sub category
px.bar(df, x='Category', y='Values', color='Sub Category')