## Data Preprocessing

In [None]:
import pandas as pd
import csv

In [None]:
news = pd.read_table('news.tsv',
                     header=None,
                     usecols=[0, 3, 4],
                     quoting=csv.QUOTE_NONE,
                     names=[
                       'id', 'title','abstract'
                     ])

In [None]:
news.head()

Unnamed: 0,id,title,abstract
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."


In [None]:
behaviors = pd.read_table(
    'behaviors.tsv',
    header=None,
    names=['impression_id', 'user', 'time', 'clicked_news', 'impressions'])

In [None]:
behaviors.clicked_news.fillna(' ', inplace=True)
behaviors.impressions = behaviors.impressions.str.split()

In [None]:
# behavior_user_example = behaviors[behaviors['user'] == 'U13740']
behavior_user_example = behaviors.copy()

In [None]:
behavior_user_example.head()

Unnamed: 0,impression_id,user,time,clicked_news,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,"[N55689-1, N35729-0]"
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,"[N20678-0, N39317-0, N58114-0, N20495-0, N4297..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,"[N50014-0, N23877-0, N35389-0, N49712-0, N1684..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,"[N35729-0, N33632-0, N49685-1, N27581-0]"
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,"[N39985-0, N36050-0, N16096-0, N8400-1, N22407..."


In [None]:
behavior_user_example['clicked_news'] = behavior_user_example['clicked_news'].str.split()

In [None]:
behavior_user_example_new = behavior_user_example.copy()

In [None]:
behavior_user_example_new['combined'] = behavior_user_example.apply(lambda row: row['clicked_news'] + row['impressions'], axis=1)
behavior_user_example_new = behavior_user_example_new.drop(columns=['clicked_news', 'impressions'])

In [None]:
behavior_user_example_new

Unnamed: 0,user,clicked_news,clicked_value
0,U13740,N55189,1
0,U13740,N42782,1
0,U13740,N34694,1
0,U13740,N45794,1
0,U13740,N18445,1
...,...,...,...
156963,U44625,N39317,0
156964,U64800,N61233,0
156964,U64800,N33828,1
156964,U64800,N19661,0


In [None]:
behavior_user_example_new = behavior_user_example_new.explode('combined')

In [None]:
def get_left_of_hyphen(s):
    parts = s.split("-")
    return parts[0] if len(parts) > 1 else s

In [None]:
def get_right_of_hyphen(s):
    parts = s.split("-")
    return parts[1] if len(parts) > 1 else 1

In [None]:
behavior_user_example_new['clicked_news'] = behavior_user_example_new['combined'].apply(get_left_of_hyphen)

In [None]:
behavior_user_example_new['clicked_value'] = behavior_user_example_new['combined'].apply(get_right_of_hyphen)

In [None]:
behavior_user_example_new = behavior_user_example_new.drop(columns=['impression_id', 'time'])

In [None]:
behavior_user_example_new = behavior_user_example_new.drop(columns=['combined'])

In [None]:
behavior_user_example_new = behavior_user_example_new.drop_duplicates()

In [None]:
news[news['clicked_news'] == 'N45794']

Unnamed: 0,clicked_news,title,abstract


In [None]:
news.rename(columns = {'id':'clicked_news'}, inplace = True)

In [None]:
behavior_merged = behavior_user_example_new.merge(news[['clicked_news','title', 'abstract']], on = 'clicked_news', how = 'inner')

In [None]:
behavior_merged.describe()

Unnamed: 0,user,clicked_news,clicked_value,title,abstract
count,5706634,5706634,5706634,5706634,5706634
unique,50000,48616,3,47799,47309
top,U63482,N47061,0,105 Black Friday Deals You Can Start Shopping ...,Save with early bird deals from stores like Wa...
freq,1440,16575,4594666,16575,16575


In [None]:
behavior_merged = behavior_merged.dropna()

In [None]:
behavior_merged[behavior_merged['clicked_value'] == '0']

Unnamed: 0,user,clicked_news,clicked_value,title,abstract
3871,U45632,N63302,0,This Wedding Photo of a Canine Best Man Captur...,"When Mark Doublet made his dog, Marley, the be..."
4231,U84229,N63302,0,This Wedding Photo of a Canine Best Man Captur...,"When Mark Doublet made his dog, Marley, the be..."
4305,U36488,N63302,0,This Wedding Photo of a Canine Best Man Captur...,"When Mark Doublet made his dog, Marley, the be..."
4319,U18021,N63302,0,This Wedding Photo of a Canine Best Man Captur...,"When Mark Doublet made his dog, Marley, the be..."
19063,U38751,N35458,0,Brazil oil auction a 'total disaster' as bidde...,Brazil's largest-ever auction of oil deposits ...
...,...,...,...,...,...
943644,U65895,N62092,0,"Marvel's Avengers game release date, news, tra...","After two years of secrecy, we've finally seen..."
943655,U69085,N13934,0,Things to do around Port Huron | November 2019,Community events happening around Port Huron i...
943764,U54826,N8075,0,Black Friday: The Best Deals on Kids' Toys and...,"From Legos to Frozen 2 merch, we've got you co..."
943811,U16965,N54013,0,This Louisville woman's tea shop will be an LG...,"Arielle Clark, a 28-year-old queer woman and L..."


In [None]:
behavior_merged['combined'] = behavior_merged['title'] + ' ' + behavior_merged['abstract']

In [None]:
behavior_merged.to_csv('main.csv', header=True)

## Recommendation System

In [None]:
import pandas as pd

behavior_merged = pd.read_csv('drive/MyDrive/main.csv')

In [None]:
behavior_merged.head()

Unnamed: 0.1,Unnamed: 0,user,clicked_news,clicked_value,title,abstract,combined
0,0,U13740,N55189,1,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
1,1,U10045,N55189,1,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
2,2,U85394,N55189,1,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
3,3,U78244,N55189,1,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
4,4,U27024,N55189,1,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import heapq

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
import spacy

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization using NLTK
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]

    # Remove empty tokens
    tokens = [token for token in tokens if token.strip() != '']

    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
behavior_merged['combined'] = behavior_merged['combined'].apply(preprocess_text)

In [None]:
behavior_merged.to_csv('preprocessed_data.csv', header=True)

NameError: name 'behavior_merged' is not defined

In [None]:
vectorizer = TfidfVectorizer(min_df = 0)

In [None]:
corpus = behavior_merged['combined']

In [None]:
tfidf_matrix = vectorizer.fit_transform(behavior_merged['combined'].values)
tfidf_matrix

<5706634x54746 sparse matrix of type '<class 'numpy.float64'>'
	with 110806712 stored elements in Compressed Sparse Row format>

In [None]:
tfidf_dense = tfidf_matrix.toarray()
tfidf_dense

In [None]:
article_similarity = pairwise_distances(tfidf_dense, metric='jaccard')

TypeError: scipy distance metrics do not support sparse matrices.