## AI/Machine Learning Intern Challenge: Simple Content-Based Recommendation


In [160]:
import numpy as np
import pandas as pd


In [199]:
df = pd.read_csv('bbc-news-data.csv', sep='\t').sample(500, random_state = 115)

df.head()

Unnamed: 0,category,filename,title,content
587,entertainment,078.txt,Baby becomes new Oscar favourite,Clint Eastwood's boxing drama Million Dollar ...
1412,sport,100.txt,Mido makes third apology,Ahmed 'Mido' Hossam has made another apology ...
748,entertainment,239.txt,Fightstar take to the stage,Charlie Simpson took his new band Fightstar t...
1706,sport,394.txt,Tindall wants second opinion,England centre Mike Tindall is to seek a seco...
758,entertainment,249.txt,Soul sensation ready for awards,"South West teenage singing sensation, Joss St..."


In [162]:
df['category'].unique()

array(['entertainment', 'sport', 'politics', 'tech', 'business'],
      dtype=object)

In [163]:
#checking missing value

df.isnull().sum()

category    0
filename    0
title       0
content     0
dtype: int64

In [209]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Define a custom tokenizer that removes stopwords and stems
def tokenize_and_stem(text):
    words = word_tokenize(text.lower())  # Tokenize and lower case the text
    filtered_words = [word for word in words if word not in stop_words and word.isalpha()]  # Remove stopwords and non-alphabetic words
    return [stemmer.stem(word) for word in filtered_words]  # Stem the remaining words

# Vectorize with the custom tokenizer
vectorizer = TfidfVectorizer(stop_words=None, tokenizer=tokenize_and_stem)
tfidf_matrix = vectorizer.fit_transform(documents)

# Example User Query
user_query = "economic forecasts and interest rates"
query_vector = vectorizer.transform([user_query])

# Compute Cosine Similarity
similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

# Find Top Matches
top_indices = similarity_scores[0].argsort()[-5:][::-1]
for idx in top_indices:
    print(f"Title: {df.iloc[idx]['title']}")
    print(f"Cosine Similarity: {similarity_scores[0][idx]:.4f}")
    print(f"Content: {df.iloc[idx]['content'][:200]}...")  # First 200 characters
    print("-" * 50)


[nltk_data] Downloading package punkt to /Users/sally/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sally/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Title: US economy shows solid GDP growth
Cosine Similarity: 0.3506
Content:  The US economy has grown more than expected, expanding at an annual rate of 3.8% in the last quarter of 2004.  The gross domestic product figure was ahead of the 3.1% the government estimated a month...
--------------------------------------------------
Title: Consumer spending lifts US growth
Cosine Similarity: 0.2770
Content:  US economic growth accelerated in the third quarter, helped by strong consumer spending, official figures have shown.  The economy expanded at an annual rate of 3.7% in the July to September period, ...
--------------------------------------------------
Title: Irish markets reach all-time high
Cosine Similarity: 0.2017
Content:  Irish shares have risen to a record high, with investors persuaded to buy into the market by low inflation and strong growth forecasts.  The ISEQ index of leading shares closed up 23 points to 6661.8...
--------------------------------------------------
Title: 

In [207]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

# Initialize stemmer
stemmer = PorterStemmer()

# Define a custom tokenizer function that stems words
def tokenize_and_stem(text):
    words = word_tokenize(text)
    return [stemmer.stem(word) for word in words]

# Vectorize with stemming
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize_and_stem)
tfidf_matrix = vectorizer.fit_transform(documents)

# Example User Query
user_query = "economic forecasts and interest rates"
query_vector = vectorizer.transform([user_query])

# Compute Cosine Similarity
similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

# Find Top Matches
top_indices = similarity_scores[0].argsort()[-5:][::-1]
for idx in top_indices:
    print(f"Title: {df.iloc[idx]['title']}")
    print(f"Cosine Similarity: {similarity_scores[0][idx]:.4f}")
    print(f"Category: {df.iloc[idx]['category']}")
    print(f"Content: {df.iloc[idx]['content'][:200]}...")  # First 200 characters
    print("-" * 50)




Title: US economy shows solid GDP growth
Cosine Similarity: 0.3699
Category: business
Content:  The US economy has grown more than expected, expanding at an annual rate of 3.8% in the last quarter of 2004.  The gross domestic product figure was ahead of the 3.1% the government estimated a month...
--------------------------------------------------
Title: Consumer spending lifts US growth
Cosine Similarity: 0.2968
Category: business
Content:  US economic growth accelerated in the third quarter, helped by strong consumer spending, official figures have shown.  The economy expanded at an annual rate of 3.7% in the July to September period, ...
--------------------------------------------------
Title: Irish markets reach all-time high
Cosine Similarity: 0.2060
Category: business
Content:  Irish shares have risen to a record high, with investors persuaded to buy into the market by low inflation and strong growth forecasts.  The ISEQ index of leading shares closed up 23 points to 6661.8...
-

In [195]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocessing: Focus on the 'content' column
documents = df['content'].tolist()

# Step 1: Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(documents)

# Example User Query
user_query = "the importance of mental health."
query_vector = vectorizer.transform([user_query])

# Step 2: Compute Cosine Similarity
similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

# Step 3: Find Top Matches
top_indices = similarity_scores[0].argsort()[-5:][::-1]
print("Top Matches:")
for idx in top_indices:
    print(f"Title: {df.iloc[idx]['title']}")
    print(f"Category: {df.iloc[idx]['category']}")
    print(f"Content: {df.iloc[idx]['content'][:200]}...")  # First 200 characters
    print("-" * 50)


Top Matches:
Title: Custody death rate 'shocks' MPs
Category: politics
Content: death in custody have reached shocking level committee of mp and peer ha warned the joint committee on human right found those committing suicide were mainly the most vulnerable with mental health dru...
--------------------------------------------------
Title: MPs to debate 'euthanasia laws'
Category: politics
Content: mp are preparing to debate bill which critic claim would legalise euthanasia by the back door the bill would give legal force to living will where people say they want medical treatment withheld if th...
--------------------------------------------------
Title: Blair looks to election campaign
Category: politics
Content: tony blair big speech will be looked back on a the performance that kicked off the election campaign that poll may still be about week away but there can be little doubt left that the campaign is now ...
--------------------------------------------------
Title: Soderling win

In [197]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocessing: Focus on the 'content' column
documents = df['content'].tolist()

# Step 1: Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(documents)

# Example User Query
user_query = "the importance of mental health."
query_vector = vectorizer.transform([user_query])

# Step 2: Compute Cosine Similarity
similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

# Step 3: Find Top Matches
top_indices = similarity_scores[0].argsort()[-5:][::-1]

print("Top Matches:")
for idx in top_indices:
    print(f"Title: {df.iloc[idx]['title']}")
    print(f"Category: {df.iloc[idx]['category']}")
    print(f"Cosine Similarity: {similarity_scores[0][idx]:.4f}")  # Display cosine similarity
    print(f"Content: {df.iloc[idx]['content'][:200]}...")  # First 200 characters
    print("-" * 50)


Top Matches:
Title: Custody death rate 'shocks' MPs
Category: politics
Cosine Similarity: 0.1430
Content: death in custody have reached shocking level committee of mp and peer ha warned the joint committee on human right found those committing suicide were mainly the most vulnerable with mental health dru...
--------------------------------------------------
Title: MPs to debate 'euthanasia laws'
Category: politics
Cosine Similarity: 0.1111
Content: mp are preparing to debate bill which critic claim would legalise euthanasia by the back door the bill would give legal force to living will where people say they want medical treatment withheld if th...
--------------------------------------------------
Title: Blair looks to election campaign
Category: politics
Cosine Similarity: 0.0631
Content: tony blair big speech will be looked back on a the performance that kicked off the election campaign that poll may still be about week away but there can be little doubt left that the campaign is n

In [165]:
#Preprocessing text first

import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string


lemmatizer= WordNetLemmatizer()

#Function

def preprocess_text(text):
    """Cleans the input text by performing multiple NLP preprocessing steps."""
    
    # lowercase
    text = text.lower()
    
    # Remove the pound sign from hash tags
    text = re.sub(r'#', '', text)
    
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)

    # Remove standalone numbers but keep mixed alphanumeric terms
    text = re.sub(r'\b\d+\b', '', text) 
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # drop words with fewer than 2 characters
    tokens = [word for word in tokens if len(word) >= 2]


    # lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)


df['content']= df['content'].apply(preprocess_text)

In [166]:
df['content']

587     clint eastwoods boxing drama million dollar ba...
1412    ahmed mido hossam ha made another apology to t...
748     charlie simpson took his new band fightstar to...
1706    england centre mike tindall is to seek second ...
758     south west teenage singing sensation joss ston...
                              ...                        
2120    apple ha unveiled new lowcost macintosh comput...
1545    southampton are set to unveil harry redknapp a...
1390    britain kathy butler continued her impressive ...
1626    two moment of magic from brian odriscoll guide...
2134    the bbc news website take look at how game on ...
Name: content, Length: 500, dtype: object

In [167]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def compute_cosine_similarity(df, text_column):
    """
    Compute TF-IDF vectors and cosine similarity for text data.

    Parameters:
        df (pd.DataFrame): The input dataframe.
        text_column (str): The name of the column containing text data.

    Returns:
        similarity_matrix (pd.DataFrame): A dataframe representing cosine similarity between items.
    """
    # Initialize the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()  # Limit features for efficiency
    
    # Transform the text data into TF-IDF vectors
    tfidf_matrix = vectorizer.fit_transform(df[text_column])
    
    # Compute cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Convert the similarity matrix into a DataFrame for better readability
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=df.index,
        columns=df.index
    )
    
    return similarity_df


In [168]:
# new Compute cosine similarity for the 'content' column
similarity_df = compute_cosine_similarity(df, text_column='content')

# Display the similarity matrix
print(similarity_df.head())


          587       1412      748       1706      758       1026      2102  \
587   1.000000  0.116760  0.096446  0.111056  0.170354  0.103905  0.119693   
1412  0.116760  1.000000  0.122431  0.146032  0.079413  0.152531  0.140081   
748   0.096446  0.122431  1.000000  0.096227  0.080912  0.127676  0.105322   
1706  0.111056  0.146032  0.096227  1.000000  0.069349  0.133717  0.107287   
758   0.170354  0.079413  0.080912  0.069349  1.000000  0.101771  0.074881   

          497       776       481   ...      609       543       371   \
587   0.091422  0.149624  0.125961  ...  0.149723  0.203143  0.116716   
1412  0.104110  0.139836  0.144173  ...  0.123234  0.114374  0.098386   
748   0.093486  0.163899  0.117026  ...  0.130256  0.110010  0.098283   
1706  0.089449  0.110438  0.109919  ...  0.105268  0.090734  0.082520   
758   0.082950  0.152081  0.091993  ...  0.103490  0.156747  0.077121   

          1159      1638      2120      1545      1390      1626      2134  
587   0.181274 

In [169]:
def search(df, text_column, query, k=5):
    """
    Search for the most similar items to a user query based on cosine similarity.
    
    Parameters:
        df (pd.DataFrame): The input dataframe.
        text_column (str): The name of the column containing text data.
        query (str): The user's search query.
        k (int): The number of top recommendations to return.
        
    Returns:
        recommendations (pd.DataFrame): A dataframe with the top k recommended items.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    
    # Combine the query with the existing content
    combined_text = df[text_column].tolist() + [query]
    
    # Initialize the TF-IDF vectorizer and transform the text
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(combined_text)
    
    # Compute cosine similarity between the query and all items
    similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1]).flatten()
    
    # Get the top k similar items
    top_indices = similarity_scores.argsort()[-k:][::-1]  # Indices of top k items
    recommendations = df.iloc[top_indices].copy()  # Explicitly create a copy
    recommendations['similarity'] = similarity_scores[top_indices]
    
    return recommendations[['title', 'category', 'similarity']]


In [170]:
# Sport:


user_query = "football scores"

# Search for top 5 recommendations
results = search(df, text_column='content', query=user_query, k=5)

results

Unnamed: 0,title,category,similarity
223,Man Utd to open books to Glazer,business,0.1614
980,UK's 'useless' quangos under fire,politics,0.155736
1674,Funding cut hits Wales Students,sport,0.127849
629,'My memories of Marley...',entertainment,0.07139
1935,Portable PlayStation ready to go,tech,0.062236


In [171]:
#Politics:



user_query = "the future of democracy"

# Search for top 5 recommendations
results = search(df, text_column='content', query=user_query, k=5)

results

Unnamed: 0,title,category,similarity
1175,Blair sees greater Bush consensus,politics,0.165449
1162,UKIP's secret weapon?,politics,0.132568
1067,Faith schools citizenship warning,politics,0.12633
1098,New foot and mouth action urged,politics,0.124972
1289,Economy focus for election battle,politics,0.110837


In [172]:
#Business:

user_query = "the future of cryptocurrency"

# Search for top 5 recommendations
results = search(df, text_column='content', query=user_query, k=5)

results

Unnamed: 0,title,category,similarity
1098,New foot and mouth action urged,politics,0.228383
1289,Economy focus for election battle,politics,0.199213
1410,Ronaldo considering new contract,sport,0.19655
2049,Millions to miss out on the net,tech,0.194739
88,Europe blames US over weak dollar,business,0.193982


In [173]:
#Entertainment:


user_query = "Big Brother"

# Search for top 5 recommendations
results = search(df, text_column='content', query=user_query, k=5)

results

Unnamed: 0,title,category,similarity
730,Bets off after Big Brother 'leak',entertainment,0.268556
712,Greer attacks 'bully' Big Brother,entertainment,0.245264
696,Double eviction from Big Brother,entertainment,0.079396
653,Eminem beats Elvis to number one,entertainment,0.06822
689,McCririck out of Big Brother show,entertainment,0.066285
