In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re

# Loading data

In [2]:
folders = os.listdir('../data/bbc')
folders.remove('.DS_Store')
folders.remove('README.TXT')

# Putting all the data in a data frame

In [3]:
data = pd.DataFrame()
k=0
for f in folders:
    files = os.listdir('../data/bbc/{}'.format(f))
    for file in files:
        if (f != 'sport') & (file != '199.txt'):
            dt = open('../data/bbc/{}/{}'.format(f,file), 'r')
            data.loc[k,'topic'] = f
            data.loc[k,'article'] = dt.read()
            dt.close()
            k+=1

In [4]:
data.head()

Unnamed: 0,topic,article
0,entertainment,Musicians to tackle US red tape\n\nMusicians' ...
1,entertainment,"U2's desire to be number one\n\nU2, who have w..."
2,entertainment,Rocker Doherty in on-stage fight\n\nRock singe...
3,entertainment,Snicket tops US box office chart\n\nThe film a...
4,entertainment,Ocean's Twelve raids box office\n\nOcean's Twe...


# Data cleaning

In [5]:
def clean_data(row):
    # Remove Unicode
    document_test = re.sub(r'[^\x00-\x7F]+', ' ', row)
    # Remove Mentions
    document_test = re.sub(r'@\w+', '', document_test)
    # Lowercase the document
    document_test = document_test.lower()
    # Lowercase the numbers
    document_test = re.sub(r'[0-9]', '', document_test)
    # Remove the doubled space
    document_test = re.sub(r'\s{2,}', ' ', document_test)
    return document_test

data['article_clean'] = data['article'].apply(clean_data)

# Creating TF-IDF matrix

In [6]:
def get_tfidf(data):
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(data.loc[:,'article_clean'])
    X = X_tfidf.T.toarray()
    df = pd.DataFrame(X, index=vectorizer.get_feature_names())
    return df, vectorizer 

# Function to search the articles collection on query string

In [7]:
def get_similar_articles(q, data):
    print("query:", q)
    # Convert the query become a vector
    df, vectorizer = get_tfidf(data)
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    sim = {}
    # Calculate the similarity
    for i in range(len(data)):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)

    # Sort the values 
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    art = max(sim, key=sim.get)
    print("Cosine Similaritas:", sim[art])
    doc_idx = int(art)
    print(doc_idx)
    print("Document:", data.loc[doc_idx,'article'])
    # Print the articles and their similarity values
    

In [8]:
q1 = 'frontman movie'
get_similar_articles(q1, data)

query: frontman movie
Cosine Similaritas: 0.16644100210162358
211
Document: Lost Doors frontman movie found

Historians in Florida have discovered a 40-year-old clip of a clean-cut Jim Morrison appearing in a promotional film for his university.

The 1964 film shows the Doors frontman, who died aged 27 in 1971, playing the part of a young man who had been rejected by Florida State University. Morrison is seen quizzing a college administrator on why he was refused. "But what happened? How come my parents and the state and the university didn't look ahead?" he is seen asking.

"It's incredible. He's so clean cut and soft-spoken," said Florida state archivist Jody Norman. "We know he was at Florida State University for a period of time and he did some acting when he was there," Norman added. The Doors were one of the most influential bands of the 1960s, with hits including Light My Fire and Riders On The Storm. Morrison was notorious for his wild lifestyle - and was accused of exposing hi