In [5]:
import pandas as pd
import re
import string
import numpy as np

# libraries for visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

# Library to suppress warnings or deprecation notes
import warnings

warnings.filterwarnings("ignore")

In [6]:
# load the data
news = pd.read_csv('data/articles1.csv')
news2 = pd.read_csv('data/articles3.csv')
df_test = pd.read_csv('data/articles2.csv')

In [7]:
# reset the index
news.reset_index(drop=True, inplace=True)
news2.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

# drop the unnamed column
news.drop(columns='Unnamed: 0', inplace=True)
news2.drop(columns='Unnamed: 0', inplace=True)
df_test.drop(columns='Unnamed: 0', inplace=True)



In [10]:
# subset the dataframe and get relevant columns
df = pd.concat([news, news2])
columns = ['id', 'title', 'content']
df = df[columns]
df_test = df_test[columns]
df.head()

Unnamed: 0,id,title,content
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ..."


In [18]:
df.shape[0]

92571

### Document Pre-processing


In [12]:
# function 1
def clean_text(text):
    """
    takes a text, remove the words with digits, 
    replace newline characters with space, remove URLs, 
    and replace everything that isn’t English alphabets with space.
    """
    text=re.sub('\w*\d\w*','', text)
    text=re.sub('\n',' ',text)
    text=re.sub(r"http\S+", "", text)
    text=re.sub('[^a-z]',' ',text)
    return text

def lower_case(text):
    return text.lower()

#### 1. remove capital letters

In [13]:
df['clean'] = df['content'].apply(lambda x: x.lower())
df_test['clean'] = df_test['content'].apply(lambda x: x.lower())

df.head(2)

Unnamed: 0,id,title,content,clean
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington — congressional republicans have...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...","after the bullet shells get counted, the blood..."


#### 2. remove urls, words with numbers, newline characters with space

In [20]:
df['clean'] = df['clean'].apply(lambda x: clean_text(x))
df_test['clean'] = df_test['clean'].apply(lambda x: clean_text(x))
df.head(2)

Unnamed: 0,id,title,content,clean
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington congressional republicans have a ne...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",after the bullet shells get counted the blood ...


#### 3. remove the extra space that we added
        

In [21]:
df['clean'] = df['clean'].apply(lambda x: re.sub(' +',' ',x))
df_test['clean'] = df_test['clean'].apply(lambda x: re.sub(' +',' ',x))

df.head(2)

Unnamed: 0,id,title,content,clean
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington congressional republicans have a ne...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",after the bullet shells get counted the blood ...


#### 3. Remove stop words

In [22]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brianzuki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
# instantiate stopwords list
stop_words = stopwords.words("english")

def remove_stop_words(txt):
    """
    takes text and turns it into tokens using a word tokenizer
    then returns tokens not in the stopword list
    """
    tokens = word_tokenize(txt)
    text_with_no_stop_words = [
        token for token in tokens if not token in stop_words
    ]
    txt = " ".join(text_with_no_stop_words)

    return txt

df['tokens'] = df['clean'].apply(lambda x: remove_stop_words(x))
df_test['tokens'] = df_test['clean'].apply(lambda x: remove_stop_words(x))
df.head(2)

Unnamed: 0,id,title,content,clean,tokens
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington congressional republicans have a ne...,washington congressional republicans new fear ...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",after the bullet shells get counted the blood ...,bullet shells get counted blood dries votive c...


#### 4. Lemmatize the text

In [24]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

def lemmatize(txt):
    """
    takes in a clean text without punctuations and lemmatized it
    """
    word_list = nltk.word_tokenize(txt)
    # Lemmatize list of words and join
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return lemmatized_output

df['lemmatized'] = df['tokens'].apply(lambda x: lemmatize(x))
df_test['lemmatized'] = df_test['tokens'].apply(lambda x: lemmatize(x))
df.head(2)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brianzuki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,title,content,clean,tokens,lemmatized
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington congressional republicans have a ne...,washington congressional republicans new fear ...,washington congressional republican new fear c...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",after the bullet shells get counted the blood ...,bullet shells get counted blood dries votive c...,bullet shell get counted blood dry votive cand...


In [25]:
def preprocess(data):
    data = lower_case(data)
    data = clean_text(data)
    data = remove_stopwords(data)
    data = lemmatize(data)
    return data


## Creating Vectors
In this section, we’ll train our word2vec model and generate vectors for documents and queries in the testing set for information retrieval. But before that, we’ll prepare the dataset for training the word2vec model.

In [26]:
from gensim.models import Word2Vec

# Creating data for the model training
train_data=[]
for i in df.reset_index():
    train_data.append(i.split())

# Training a word2vec model from the given data set
w2v_model = Word2Vec(train_data, vector_size=1000, min_count=1, window=3, sg=1,workers=4)

In [None]:
# save the trained model for later use


In [34]:
# Function returning vector reperesentation of a document
def get_embedding_w2v(doc_tokens):
    embeddings = []
    if len(doc_tokens)<1:
        return np.zeros(1000) # 300 = vector_size
    else:
        for tok in doc_tokens:
            if tok in w2v_model.wv:
                embeddings.append(w2v_model.wv.word_vec(tok))
              
            else:
                embeddings.append(np.random.rand(1000))
        # mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0)

In [35]:
df['vector'] = df['lemmatized'].apply(lambda x :get_embedding_w2v(x.split()))
df_test['vector'] = df_test['lemmatized'].apply(lambda x :get_embedding_w2v(x.split()))
df.head(2)

Unnamed: 0,id,title,content,clean,tokens,lemmatized,vector
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington congressional republicans have a ne...,washington congressional republicans new fear ...,washington congressional republican new fear c...,"[0.496746014399116, 0.48928404110920937, 0.510..."
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",after the bullet shells get counted the blood ...,bullet shells get counted blood dries votive c...,bullet shell get counted blood dry votive cand...,"[0.49361261408773016, 0.5040055327335231, 0.50..."


#### making the pipeline

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
def ranking_ir(query):
  
    # pre-process Query
    query = query.lower()
    query = clean_text(query)
    query = re.sub(' +',' ',query)

    # generating vector
    vector = get_embedding_w2v(query.split())

    # ranking documents
    documents=df_test[columns].copy()
    documents['similarity'] = df_test['vector'].apply(lambda x: cosine_similarity(np.array(vector).reshape(1, -1),np.array(x).reshape(1, -1)).item())
    documents.sort_values(by='similarity',ascending=False,inplace=True)

    return documents.head(10).reset_index(drop=True)

In [3]:
ranks = ranking_ir('machine learning and the advent of artificial intelligence')
ranks


NameError: name 'clean_text' is not defined