### Matthew Thompson

### Assignment 1

#### CAP 6676 Information Retrieval


Tasks:

- Install Python and NLTK (3 points)

- Tokenize the documents into words, remove stop words, and conduct stemming (5 points)

- Calculate tf-idf for each word in each document and generate document-word matrix (each element in the matrix is the tf-idf score for a word in a document) (7 points)

- Calculate pairwise cosine similarity for the documents (5 points)

In [2]:
import nltk, glob, os, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

Tokenize into words

In [4]:
def load_texts(directory=None):
    """Load every ``*.txt`` file of a directory into a dictionary.

    Each file is read in full, lower-cased, and whitespace-normalised
    (any run of spaces, tabs or newlines collapses to a single space).

    Args:
        directory: Folder to scan. Defaults to the current working
            directory, which is what a call without arguments has always
            used.

    Returns:
        dict mapping each file name without its ``.txt`` extension to the
        normalised text of that file, in sorted file-name order.
    """

    if directory is None:
        directory = os.getcwd()

    docs = {}
    # glob.escape keeps characters such as "[" in the folder name from being
    # interpreted as wildcard syntax; sorting makes document order reproducible.
    pattern = os.path.join(glob.escape(directory), '*.txt')
    for path in sorted(glob.glob(pattern)):
        # splitext strips only the final extension, so "a.b.txt" is keyed as
        # "a.b" rather than colliding with every other "a.*.txt" file.
        file_name = os.path.splitext(os.path.basename(path))[0]
        # NOTE: the file is decoded with the platform's default encoding.
        with open(path, 'r') as read_file:
            docs[file_name] = ' '.join(read_file.read().lower().split())
    return docs

# load_texts()

In [5]:
def tokenize_remove_stopwords_stem(doc_name, doc_text):
    """Tokenize a document, drop English stop words, and Porter-stem the rest.

    Args:
        doc_name: Identifier of the document. It is not used in the
            computation; the parameter is kept so existing callers work.
        doc_text: Text of the document.

    Returns:
        A 3-tuple ``(stemmed, tokenized_text, no_stopwords)``:
        ``tokenized_text`` is the token list after punctuation removal,
        ``no_stopwords`` is that list without English stop words, and
        ``stemmed`` is ``no_stopwords`` with every token Porter-stemmed.
    """

    # Remove every character that is neither a word character nor whitespace
    # before tokenizing, so punctuation never becomes a token of its own.
    text_no_punct = re.sub(r'[^\w\s]', '', doc_text)
    tokenized_text = nltk.word_tokenize(text_no_punct)

    # Build the stop-word set once per call. The previous version evaluated
    # stopwords.words('english') for every token and searched a list each time.
    english_stopwords = set(stopwords.words('english'))
    no_stopwords = [tok for tok in tokenized_text if tok not in english_stopwords]

    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(tok) for tok in no_stopwords]

    return stemmed, tokenized_text, no_stopwords

# docs = load_texts()
# tokenize_remove_stopwords_stem('100618newsML', docs['100618newsML'])

Calculate tf-idf and generate the document-word matrix

In [14]:
# TfidfVectorizer

def tfidf_vect(docs):
    """Build the tf-idf table for a collection of documents.

    Args:
        docs: dict mapping a document name to its text.

    Returns:
        pandas.DataFrame with one row per vocabulary term (labelled by the
        term) and one column per document, in the iteration order of
        ``docs``. Each cell is the tf-idf weight of the term in the document
        as computed by ``TfidfVectorizer`` with its default settings.
    """

    # Freeze the document order once so names and texts stay aligned.
    doc_names = list(docs)
    doc_text = [docs[name] for name in doc_names]

    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(doc_text)
    vocab = vectorizer.get_feature_names_out()

    # fit_transform yields a documents x terms matrix. Densify it once and
    # transpose to terms x documents instead of calling toarray() per document.
    # Passing vocab directly (not wrapped in a list) gives a plain Index of
    # terms rather than an accidental one-level MultiIndex.
    return pd.DataFrame(weights.toarray().T, index=vocab, columns=doc_names)


# tfidf_vect(docs)

Calculate pairwise cosine similarity for the documents

In [33]:
def cos_sim(df):
    """List the cosine similarity of every ordered pair of distinct columns.

    Args:
        df: DataFrame whose columns are the vectors to compare.

    Returns:
        list of str. The first entry is the header
        ``"Document : Document : Cosine similarity"``; each further entry is
        ``"<column> : <other column> : <similarity>"``, where the similarity
        is the array returned by ``cosine_similarity`` and is therefore
        rendered as ``[[value]]``. Pairs of a column with itself are omitted.
    """

    results = ["Document : Document : Cosine similarity"]

    # Reshape each column into a single-row 2-D array once, instead of once
    # per pair; cosine_similarity is called with these row vectors.
    row_vectors = [
        (col, np.reshape(np.array(df[col]), (1, -1))) for col in df.columns
    ]

    for col, q in row_vectors:
        for other, p in row_vectors:
            # Skip the diagonal before doing any work for it.
            if col == other:
                continue
            similarity = cosine_similarity(q, p)
            results.append(f"{col} : {other} : {similarity}")
    return results

# cos_sim(main())

In [34]:
def cos_sim_calc(q, d):
    """Return the cosine similarity of two vectors.

    Args:
        q: First vector (array, list, or pandas Series of numbers).
        d: Second vector, of the same length as ``q``. Values are compared
            position by position.

    Returns:
        ``dot(q, d) / (||q|| * ||d||)`` as a float. Exactly ``1.0`` when the
        two vectors are identical and non-zero, and ``0.0`` when the product
        of the norms is (numerically) zero.
    """

    q_vec = np.asarray(q, dtype=float)
    d_vec = np.asarray(d, dtype=float)

    denom = np.linalg.norm(q_vec) * np.linalg.norm(d_vec)

    # avoid /0 -- checked before the equality shortcut so that two all-zero
    # vectors give 0.0 like every other pair involving a zero vector.
    if np.isclose(denom, 0, atol=1e-32):
        return 0.0

    # Identical vectors: skip the arithmetic and return an exact 1.0 rather
    # than a value that may be off by floating-point rounding.
    if np.array_equal(q_vec, d_vec):
        return 1.0

    return np.dot(q_vec, d_vec) / denom

In [35]:
def cos_sim_matrix(df):
    """Build the square matrix of pairwise cosine similarities of columns.

    Args:
        df: DataFrame whose columns are the vectors to compare.

    Returns:
        pandas.DataFrame indexed by the column labels of ``df`` on both
        axes; the cell at ``[row, col]`` is
        ``cos_sim_calc(df[row], df[col])``.
    """

    labels = list(df.columns)

    # Compute all cells first and build the frame in one step, instead of
    # growing an empty frame one cell at a time through .loc.
    values = [
        [cos_sim_calc(df[row], df[col]) for col in labels]
        for row in labels
    ]

    # dtype=float keeps the result numeric even when there are no columns.
    return pd.DataFrame(values, index=labels, columns=labels, dtype=float)

# main() returns the tuple (tf_idf, cosine_sim_print, corr_matrix); the
# similarity matrix is built from the tf-idf table, i.e. its first element.
cos_sim_matrix(main()[0])

Unnamed: 0,100554newsML,100593newsML,100618newsML
100554newsML,1.0,0.728454,0.754437
100593newsML,0.728454,1.0,0.974443
100618newsML,0.754437,0.974443,1.0


In [None]:
# Scratch check: grow an empty frame one cell at a time through .loc,
# storing row label + column label in each cell.
test = pd.DataFrame()
for row_label in range(3):
    for col_label in range(3):
        cell_value = col_label + row_label
        test.loc[row_label, col_label] = cell_value
test

In [40]:
def main():
    """Run the pipeline end to end: load, clean, weight, compare.

    Returns:
        A 3-tuple ``(tf_idf, cosine_sim_print, corr_matrix)``: the tf-idf
        table from ``tfidf_vect``, the list of similarity lines from
        ``cos_sim``, and the similarity matrix from ``cos_sim_matrix``.
    """

    raw_docs = load_texts()

    # Element 0 of the helper's result is the stemmed token list; it is
    # joined back into one string per document for the vectorizer.
    cleaned_docs = {
        name: ' '.join(tokenize_remove_stopwords_stem(name, body)[0])
        for name, body in raw_docs.items()
    }

    tf_idf = tfidf_vect(cleaned_docs)
    return tf_idf, cos_sim(tf_idf), cos_sim_matrix(tf_idf)
# Run the whole pipeline. As the last expression of the cell, the returned
# (tf_idf, cosine_sim_print, corr_matrix) tuple is what gets displayed.
main()

(        100554newsML  100593newsML  100618newsML
 10          0.052176      0.069287      0.072361
 1040        0.026088      0.034644      0.036180
 1135        0.026088      0.034644      0.036180
 130         0.026088      0.034644      0.036180
 136         0.000000      0.058657      0.000000
 ...              ...           ...           ...
 work        0.026088      0.034644      0.036180
 would       0.182618      0.138574      0.144722
 wrangl      0.000000      0.044610      0.046589
 wrestl      0.026088      0.034644      0.036180
 year        0.156529      0.034644      0.036180
 
 [271 rows x 3 columns],
 ['Document : Document : Cosine similarity',
  '100554newsML : 100593newsML : [[0.72845372]]',
  '100554newsML : 100618newsML : [[0.75443741]]',
  '100593newsML : 100554newsML : [[0.72845372]]',
  '100593newsML : 100618newsML : [[0.97444338]]',
  '100618newsML : 100554newsML : [[0.75443741]]',
  '100618newsML : 100593newsML : [[0.97444338]]'],
               100554newsML