### Matthew Thompson

### Assignment 1

#### CAP 6676 Information Retrieval


Tasks:

- Install Python and NLTK (3 points)

- Tokenize the documents into words, remove stop words, and conduct stemming (5 points)

- Calculate tf-idf for each word in each document and generate document-word matrix (each element in the matrix is the tf-idf score for a word in a document) (7 points)

- Calculate pairwise cosine similarity for the documents (5 points)

In [100]:
import nltk, glob, os, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

Tokenize into words

In [182]:
def load_texts():
    """load texts from directory: return dictionary of texts"""
    
    docs = {}
    files = glob.glob(os.path.join(os.getcwd(), '*.txt'))
    for file in files:
        full_file_name = os.path.split(file)[-1]
        file_name = full_file_name.split('.')[0]
#         print(file_name)
        with open(file, 'r') as read_file:
            text = read_file.readlines()
            join_lines = ' '.join(text)
            strip_nl = [x.lower().strip() for x in join_lines.split()]
            docs[file_name] = ' '.join(strip_nl)
    return docs

load_texts()

{'100554newsML': 'channel tunnel operator eurotunnel on monday announced details of a deal giving bank creditors 45.5 percent of the company in return for wiping out 1.0 billion pounds ($1.6 billion) of its massive debts. the long-awaited but highly complex restructuring of nearly nearly nine billion pounds of debt and unpaid interest throws the company a lifeline which could secure what is still likely to be a difficult future. the deal, announced simultaneously in paris and london, brings the company back from the brink of bankruptcy but leaves current shareholders, who have already seen their investment dwindle, owning only 54.5 percent of the company. "we have fixed and capped the interest payments and arranged only to pay what is available in cash," eurotunnel co-chairman alastair morton told reporters at a news conference. "avoiding having to do this again is the name of the game." morton said the plan provides the anglo-french company with the medium term financial stability to 

In [183]:
def tokenize_remove_stopwords_stem(doc_name, doc_text):
    """returns tokenized, stemmed text, without stop words"""
    
#     results = {}
    text_no_punct = re.sub(r'[^\w\s]','', doc_text)
    tokenized_text = nltk.word_tokenize(text_no_punct)
    no_stopwords = [x for x in tokenized_text if x not in stopwords.words('english')]
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(x) for x in no_stopwords]
#     results[doc_name] = stemmed

    return stemmed, tokenized_text, no_stopwords

docs = load_texts()
tokenize_remove_stopwords_stem('100618newsML', docs['100618newsML'])

(['anglofrench',
  'channel',
  'tunnel',
  'oper',
  'eurotunnel',
  'monday',
  'announc',
  'deal',
  'give',
  'creditor',
  'bank',
  '455',
  'percent',
  'compani',
  'return',
  'wipe',
  'one',
  'billion',
  'pound',
  '156',
  'billion',
  'debt',
  'mountain',
  'longawait',
  'restructur',
  'bring',
  'end',
  'month',
  'wrangl',
  'eurotunnel',
  '225',
  'bank',
  'owe',
  'nearli',
  'nine',
  'billion',
  'pound',
  'deal',
  'announc',
  'simultan',
  'pari',
  'london',
  'bring',
  'compani',
  'back',
  'brink',
  'insolv',
  'leav',
  'sharehold',
  'own',
  '545',
  'percent',
  'compani',
  'restructur',
  'plan',
  'provid',
  'eurotunnel',
  'medium',
  'term',
  'financi',
  'stabil',
  'allow',
  'consolid',
  'substanti',
  'commerci',
  'achiev',
  'date',
  'develop',
  'oper',
  'eurotunnel',
  'cochairman',
  'alastair',
  'morton',
  'said',
  'firm',
  'make',
  'profit',
  'interest',
  'ad',
  'although',
  'sharehold',
  'see',
  'interest',
  'd

Calculate tf-idf and generate document word matrix

In [224]:
# TfidfVectorizer

def tfidf_vect(docs):
    """creates tf-idf vector for all docs"""
    
    # create order
    doc_names = []
    doc_text = []
    for k,v in docs.items():
        doc_names.append(k)
        doc_text.append(v)
    all_text =  doc_text
    vectorizer = TfidfVectorizer()
    tfidf_vect = vectorizer.fit_transform(all_text)
    vocab = vectorizer.get_feature_names_out()
    tf_idf_ = pd.DataFrame(index = [vocab])
    for i, doc_name in enumerate(doc_names):
#         tf_idf_[doc_name] = np.zeros(tf_idf_.shape[0]) #works so why doesn't it populate with the actual data?
        tf_idf_[doc_name] = tfidf_vect.toarray()[i]

    return tf_idf_


tfidf_vect(docs)

Unnamed: 0,100554newsML,100593newsML,100618newsML
10,0.026060,0.038782,0.038769
113,0.013030,0.019391,0.019385
13,0.000000,0.032832,0.000000
130,0.013030,0.019391,0.019385
14,0.000000,0.032832,0.000000
...,...,...,...
wrangling,0.000000,0.024970,0.024961
wrestled,0.013030,0.019391,0.019385
year,0.022062,0.000000,0.000000
years,0.065151,0.019391,0.019385


Calculate pairwise cosine similarity for the documents

In [253]:
def cos_sim(df):
    """creates matrix of cosine similarities"""
    
    cols = df.columns
    cos_sim_matrix = pd.DataFrame({'doc': cols})
    for col in cols:
        for j in range(len(cols)):
            q = np.reshape(np.array(df[cols[i]]), (1, -1))
            p = np.reshape(np.array(df[cols[j]]), (1, -1))
#             cos_sim_matrix.iloc[i,col] = cosine_similarity(q, p)
            print(cosine_similarity(q, p))
    return cos_sim_matrix

cos_sim(main())

[[0.75443741]]
[[0.97444338]]
[[1.]]
[[0.75443741]]
[[0.97444338]]
[[1.]]
[[0.75443741]]
[[0.97444338]]
[[1.]]


Unnamed: 0,doc
0,100554newsML
1,100593newsML
2,100618newsML


In [238]:
test = pd.DataFrame()
for i in range(3):
    for j in range(3):
        test.loc[i,j] = j + i
test

Unnamed: 0,0,1,2
0,0.0,1.0,2.0
1,1.0,2.0,3.0
2,2.0,3.0,4.0


In [186]:
def main():
    """runs text processing"""
    
    docs = load_texts()
    cleaned_docs = {}
    for doc, text in docs.items():
        token_nostop_stem = tokenize_remove_stopwords_stem(doc, text)
        cleaned_docs[doc] = ' '.join(token_nostop_stem[0])#.values()
#     print(cleaned_docs)
    tf_idf = tfidf_vect(cleaned_docs)
    
    return tf_idf
main()

Unnamed: 0,100554newsML,100593newsML,100618newsML
10,0.052176,0.069287,0.072361
1040,0.026088,0.034644,0.036180
1135,0.026088,0.034644,0.036180
130,0.026088,0.034644,0.036180
136,0.000000,0.058657,0.000000
...,...,...,...
work,0.026088,0.034644,0.036180
would,0.182618,0.138574,0.144722
wrangl,0.000000,0.044610,0.046589
wrestl,0.026088,0.034644,0.036180
