In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
# Workbook holding one patent per row: an integer key plus its abstract text.
path = '~/Desktop/Research Associate Assignment/Research-Aptitude-Test-Data/Q2_Dataset.xlsx'
# Pin the column dtypes at load time rather than relying on Excel inference.
df = pd.read_excel(
    io=path,
    dtype={'patentkey': int, 'text': str},
)

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   patentkey  100 non-null    int64 
 1   text       100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB
None


In [4]:
# Peek at the first few abstracts.
print(df['text'].head())

0    Compressing and decompressing text files  A me...
1    Method and operating system for executing prog...
2    Compressing and decompressing text files  A me...
3    Method of debugging a computer program  A meth...
4    Method of resetting sequence of access to exte...
Name: text, dtype: object


In [5]:
# Split every abstract into runs of word characters (punctuation is dropped).
tokenizer = nltk.RegexpTokenizer(r"\w+")
df['tokenized_text'] = df['text'].apply(tokenizer.tokenize)
# Number of tokens found in each abstract.
df['num_words'] = df['tokenized_text'].map(len)
print(df.head())

   patentkey                                               text  \
0    4955066  Compressing and decompressing text files  A me...   
1    5027273  Method and operating system for executing prog...   
2    5109433  Compressing and decompressing text files  A me...   
3    5124989  Method of debugging a computer program  A meth...   
4    5125087  Method of resetting sequence of access to exte...   

                                      tokenized_text  num_words  
0  [Compressing, and, decompressing, text, files,...        190  
1  [Method, and, operating, system, for, executin...        188  
2  [Compressing, and, decompressing, text, files,...        190  
3  [Method, of, debugging, a, computer, program, ...        206  
4  [Method, of, resetting, sequence, of, access, ...        230  


In [6]:
# Case-insensitive token frequencies for each abstract.
df['friqDist'] = df['tokenized_text'].apply(
    lambda tokens: nltk.FreqDist(token.lower() for token in tokens)
)
print(df.head())

   patentkey                                               text  \
0    4955066  Compressing and decompressing text files  A me...   
1    5027273  Method and operating system for executing prog...   
2    5109433  Compressing and decompressing text files  A me...   
3    5124989  Method of debugging a computer program  A meth...   
4    5125087  Method of resetting sequence of access to exte...   

                                      tokenized_text  num_words  \
0  [Compressing, and, decompressing, text, files,...        190   
1  [Method, and, operating, system, for, executin...        188   
2  [Compressing, and, decompressing, text, files,...        190   
3  [Method, of, debugging, a, computer, program, ...        206   
4  [Method, of, resetting, sequence, of, access, ...        230   

                                            friqDist  
0  {'compressing': 3, 'and': 3, 'decompressing': ...  
1  {'method': 1, 'and': 5, 'operating': 4, 'syste...  
2  {'compressing': 3, 'and': 

In [7]:
# Make sure the NLTK stop-word corpus is available locally before it is read
# below; the download is only really needed the first time this runs.
nltk.download('stopwords') ##Run for first time download
# English stop-word list; later cells use it to filter tokens by membership.
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anahita.khanna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Token frequencies per abstract with English stop words removed.
# The stop-word list is turned into a set once so each membership test is a
# hash lookup rather than a scan of the whole list, and every token is
# lower-cased a single time instead of twice.
_stopword_set = set(stopwords)
df['friqExceptStopDist'] = df['tokenized_text'].apply(
    lambda tokens: nltk.FreqDist(
        word for word in map(str.lower, tokens) if word not in _stopword_set
    )
)

In [9]:
# Show the first five rows, now including the stop-word-free frequencies.
print(df.head(n=5))

   patentkey                                               text  \
0    4955066  Compressing and decompressing text files  A me...   
1    5027273  Method and operating system for executing prog...   
2    5109433  Compressing and decompressing text files  A me...   
3    5124989  Method of debugging a computer program  A meth...   
4    5125087  Method of resetting sequence of access to exte...   

                                      tokenized_text  num_words  \
0  [Compressing, and, decompressing, text, files,...        190   
1  [Method, and, operating, system, for, executin...        188   
2  [Compressing, and, decompressing, text, files,...        190   
3  [Method, of, debugging, a, computer, program, ...        206   
4  [Method, of, resetting, sequence, of, access, ...        230   

                                            friqDist  \
0  {'compressing': 3, 'and': 3, 'decompressing': ...   
1  {'method': 1, 'and': 5, 'operating': 4, 'syste...   
2  {'compressing': 3, 'and

In [10]:
# Ten most frequent non-stop-word tokens of each abstract, as (token, count) pairs.
df['most_common'] = df['friqExceptStopDist'].apply(
    lambda freq: freq.most_common(10)
)
print(df['most_common'].head())

0    [(text, 13), (compressed, 8), (pass, 7), (comp...
1    [(mode, 9), (multi, 5), (operating, 4), (syste...
2    [(text, 13), (compressed, 8), (pass, 7), (comp...
3    [(debug, 12), (program, 9), (commands, 8), (ta...
4    [(segment, 8), (interrupt, 7), (code, 7), (met...
Name: most_common, dtype: object


In [11]:
# Build the cleaned text fed to TF-IDF: letters only, lower-cased, stop words removed.
#
# Non-letters are replaced by spaces *before* the text is split into words.
# Doing it per whitespace-separated chunk (as before) turned "the," into
# "the " and "in-line" into "in line", neither of which equals a stop word,
# so stop words next to punctuation or inside hyphenated terms slipped through.
_non_letters = re.compile(r'[^a-zA-Z]')
_stopword_set = set(stopwords)  # set: hash lookup per word instead of a list scan


def _clean_text(text):
    """Return ``text`` lower-cased, stripped of non-letters and stop words."""
    words = _non_letters.sub(' ', text).lower().split()
    return " ".join(word for word in words if word not in _stopword_set)


df['text_cleaned'] = df['text'].apply(_clean_text)

In [12]:
# Learn the vocabulary / IDF weights and vectorise the cleaned abstracts in a
# single pass. fit() followed by transform() on the same corpus tokenises
# every document twice; fit_transform() is the equivalent one-pass form.
tfidfvectoriser = TfidfVectorizer()
tfidf_vectors = tfidfvectoriser.fit_transform(df.text_cleaned)

In [13]:
# Dense (n_docs x n_docs) cosine-similarity matrix between all abstracts.
# cosine_similarity() is used instead of np.dot(): NumPy functions are not a
# supported way to multiply SciPy sparse matrices, and this also stays correct
# if the vectoriser is ever configured without L2 normalisation.
pairwise_similarities = cosine_similarity(tfidf_vectors)

# Label rows and columns with the real patent keys. With the default
# RangeIndex the exported "patentkey" column held row positions (0..n-1),
# not patent keys. Row i of the matrix corresponds to row i of df.
_patent_keys = df['patentkey'].to_numpy()
pairwise_similarities_df = pd.DataFrame(
    pairwise_similarities,
    index=_patent_keys,
    columns=_patent_keys,
)
pairwise_similarities_df.to_csv('~/Desktop/Research Associate Assignment/A2_cosine_similarity_scores.csv', index_label='patentkey')

In [14]:
def most_similar(patent_id, similarity_matrix, matrix, top_n=None, data=None):
    """Print the patents most similar to ``patent_id``, best match first.

    Parameters
    ----------
    patent_id
        Value looked up in the ``patentkey`` column; the first matching row
        is used as the query document.
    similarity_matrix
        2-D array of pairwise scores where row/column ``i`` refers to the
        ``i``-th row (by position) of the patent table.
    matrix : str
        Label printed in front of each score (e.g. ``'Cosine Similarity'``).
    top_n : int or None, optional
        Print at most this many neighbours. ``None`` (default) prints every
        other patent, which is the original behaviour.
    data : DataFrame or None, optional
        Table with ``patentkey`` and ``text`` columns. Defaults to the
        notebook-level ``df``.

    Raises
    ------
    IndexError
        If no row has ``patentkey == patent_id``.
    """
    frame = df if data is None else data
    keys = frame['patentkey'].to_numpy()
    texts = frame['text'].to_numpy()

    # Resolve the query to a row *position*. The similarity matrix is
    # positional, so an index label is only correct for a default RangeIndex.
    matches = np.flatnonzero(keys == patent_id)
    if matches.size == 0:
        raise IndexError(f'patentkey {patent_id!r} not found')
    doc_id = int(matches[0])

    print(f'Text: {texts[doc_id]}')
    print('\n')
    print('Similar Text:')

    # Highest score first; the query document itself is left out.
    ranked = [ix for ix in np.argsort(similarity_matrix[doc_id])[::-1] if ix != doc_id]
    if top_n is not None:
        ranked = ranked[:top_n]

    for ix in ranked:
        print('\n')
        print(f'Patent Key: {keys[ix]}')
        print(f'Text: {texts[ix]}')
        print(f'{matrix} : {similarity_matrix[doc_id][ix]}')

In [15]:
# List every other patent ranked by cosine similarity to patent 4955066.
most_similar(
    patent_id=4955066,
    similarity_matrix=pairwise_similarities,
    matrix='Cosine Similarity',
)

Text: Compressing and decompressing text files  A method of compressing a text file in digital form is disclosed. A full text file having characters formed into phrases is provided by an author. The characters are digitally represented by bytes. A first pass compression is sequentially followed by a second pass compression of the text which has previously been compressed. A third or fourth level compression is serially performed on the previously compressed text. For example, in a first pass, the text is run-length compressed. In a second pass, the compressed text is further compressed with key phrase compression. In a third pass, the compressed text is further compressed with Huffman compression. The compressed text is stored in a text file having a Huffman decode tree, a key phrase table, and a topic index. The data is decompressed in a single pass and provided one line at a time as an output. Sequential compressing of the text minimizes the storage space required for the file. Decom