In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import nltk
# Fetch the NLTK data the preprocessing classes need: the English stop-word
# list and the Punkt models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# the pickled 'punkt' models; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Load the corpus from a CSV given by a relative path (resolved against the
# current working directory). Later cells read and rewrite its 'Plot' column.
df = pd.read_csv(r'preprocessed_dataset.csv')
# Disabled column drop, kept for reference only.
# df.drop(['Director', 'Cast', 'Wiki Page', 'Origin/Ethnicity','Genre','Title','Release Year'], axis=1, inplace=True)
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,Release Year,Title,Genre,Plot
0,0,1901,Kansas Saloon Smashers,unknown,bartend work saloon serv drink custom fill ste...
1,1,1901,Love by the Light of the Moon,unknown,moon paint smile face hang park night young co...
2,2,1901,The Martyred Presidents,unknown,film minut long compos two shot first girl sit...
3,3,1901,"Terrible Teddy, the Grizzly King",unknown,last six one second consist two shot first sho...
4,4,1902,Jack and the Beanstalk,unknown,earliest known adapt classic fairytal film sho...
...,...,...,...,...,...
9066,9066,1974,Nightmare Honeymoon,unknown,newlyw david jill webb dack rambo rebecca dian...
9067,9067,1974,The Nine Lives of Fritz the Cat,unknown,one nine seven zero fritz cat marri welfar chi...
9068,9068,1974,The Parallax View,unknown,tv newswoman lee carter wit assassin president...
9069,9069,1974,Phantom of the Paradise,unknown,stori follow music compos singer winslow leach...


In [None]:
class preproc:
    """Clean the ``Plot`` column of a DataFrame for TF-IDF indexing.

    Constructing the object runs :meth:`preprocess` on every value of
    ``df['Plot']`` and writes the cleaned strings back into that same
    DataFrame, which is also exposed as ``self.df``.
    """

    # Digit -> word table for convert_numbers, indexed by the digit's value.
    _DIGIT_WORDS = ("zero", "one", "two", "three", "four",
                    "five", "six", "seven", "eight", "nine")

    # Characters that remove_punctuation turns into a space (the comma is
    # deleted separately). The backslash is escaped explicitly: the bare "\]"
    # form is an invalid escape sequence that newer Pythons warn about.
    _SYMBOLS = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"

    # NLTK helpers are built on first use and cached on the class, so they are
    # not re-created for every row.
    _stop_words = None
    _stemmer = None

    def __init__(self, df):
        """Preprocess every plot in ``df['Plot']``, modifying ``df`` in place.

        Args:
            df: DataFrame with a ``Plot`` column of raw text.
        """
        self.df = df
        # Replace the column in one assignment so the frame's index labels are
        # irrelevant (using 0..n-1 as row labels fails on any other index).
        self.df['Plot'] = [self.preprocess(plot) for plot in self.df['Plot']]

    def _stop_word_set(self):
        """Return the English stop words as a set, loading them on first use."""
        if self._stop_words is None:
            type(self)._stop_words = frozenset(stopwords.words('english'))
        return self._stop_words

    def _porter_stemmer(self):
        """Return a PorterStemmer that is reused across calls."""
        if self._stemmer is None:
            type(self)._stemmer = PorterStemmer()
        return self._stemmer

    def remove_stop_words(self, data):
        """Tokenize ``str(data)`` and drop English stop words.

        Returns:
            NumPy string array holding the kept tokens joined by single spaces.
        """
        stop_words = self._stop_word_set()
        kept = [w for w in word_tokenize(str(data)) if w not in stop_words]
        return np.char.strip(" ".join(kept))

    def convert_to_lower_case(self, data):
        """Return ``data`` lower-cased as a NumPy string array."""
        return np.char.lower(data)

    def remove_punctuation(self, data):
        """Replace punctuation with spaces and delete commas.

        Runs of two spaces are collapsed after each symbol is replaced.
        """
        for symbol in self._SYMBOLS:
            data = np.char.replace(data, symbol, ' ')
            data = np.char.replace(data, "  ", " ")
        data = np.char.replace(data, ',', '')
        return data

    def remove_apostrophe(self, data):
        """Delete every apostrophe from ``data``."""
        return np.char.replace(data, "'", "")

    def remove_single_characters(self, data):
        """Tokenize ``str(data)`` and drop tokens that are one character long."""
        kept = [w for w in word_tokenize(str(data)) if len(w) > 1]
        return np.char.strip(" ".join(kept))

    def stemming(self, data):
        """Tokenize ``str(data)`` and Porter-stem every token."""
        stemmer = self._porter_stemmer()
        stems = [stemmer.stem(w) for w in word_tokenize(str(data))]
        return np.char.strip(" ".join(stems))

    def convert_numbers(self, data):
        """Spell out each digit as a space-padded English word (``1`` -> `` one ``)."""
        for digit, word in enumerate(self._DIGIT_WORDS):
            data = np.char.replace(data, str(digit), " " + word + " ")
        return data

    def preprocess(self, data):
        """Run the full cleaning pipeline on one document.

        Args:
            data: Raw text. ``None`` and float NaN (how pandas represents an
                empty CSV cell) are treated as an empty document.

        Returns:
            The cleaned text as a plain ``str``.
        """
        if data is None or (isinstance(data, float) and np.isnan(data)):
            return ""
        data = self.convert_to_lower_case(data)
        data = self.convert_numbers(data)
        data = self.remove_punctuation(data)  # commas are removed separately in there
        data = self.remove_stop_words(data)
        data = self.remove_apostrophe(data)
        data = self.remove_single_characters(data)
        data = self.stemming(data)
        return str(data)


# Run the cleaning pipeline over every 'Plot' value. preproc writes the results
# back into the frame it is given, so p.df is that same (now modified) frame.
# NOTE(review): the file is named preprocessed_dataset.csv and the Plot values
# displayed after loading already look stemmed - confirm that running the
# pipeline a second time here is intended.
p = preproc(df)
df = p.df

LookupError: ignored

In [None]:
class tf_idf:
    """Build a TF-IDF representation of one text column of a DataFrame."""

    def __init__(self, df):
        """Remember the DataFrame whose columns will be vectorized."""
        self.df = df

    def create_tfidf_features(self, col, max_features=5000, max_df=0.95, min_df=2, verbose=True):
        """Fit a TF-IDF vectorizer on ``self.df[col]`` using sklearn.

        Args:
            col: Name of the text column to vectorize.
            max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
            max_df: Forwarded to ``TfidfVectorizer(max_df=...)``.
            min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
            verbose: When true (the default), print the feature names, their
                count, the matrix and a completion message.

        Returns:
            Tuple ``(X, vectorizer)``: the result of ``fit_transform`` and the
            fitted vectorizer.
        """
        # Use the frame this instance was built with, not a notebook-level `df`.
        corpus = self.df[col].tolist()
        tfidf_vectorizor = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word',
                                           stop_words='english', ngram_range=(1, 1), max_features=max_features,
                                           norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
                                           max_df=max_df, min_df=min_df)

        X = tfidf_vectorizor.fit_transform(corpus)
        if verbose:
            arr = tfidf_vectorizor.get_feature_names_out()
            print(arr)
            print(len(arr))
            print(X)
            print('tfidf matrix successfully created.')

        return X, tfidf_vectorizor


# Fit the TF-IDF model on the 'Plot' column. Both results stay at notebook
# level: query.query_similarity_ranked_docs reads `x` and `vectorizor` as globals.
t = tf_idf(df)
x, vectorizor = t.create_tfidf_features('Plot')

['abandon' 'abbi' 'abbott' ... 'zero' 'zombi' 'zoo']
5000
  (0, 2617)	0.062284020395352906
  (0, 1573)	0.17265324547643557
  (0, 3212)	0.08190593767165132
  (0, 201)	0.09716063362584261
  (0, 3445)	0.1750743300522388
  (0, 1627)	0.10305195539707956
  (0, 4867)	0.11898553994540288
  (0, 4261)	0.20765827165699868
  (0, 3708)	0.1924786393577195
  (0, 719)	0.1473057851333071
  (0, 548)	0.08728938447860295
  (0, 2956)	0.17265324547643557
  (0, 4175)	0.16112285501157836
  (0, 341)	0.12600713279405368
  (0, 4966)	0.1606678540272572
  (0, 402)	0.07898446971133972
  (0, 2003)	0.18346597080434443
  (0, 2098)	0.08678765171067274
  (0, 1404)	0.16112285501157836
  (0, 1625)	0.12161897613947487
  (0, 2082)	0.15869930621616282
  (0, 3578)	0.11128183331852454
  (0, 250)	0.15380910220796712
  (0, 2324)	0.11346224130014844
  (0, 618)	0.15786305235473239
  :	:
  (9070, 2684)	0.14316031053296643
  (9070, 2364)	0.09190833515187184
  (9070, 2946)	0.10207772979093505
  (9070, 3344)	0.13804075231429475
  (907

In [None]:
class query_preproc:
  def __init__(self, query):
    self.query = query
    self.query = str(self.preprocess(self.query))

  def remove_stop_words(self, data):
      stop_words = stopwords.words('english')
      words = word_tokenize(str(data))
      new_text = ""
      for w in words:
          if w not in stop_words:
              new_text = new_text + " " + w
      return np.char.strip(new_text)

  def convert_to_lower_case(self, data):
      for i in data:
        i = i.lower()
      return np.char.lower(data)

  def remove_punctuation(self, data):
      symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
      for i in range(len(symbols)):
          data = np.char.replace(data, symbols[i], ' ')
          data = np.char.replace(data, "  ", " ")
      data = np.char.replace(data, ',', '')
      return data

  def remove_apostrophe(self, data):
      return np.char.replace(data, "'", "")

  def remove_single_characters(self, data):
      words = word_tokenize(str(data))
      new_text = ""
      for w in words:
          if len(w) > 1:
              new_text = new_text + " " + w
      return np.char.strip(new_text)

  def stemming(self, data):
      stemmer= PorterStemmer()
      
      tokens = word_tokenize(str(data))
      new_text = ""
      for w in tokens:
          new_text = new_text + " " + stemmer.stem(w)
      return np.char.strip(new_text)

  def convert_numbers(self, data):
      data = np.char.replace(data, "0", " zero ")
      data = np.char.replace(data, "1", " one ")
      data = np.char.replace(data, "2", " two ")
      data = np.char.replace(data, "3", " three ")
      data = np.char.replace(data, "4", " four ")
      data = np.char.replace(data, "5", " five ")
      data = np.char.replace(data, "6", " six ")
      data = np.char.replace(data, "7", " seven ")
      data = np.char.replace(data, "8", " eight ")
      data = np.char.replace(data, "9", " nine ")
      return data

  def preprocess(self, data):     
      data = self.convert_to_lower_case(data)
      data = self.convert_numbers(data)
      data = self.remove_punctuation(data) #remove comma seperately
      data = self.remove_stop_words(data)
      data = self.remove_apostrophe(data)
      data = self.remove_single_characters(data)
      data = self.stemming(data)
      return str(data)

In [None]:
class query:
    """Rank the indexed documents against free-text queries by cosine similarity."""

    def __init__(self):
        pass

    def calculate_similarity(self, X, vectorizor, query, top_k=10):
        """Score every document in ``X`` against ``query``.

        Args:
            X: Document-term matrix the documents were vectorized into.
            vectorizor: Fitted vectorizer used to build ``X``.
            query: Raw query text; it is cleaned with ``query_preproc`` first.
            top_k: Number of best-matching document indices to return.

        Returns:
            Tuple ``(most_similar_doc_indices, cosine_similarities)``: the
            ``top_k`` row indices of ``X`` ordered from most to least similar,
            and the similarity of every row of ``X`` to the query.
        """
        cleaned_query = query_preproc(query).query

        # transform() expects an iterable of documents; scikit-learn rejects a
        # bare string, so wrap the single query in a list.
        query_vec = vectorizor.transform([cleaned_query])
        # Compute the cosine similarity between query_vec and all the documents
        cosine_similarities = cosine_similarity(X, query_vec).flatten()
        # argsort is ascending; the reversed slice takes the top_k largest, best first
        most_similar_doc_indices = np.argsort(cosine_similarities, axis=0)[:-top_k-1:-1]
        return (most_similar_doc_indices, cosine_similarities)

    def show_similar_documents(self, df, cosine_similarities, similar_doc_indices):
        """Print rank, similarity and the DataFrame row of each listed document.

        Args:
            df: DataFrame whose rows are addressed positionally via ``iloc``.
            cosine_similarities: Similarity score per row position.
            similar_doc_indices: Row positions to print, best match first.
        """
        for rank, index in enumerate(similar_doc_indices, start=1):
            print("Index", index)
            print('Top-{}, Similarity = {}'.format(rank, cosine_similarities[index]))
            print(df.iloc[index, :])
            print()

    def query_similarity_ranked_docs(self, queries, X=None, vectorizer=None):
        """Rank all documents for each query, omitting zero-similarity ones.

        Args:
            queries: Iterable of raw query strings.
            X: Document-term matrix; defaults to the notebook-level ``x``.
            vectorizer: Fitted vectorizer; defaults to the notebook-level
                ``vectorizor``.

        Returns:
            Dict mapping each query string to a dict of
            ``{row index in X: cosine similarity}``, inserted from most to
            least similar.
        """
        if X is None:
            X = x
        if vectorizer is None:
            vectorizer = vectorizor

        output = {}
        for query_text in queries:
            # Ask for every row of X so the ranking covers all documents.
            docs, simi = self.calculate_similarity(X, vectorizer, query_text, top_k=X.shape[0])
            # Build the result dict up front so the best hit is recorded too
            # (creating it lazily inside the loop used up the first document).
            output[query_text] = {int(i): float(simi[i]) for i in docs if simi[i] != 0}
        return output

# Rank the indexed documents against a handful of sample queries.
q = query()
# Disabled single-query experiment, kept for reference only.
# docs , simi = q.calculate_similarity(x, vectorizor, query1, top_k = len(df))
# print(docs)
queries = ["haunted place","funny comedy", "superhero","saves people","Alice follows rabbit hole"]
# Maps each query string to {row index: cosine similarity}; rows whose
# similarity is zero are left out.
output = q.query_similarity_ranked_docs(queries)
# Print the frame row at position 5 in full.
print(df.iloc[5,:])
# Bare expression so the notebook displays the ranking dictionary.
output

AttributeError: ignored