In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/FeatureCreation/Final

/content/drive/MyDrive/FeatureCreation/Final


In [None]:
ls

Goose.ipynb  Goose_SVD.ipynb  movies_metadata.csv  SVD.ipynb


## Importing Packages

In [1]:
import os
import gzip

import pandas as pd
import numpy as np
import re
import json
import sys
import warnings
warnings.filterwarnings('ignore')


In [36]:
import sys
if 'absl.logging' in sys.modules:
    import absl.logging
    absl.logging.set_stderrthreshold('info')
import tensorflow as tf
import tensorflow_hub as hub

## Read file
The CSV must include columns
**title, imdbid, text**

In [37]:
enterFilePath = r"s.csv"
table = pd.read_csv(enterFilePath, encoding='utf8', sep=',')
table[['title', 'imdb', 'text']].head()

Unnamed: 0,title,imdb,text
0,lagaan,1234,cricket sport indian freedom movement land tax...


## Extract data From Meta Data

In [38]:
def parse_genres(genres):
    result = []
    for genre in json.loads(genres.replace("'", '"')):
        result.append(genre["name"])
    return result

def load_text(table):
    df = pd.DataFrame(columns=['title','imdb',"description"])
    # print(tags_table.isnull().sum())
    table= table.dropna().reset_index(drop = True)
    # print(tags_table.isnull().sum()) 
    id2tags = {}
    for i, record in table.iterrows():
        print(i)
        # if i > 10:
          # break
        title, imdb, description = record[["title", "imdb", "text"]]
        # genres = parse_genres(genres)
        df = df.append({'title':str(title),'imdb':str(imdb),  "description":str(description)}, ignore_index=True)
    # print(df.isnull().sum())
    # df.to_csv('descriptions2.tsv',sep = '\t', index = False)
    return df




In [39]:
df = load_text(table)

0


## Create Features using Sentence Encoder

In [40]:
df_features = df.copy()
df_features['features'] = ''
df_features.head()

Unnamed: 0,title,imdb,description,features
0,lagaan,1234,cricket sport indian freedom movement land tax...,


In [41]:
with tf.compat.v1.Session() as session:
  embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
  session.run([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()])
  def process(parts):
      embeddings = embed(parts)
      arr = session.run(embeddings)
      return arr[0]
  try:
    for i in range(len(df_features)):
      print(i)
      parts= [df_features.loc[i,'description']]
      aa = process(parts)
      df_features.loc[i,'features'] = ' '.join(map(str, list(aa)))
  except TypeError:
    pass


0
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [42]:
df_features.head()

Unnamed: 0,title,imdb,description,features
0,lagaan,1234,cricket sport indian freedom movement land tax...,-0.027159233 0.014159851 -0.038807206 -0.01953...


In [43]:
# df_features.to_csv('descriptions2_features.tsv',sep = '\t', index = False)

In [44]:
len(df_features['features'][0].split())

512

## Singular Value Decomposition

In [45]:
BINARIZE = False
df = df_features.copy()
def separateData(df):
  data = df.loc[:, df.columns != 'features'].to_numpy().tolist()
  features = []
  for fr in df['features'].to_numpy():
    map_object = map(np.float32, fr.split())
    list_of_integers = list(map_object)
    features.append(list_of_integers)
  return data, np.stack(features)

In [46]:
data, embeddings = separateData(df)

In [47]:
SKIP_DIM = 0
REDUCED_DIM = SKIP_DIM + 64
sess = tf.compat.v1.InteractiveSession()
x_tf = tf.Variable(embeddings)
# x_tf.compat.v1.initializer.run()
values, U, V = tf.compat.v1.svd(embeddings, full_matrices = True)


In [48]:
embeddings2 = V[::,SKIP_DIM:REDUCED_DIM]

emb = embeddings2
emb -= np.mean(emb,axis=0)
emb /= (1e-3 + np.sqrt(np.mean((emb**2),axis=0)))
# print(sys.stderr, emb)
if BINARIZE:
    threshold = 2e-1
    undefined = (emb >= -threshold) & (emb <= threshold)
    emb[emb > threshold] = 1
    emb[emb < -threshold] = 0
    emb[undefined] = -1
    emb = emb.astype(np.int8)
else:
        emb /= np.sqrt(np.sum((emb**2),axis=-1, keepdims=True)) + 1e-12

In [49]:
df['SVD'] = ''
for i in range(0,len(df)):
  df.loc[i,'SVD'] = ' '.join(map(str, list(np.array(emb[i]))))

In [50]:
df.head()

Unnamed: 0,title,imdb,description,features,SVD
0,lagaan,1234,cricket sport indian freedom movement land tax...,-0.027159233 0.014159851 -0.038807206 -0.01953...,-0.067640275 0.034465294 -0.115180664 -0.06074...


In [51]:
df.to_csv('descriptions_SVD.tsv',sep = '\t', index = False)

In [52]:
df['SVD'].iloc[0]

'-0.067640275 0.034465294 -0.115180664 -0.060740463 -0.125774 -0.14757365 -0.16095863 0.06963148 -0.019060606 0.10506612 -0.22962448 0.10630844 0.10603525 0.10321333 0.0258085 -0.09359846 0.13161954 -0.035830427 0.15371335 -0.07904347 -0.04187411 -0.010074174 -0.024439605 0.21504495 0.11052481 -0.073268406 -0.091718405 -0.07135312 0.030857787 0.07374909 0.08005645 -0.32827273 0.22867933 -0.15939653 -0.11208272 -0.07344995 -0.21812606 -0.12319861 0.19374461 0.18486495 -0.0075398553 -0.0697871 0.021788476 -0.025078502 0.14890684 -0.17681347 -0.10804693 0.22667849 -0.2419077 0.0078961 -0.009286128 -0.22954193 -0.024239307 -0.049479034 0.05271752 -0.13889784 -0.060340162 0.16813481 -0.07836661 -0.10826304 -0.016796023 -0.09634014 0.04690184 0.051866155'