# Preprocess data for LSTM and GPT models

We have to add a special prompt to each example in order to train our text generation models to generate the next words in a sentence or article. The code below builds those prompts and subsets our data of 16,000+ books down to only 5,000 to speed up training.

In [None]:
import pandas as pd
# Column labels are passed explicitly through `names=` when reading the file.
book_columns = [
    'Wikipedia ID',
    'Freebase ID',
    'Title',
    'Author',
    'Publication Date',
    'Genres',
    'Summary',
]

# Read the tab-separated book summaries into a dataframe.
filename = 'data/booksummaries.txt'
df = pd.read_csv(filename, sep='\t', names=book_columns)

In [None]:
import json

# keep one row per Wikipedia ID, then drop rows missing genres or a summary
df = df.drop_duplicates(subset=['Wikipedia ID']).dropna(subset=['Genres', 'Summary'])


def _genre_values(raw_genres):
    """Return the values of the JSON object encoded in `raw_genres` as a list."""
    decoded = json.loads(str(raw_genres))
    return list(decoded.values())


# replace each JSON-encoded genre entry with a plain list of its values
df['Genres'] = df['Genres'].map(_genre_values)

In [None]:
def get_genres_str(genre_list):
    """Join genre names into a single comma-separated string.

    Args:
        genre_list: Iterable of genre name strings.

    Returns:
        The names separated by ', ' with no trailing separator, or an
        empty string when `genre_list` is empty.
    """
    # str.join replaces the repeated `+=` plus trailing-separator trim.
    return ', '.join(genre_list)

In [None]:
# Build one training string per book: prompt prefix + genres + summary.
df = df.reset_index()  # renumber rows from 0; the previous index is kept as a column

# create prefix which is the input prompt for our models
prefix = 'Generate a book summary with genres '

# Walk the two needed columns in lockstep instead of df.iterrows(),
# which constructs a full Series object for every row.
new_data = [
    prefix + get_genres_str(genres) + ':\n' + summary
    for genres, summary in zip(df['Genres'], df['Summary'])
]

# show the first generated example as a sanity check
print(new_data[0])

In [None]:
# wrap the prompt strings in a single-column ('Text') dataframe and preview it
with_prompt_df = pd.DataFrame({'Text': new_data})
with_prompt_df.head()

In [None]:
# write the first 5000 rows as a CSV stored inside a zip archive
compression_opts = {'method': 'zip', 'archive_name': '5000_booksummaries.csv'}
first_rows = with_prompt_df.head(5000)
first_rows.to_csv(
    'data/5000_booksummaries.zip',
    index=False,
    compression=compression_opts,
)

In [None]:
# use for models: do not run here! copy and paste into respective model files
import pandas as pd
# read the zipped 5000-row CSV back into a dataframe and preview five rows
filename = 'data/5000_booksummaries.zip'
df = pd.read_csv(filepath_or_buffer=filename)
df.head(n=5)