# Imports and Loading Data

In [None]:
#Importing libraries.

import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt 
import seaborn as sns
import re
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [None]:
bookDataset=pd.read_csv('data/bookDataset.csv')

#Getting only those columns that seems necessary
bookDataset=pd.DataFrame(bookDataset,columns=['genre','summary'])

print("Raw dataset")
bookDataset

In [None]:
bookDataset.groupby('genre').count()

# Data Preprocessing

**Step 1: Cleaning out inconsistent data**

In [None]:
def clean(text):
    if pd.isnull(text):  # Check if the text is NaN
        return ''
    text = re.sub("\'", "", text) 
    text = re.sub("[^a-zA-Z]"," ",text) 
    text = ' '.join(text.split()) 
    text = text.lower() 
    
    return text

bookDataset.loc[:,'summary']=bookDataset.loc[:,'summary'].apply(lambda x: clean(x))

print("Cleaning out any character which is not an alphabet and converting all text to lowercase \n\n")
bookDataset['summary']


**Step 2: Removing stop Words**

Stop words such as "the", "a", "an" are assumed to have no impact on the over all classification process.


In [None]:
# For mac user, use : others need to uncomment the following lines
#nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def removestopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

bookDataset['summary'] = bookDataset['summary'].apply(lambda x: removestopwords(x))

print("Removing Stop Words \n\n")
bookDataset['summary']


**Step 3: Lemmatization of summary**

Grouping of the different versions of the same word into one.

In [None]:
# For mac user, use : others need to uncomment the following lines
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemma=WordNetLemmatizer()

def lematizing(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = lemma.lemmatize(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

bookDataset['summary'] = bookDataset['summary'].apply(lambda x: lematizing(x))

print("After Lematization \n\n")
bookDataset['summary']

**Step 4: Stemming**

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence


bookDataset['summary'] = bookDataset['summary'].apply(lambda x: stemming(x))
print("After Stemming \n\n")
bookDataset['summary']

# Predicting Genre

In [None]:
#Labeling each 'genre' with an unique number 

from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

y=LE.fit_transform(bookDataset['genre'])

In [None]:
#perform inverse mapping on the unique numbers representing a genre.

LE.inverse_transform([0,1,2,3,4,5])

**Fitting the model**

In [None]:
#test_size 0.2 indicates that only 20% of the dataset is used for training while 80% will be used for testing
xtrain, xval, ytrain, yval = train_test_split(bookDataset['summary'], y, test_size=0.15, random_state=246) 

In [None]:
#Performing tf-idf 

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain.values.astype('U'))

xval_tfidf = tfidf_vectorizer.transform(xval.values.astype('U'))

**Using Logistic Regression**

In [None]:
start=datetime.now()

from sklearn.linear_model import LogisticRegression

# Binary Relevance.
from sklearn.multiclass import OneVsRestClassifier

# Performance metric.
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

# fit model on train data.
clf.fit(xtrain_tfidf, ytrain)

# make predictions for validation set.
y_pred_lr = clf.predict(xval_tfidf)


from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
  

#Calculating the accuracy.
print( 'Accuracy Score :',accuracy_score(yval,y_pred_lr) )

#Printing the classification report.
print ('Report : ')
print(classification_report(yval,y_pred_lr))

print("Executed in ",datetime.now()-start)

**Using SVM**

In [None]:
start=datetime.now()

from sklearn import svm

svc = svm.SVC(kernel='rbf',gamma=1).fit(xtrain_tfidf,ytrain)

svpred=svc.predict(xval_tfidf)

from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
  


print( 'Accuracy Score :',accuracy_score(yval,svpred) )
print ('Report : ')
print(classification_report(yval,svpred))

print("Executed in ",datetime.now()-start)

# Creating Genre from text

In [None]:
def predictGenre(q):
    q = clean(q)
    q = removestopwords(q)
    q = lematizing(q)
    q = stemming(q)
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = svc.predict(q_vec)
    return LE.inverse_transform(q_pred)[0]

In [43]:
#To read txt file, use read_fwf, to read csv use read_csv 
# rawDataSet =pd.read_csv('data/video_games.csv')
rawDataSet = pd.read_fwf('data/TinyStoriesV3-GPT4-valid.txt')

#We only need the first column, so get that and rename it for easier use
rawDataSet = rawDataSet.iloc[:, :1]
columnName = 'story' #Change the value as required 
# Since this data set did not have any column, it was assigned. Else you can use the existing column name
rawDataSet.columns = [columnName]

rawDataSubSet = rawDataSet.head(5000) #Can remove this later, there are 2 Million rows but just taking 5k from it for testing purpose
rawDataSet = rawDataSubSet[columnName].copy()

rawdf =pd.DataFrame(rawDataSet,columns=[columnName])
print (rawdf)

                                                  story
0     Once upon a time, in a warm and sunny place, t...
1     Tom and Lily were playing with their toys in t...
2     Once upon a time there was a little girl named...
3     One morning, a cat named Tom woke up. He felt ...
4     Lily and Tom were twins who liked to decorate ...
...                                                 ...
4995  Once upon a time, there was a girl named Mia. ...
4996  Once upon a time, there was a furry tiger. The...
4997  One day, a humble cat named Tom went for a wal...
4998  Once upon a time, there was a little girl name...
4999  Lily loved to play in the garden with her mom....

[5000 rows x 1 columns]


In [44]:

total_stories = len(rawdf)
print(f"Total stories to process: {total_stories}")

rawdf['genre'] = rawdf[columnName].apply(predictGenre)

# for index, row in rawdf.iterrows():
#     story = row[columnName]
#     genre = predictGenre(story)
#     logging.info(f"Processed story {index + 1}/{total_stories} - Predicted genre: {genre}")

#Make a new column for the dataframe


Total stories to process: 5000


In [45]:
# Need to know this genre so that it can be used in the generation code
unique_genres = rawdf['genre'].unique()
print(unique_genres)

# Train = ['Fantasy' 'Thriller' 'Historical novel' 'Horror' 'Crime Fiction''Science Fiction']
# Valid = ['Horror' 'Fantasy' 'Historical novel' 'Crime Fiction' 'Science Fiction', 'Thriller']

['Horror' 'Fantasy' 'Historical novel' 'Crime Fiction' 'Science Fiction'
 'Thriller']


In [42]:
rawdf.to_csv('storyValid.txt', index=False)

with open('storyTrain.txt', 'w') as file:
    for index, row in rawdf.iterrows():
        formatted_story = f"<BOS> <{row['genre']}> {row['story']} <EOS>\n"
        file.write(formatted_story)