In [1]:
# import required libraries
from keras.preprocessing.text import text_to_word_sequence
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from nltk import pos_tag
import pandas as pd
import numpy as np
import scipy
import nltk

In [2]:
# load the dataset
df = pd.read_csv("DA-AI-ML-interview-assignment-Data.csv")
df

Unnamed: 0.1,Unnamed: 0,title,rating,name,num_ratings,num_reviews,num_followers,synopsis,genre
0,0,Sapiens: A Brief History of Humankind,4.39,Yuval Noah Harari,806229,46149,30.5k,"100,000 years ago, at least six human species ...",history
1,1,"Guns, Germs, and Steel: The Fates of Human Soc...",4.04,Jared Diamond,367056,12879,6538,"""Diamond has written a book of remarkable scop...",history
2,2,A People's History of the United States,4.07,Howard Zinn,224620,6509,2354,"In the book, Zinn presented a different side o...",history
3,3,"The Devil in the White City: Murder, Magic, an...",3.99,Erik Larson,613157,36644,64.2k,Author Erik Larson imbues the incredible event...,history
4,4,The Diary of a Young Girl,4.18,Anne Frank,3313033,35591,4621,Discovered in the attic in which she spent the...,history
...,...,...,...,...,...,...,...,...,...
1534,1534,Hounded,4.09,Kevin Hearne,83827,7203,11905,"Atticus O’Sullivan, last of the Druids, lives ...",fantasy
1535,1535,Charlie and the Chocolate Factory,4.15,Roald Dahl,775001,14252,22897,Charlie Bucket's wonderful adventure begins wh...,fantasy
1536,1536,Red Rising,4.25,Pierce Brown,310138,30388,30510,"""I live for the dream that my children will be...",fantasy
1537,1537,Frostbite,4.26,Richelle Mead,337538,12435,66448,"Rose loves Dimitri, Dimitri might love Tasha, ...",fantasy


In [3]:
print(df.shape)
print()
print(df.columns)
print()
print(df.genre.unique())
print()
print(df.info())
print()
print(df.describe())

(1539, 9)

Index(['Unnamed: 0', 'title', 'rating', 'name', 'num_ratings', 'num_reviews',
       'num_followers', 'synopsis', 'genre'],
      dtype='object')

['history' 'horror' 'psychology' 'romance' 'science' 'science_fiction'
 'sports' 'thriller' 'travel' 'fantasy']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1539 entries, 0 to 1538
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1539 non-null   int64  
 1   title          1539 non-null   object 
 2   rating         1539 non-null   float64
 3   name           1539 non-null   object 
 4   num_ratings    1539 non-null   object 
 5   num_reviews    1539 non-null   object 
 6   num_followers  1539 non-null   object 
 7   synopsis       1539 non-null   object 
 8   genre          1539 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 108.3+ KB
None

        Unnamed: 0       rating
count  1539.000000  1539.000000
mean 

In [4]:
def preprocessing(train_text):
       
    #word tokenization using text-to-word-sequence
    train_text= str(train_text)
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    tokenized_train_set = text_to_word_sequence(train_text,
                                                filters = filters,
                                                lower = True,
                                                split=" ")
    #stop word removal
    stop_words = set(stopwords.words('english'))
    stopwordremove = [i for i in tokenized_train_set if not i in stop_words]
        
    #join words into sentence
    stopwordremove_text = ' '.join(stopwordremove)
        
    #remove numbers
    numberremove_text = ''.join(c for c in stopwordremove_text if not c.isdigit())
    
    #--Stemming--
    stemmer= PorterStemmer()
    stem_input=nltk.word_tokenize(numberremove_text)
    stem_text=' '.join([stemmer.stem(word) for word in stem_input])
   
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    lem_input = word_tokenize(stem_text)
    lemmatizer = WordNetLemmatizer()
    lem_text= ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in lem_input])
    return lem_text

In [5]:
df["synopsis"] = df["synopsis"].apply(preprocessing)
synopsis = df["synopsis"]
genre = df['genre']
rating = df['rating']

# genre classification

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(synopsis,genre, test_size = 0.3, random_state = 60)

nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB())])

nb.fit(X_train,Y_train)

test_predict = nb.predict(X_test)

train_accuracy = nb.score(X_train,Y_train)
test_accuracy = accuracy_score(test_predict, Y_test)

print("Naive Bayes Train Accuracy Score : {}% ".format(train_accuracy))
print("Naive Bayes Test Accuracy Score  : {}% ".format(test_accuracy))

Naive Bayes Train Accuracy Score : 0.5236768802228412% 
Naive Bayes Test Accuracy Score  : 0.4458874458874459% 


# rating prediction

In [7]:
x_train, x_test, y_train, y_test = train_test_split(synopsis,rating, test_size = 0.3, random_state = 60)

nbo = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', KNeighborsRegressor())])

nbo.fit(x_train,y_train)

predict = nbo.predict(x_test)

train_acc = nbo.score(x_train,y_train)
test_acc = r2_score(y_test,predict)

print("KNN Regressor Train Accuracy Score : {}% ".format(train_acc))
print("KNN Regressor R2 value : {}% ".format(test_acc))

KNN Regressor Train Accuracy Score : 0.4204683108915669% 
KNN Regressor R2 value : 0.15165400182754607% 
