In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import pickle

In [2]:
# Load the cleaned genre/synopsis table and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index -- consider index_col=0 if nothing downstream relies on it.
cleaned_csv_path = "Data/cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,Title,Genre,Synopsis
0,John Wick: Chapter 2,Action,['after returning to the criminal underworld t...
1,John Wick: Chapter 2,Crime,['after returning to the criminal underworld t...
2,John Wick: Chapter 2,Thriller,['after returning to the criminal underworld t...
3,FBI: Most Wanted,Action,['it follows the division of the fbi tasked wi...
4,FBI: Most Wanted,Crime,['it follows the division of the fbi tasked wi...


In [3]:
# Hold out 20% of the rows for evaluation; random_state is pinned so the
# split is reproducible across runs.
# NOTE(review): the preview of df shows the same title/synopsis repeated once
# per genre, so an identical synopsis can presumably land in both the train
# and test sets with different labels -- confirm this is intended.
synopsis_texts = df['Synopsis'].values
genre_labels = df['Genre'].values
x_train, x_test, y_train, y_test = train_test_split(
    synopsis_texts,
    genre_labels,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Turn the raw synopsis strings into TF-IDF feature matrices.
# The vocabulary/IDF weights are learned from the training texts only; the
# test texts are projected with that already-fitted vectorizer.
# NOTE(review): any pickled model needs this fitted `vectorizer` to score new
# text -- confirm it is persisted somewhere alongside the models.
vectorizer = TfidfVectorizer()
f = vectorizer.fit_transform(raw_documents=x_train)   # training features
ft = vectorizer.transform(raw_documents=x_test)       # test features

In [5]:
# Train a support-vector classifier on the TF-IDF training matrix and save it.
# The estimator uses the library defaults; the original cell listed them as
# C=1.0, kernel='rbf', gamma='scale', probability=False.
svm_model = SVC()
svm_model.fit(f, y_train)

# Write the pickle inside a context manager so the file handle is flushed and
# closed deterministically, even if serialisation raises part-way through.
filename = 'Models/model_svm.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

In [6]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix
# and save it. The estimator uses the library defaults; the original cell
# listed them as alpha=1.0, class_prior=None, fit_prior=True.
mnb_model = MultinomialNB()
mnb_model.fit(f, y_train)

# Write the pickle inside a context manager so the file handle is flushed and
# closed deterministically, even if serialisation raises part-way through.
filename = 'Models/model_mnb.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(mnb_model, model_file)

In [7]:
# Hyperparameter tuning the Naive Bayes Classifier
# Sweeps the smoothing parameter alpha from 0.1 to 1.0 in steps of 0.1, fits a
# fresh MultinomialNB for each value, and remembers the alpha with the highest
# accuracy. `best_accuracy` and `alpha_val` hold the winner after the loop.
# NOTE(review): candidates are scored on (ft, y_test), the same held-out split
# used for evaluation, so the reported best accuracy is likely optimistic --
# consider a separate validation split or cross-validation on the train set.
best_accuracy, alpha_val = 0.0, 0.0
for alpha in np.arange(0.1,1.1,0.1):
    candidate = MultinomialNB(alpha=alpha).fit(f, y_train)
    score = accuracy_score(y_test, candidate.predict(ft))
    print("Accuracy score for alpha={} is: {}%".format(round(alpha,1), round(score*100,2)))
    # Strict '>' means the earliest alpha wins when several tie for best.
    if score>best_accuracy:
        best_accuracy, alpha_val = score, alpha
print('--------------------------------------------')
print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for alpha=0.1 is: 10.08%
Accuracy score for alpha=0.2 is: 14.19%
Accuracy score for alpha=0.3 is: 16.86%
Accuracy score for alpha=0.4 is: 19.07%
Accuracy score for alpha=0.5 is: 20.27%
Accuracy score for alpha=0.6 is: 21.29%
Accuracy score for alpha=0.7 is: 21.69%
Accuracy score for alpha=0.8 is: 22.07%
Accuracy score for alpha=0.9 is: 22.31%
Accuracy score for alpha=1.0 is: 22.53%
--------------------------------------------
The best accuracy is 22.53% with alpha value as 1.0
