# Movie Genre Prediction

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import pickle

In [2]:
df=pd.read_csv("kaggle_movie_train.csv")

In [3]:
df.head()

Unnamed: 0,id,text,genre
0,0,"eady dead, maybe even wishing he was. INT. 2ND...",thriller
1,2,"t, summa cum laude and all. And I'm about to l...",comedy
2,3,"up Come, I have a surprise.... She takes him ...",drama
3,4,ded by the two detectives. INT. JEFF'S APARTME...,thriller
4,5,"nd dismounts, just as the other children reach...",drama


In [4]:
df["genre"].nunique()

9

In [5]:
df["genre"].unique()

array(['thriller', 'comedy', 'drama', 'action', 'sci-fi', 'other',
       'romance', 'horror', 'adventure'], dtype=object)

In [6]:
df=df[df["genre"]!='other']

In [7]:
df.drop(columns=['id'],inplace=True)

In [8]:
df.shape

(22309, 2)

In [9]:
# Cleaning the text present in the feature column

def text_cleaned(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z]'," ",text)
    text = re.sub(r'\\n'," ",text)
    text = re.sub(r'\W'," ",text)
    text = re.sub(r'https\s+|www.\s+'," ",text)
    text = re.sub(r'http\s+|www.\s+'," ",text)
    text = re.sub(r'\s+[a-zA-Z]\s+'," ",text)
    text = re.sub(r'\^[a-zA-Z]\s+'," ",text)
    text = re.sub(r"won\'t", "will not",text)
    text = re.sub(r'can\'t', "can not",text)
    text = re.sub(r'don\'t', "do not",text)
    text = re.sub(r'dont', "do not",text)
    text = re.sub(r'n\’t', " not",text)
    text = re.sub(r'n\'t', " not",text)
    text = re.sub(r'\'re', " are",text)
    text = re.sub(r'\'s', " is",text)
    text = re.sub(r'\’d', " would",text)
    text = re.sub(r'\d', " would",text)
    text = re.sub(r'\'ll', " will",text)
    text = re.sub(r'\'t', " not",text)                  
    text = re.sub(r'\'ve', " have",text)
    text = re.sub(r'\'m', " am",text)
    text = re.sub(r'\n', "",text)
    text = re.sub(r'\r', "",text)
    text = re.sub(r'[0-9]', "digit",text)
    text = re.sub(r"\'", "",text)
    text = re.sub(r"\"", "",text)
    text = re.sub(r'[?|!|\'|"|#]',r'',text)
    text = re.sub(r'[.|,|)|(|\|/]',r' ',text)    
    text = re.sub(r'\s+', ' ',text)
    return text

df["text"] = df["text"].apply(text_cleaned)

In [10]:
df.head()

Unnamed: 0,text,genre
0,eady dead maybe even wishing he was int nd flo...,thriller
1,t summa cum laude and all and m about to launc...,comedy
2,up come have surprise she takes him by the ha...,drama
3,ded by the two detectives int jeff apartment n...,thriller
4,nd dismounts just as the other children reach ...,drama


In [11]:
genre_dict={'thriller':1, 'comedy':2, 'drama':3, 'action':4, 'sci-fi':5,
       'romance':6, 'horror':7, 'adventure':8}

In [12]:
df['genre']=df['genre'].map(genre_dict)

In [13]:
X = df.drop(labels = ["genre"],axis =1)
y = df["genre"].values

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score , f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [15]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [16]:
tf1 = TfidfVectorizer(ngram_range=(1,2),stop_words = 'english' , token_pattern = r'\w{3,}')

tfidf_train_text = tf1.fit_transform(X_train["text"])
tfidf_test_text = tf1.transform(X_test["text"])

In [17]:
# Generating pickle file for tf-idf
pickle.dump(tf1, open('tfidf-transform.pkl', 'wb'))

In [18]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC

In [19]:
best_params = {'alpha': 0.5}
nb = MultinomialNB(**best_params)
nb.fit(tfidf_train_text, y_train)


MultinomialNB(alpha=0.5)

In [20]:
# Creating a pickle file for the Multinomial Naive Bayes model
filename = 'movie-genre-mnb-model.pkl'
pickle.dump(nb, open(filename, 'wb'))