# Yoonhyuck Woo, Rishika Thorat / Purdue University_Computer and Information Technology
# Final Project Title: Movie Recommendation System
# Professor: Jin Wei-Kocsis, Ph.D.

- Reference: ***https://medium.com/@AMustafa4983/sentiment-analysis-on-imdb-movie-reviews-a-beginners-guide-d5136ec74e56***
-Reference: ***https://calvinfeng.gitbook.io/machine-learning-notebook/supervised-learning/recommender/neural_collaborative_filtering***

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow
import keras
import re
import nltk
import string
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Flatten, Dropout, Dense, Input, Dot
from tensorflow.keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
from nltk.stem.porter import *

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

SystemError: ignored

# IMDB DATASET

In [3]:
# Load Dataset
imdb_df = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv') # put your path here

In [4]:
imdb_df['sentiment']=imdb_df['sentiment'].replace({'positive':1, 'negative':0})

In [None]:
# check for bias
imdb_df['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

# Function1: Removing Stopwords.

In [5]:
def remove_stopwords(text):
    stop_words = stopwords.words('english')
    words = text.split()
    filtered_sentence = ''

    for word in words:
        if word not in stop_words:
            filtered_sentence = filtered_sentence + word + ' '
    return filtered_sentence

In [6]:
def normalize_text(text):
    text = text.lower()
    # get rid of urls
    text = re.sub('https?://\S+|www\.\S+', '', text)
    # get rid of non words and extra spaces
    text = re.sub('\\W', ' ', text)
    text = re.sub('\n', '', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('^ ', '', text)
    text = re.sub(' $', '', text)
    return text

In [7]:
def remove_punctuation(text):
    table = str.maketrans('','',string.punctuation)
    words = text.split()
    filtered_sentence = ''
    for word in words:
        word = word.translate(table)
        filtered_sentence = filtered_sentence + word + ' '
    return filtered_sentence

In [8]:
def stemming(text):
    ps = PorterStemmer()
    words = text.split()
    filtered_sentence = ''
    for word in words:
        word = ps.stem(word)
        filtered_sentence = filtered_sentence + word + ' '
    return filtered_sentence

In [9]:
def clean_text(text):
    text = text.lower()
    text = text.replace(',',' , ')
    text = text.replace('.',' . ')
    text = text.replace('/',' / ')
    text = text.replace('@',' @ ')
    text = text.replace('#',' # ')
    text = text.replace('?',' ? ')
    text = normalize_text(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = stemming(text)
    return text

In [10]:
with tf.device('/device:GPU:0'):

  for i in range (len(imdb_df["review"])):
    imdb_df["review"][i] = clean_text(imdb_df["review"][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df["review"][i] = clean_text(imdb_df["review"][i])


In [11]:
X = imdb_df["review"]
y = imdb_df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size=0.2,
                                                   random_state=42,
                                                   shuffle=True)

# Tokenization and Padding

In [12]:
# important properties
vocab_size = 10000
max_length = 50

trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

In [13]:
# Define tokenizer and fit on texts
tokenizer.fit_on_texts(X_train)

In [14]:
# Let's Tokenize and pad texts
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=max_length,
                         padding=padding_type,
                         truncating=trunc_type)
X_test = pad_sequences(X_test, maxlen=max_length,
                         padding=padding_type,
                         truncating=trunc_type)

In [15]:
def sentiment_analysis_model():
    model = tf.keras.Sequential()

    model.add(Embedding(vocab_size, 64, input_length=max_length))
    model.add(LSTM(64, return_sequences=True))
    model.add(Dropout(0.2))  # Add dropout regularization

    model.add(LSTM(32, return_sequences=True))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))  # Add dropout regularization

    model.add(LSTM(32))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))  # Add dropout regularization

    # model.add(Dense(2, activation='softmax'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


    return model

# Movie Recommendation model

In [16]:
def CF_model(input_data):
  data = input_data
  user_embed_dim = 32
  movie_embed_dim = 32

  # User and Movie Input Layers
  user_input = Input(shape=(1,), name='user_input')
  movie_input = Input(shape=(1,), name='movie_input')

  # User and Movie Embedding Layers
  user_embedding = Embedding(input_dim=data['userId'].max()+1, output_dim=user_embed_dim, input_length=1)(user_input)
  movie_embedding = Embedding(input_dim=data['movieId'].max()+1, output_dim=movie_embed_dim, input_length=1)(movie_input)

  # Flatten the Embedding Layers
  user_flat = Flatten()(user_embedding)
  movie_flat = Flatten()(movie_embedding)

  # Concatenate User and Movie Embeddings
  # concat = Concatenate()([user_flat, movie_flat])
  concat = Dot(axes=1)([user_flat, movie_flat])

  # Dense Layer
  dense1 = Dense(130, activation='relu')(concat)

  # Output Layer
  output = Dense(1)(dense1)

  # Model
  model = Model(inputs=[user_input, movie_input], outputs=output)
  model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

  return model

# Generate Recommendation function

In [17]:
# dataset
movies = pd.read_csv('drive/MyDrive/ml-25m/movies.csv')
ratings = pd.read_csv('drive/MyDrive/ml-25m/ratings.csv')
tags = pd.read_csv('drive/MyDrive/ml-25m/tags.csv')

data = pd.merge(ratings, movies, on='movieId')

In [18]:
def generate_recommendations(user_id, model, top_n=10):
    CF_model = model
    user_movies = data[data['userId'] == user_id]['movieId'].unique()
    unrated_movies = movies[~movies['movieId'].isin(user_movies)]['movieId'].unique()

    predictions = CF_model.predict([pd.Series([user_id] * len(unrated_movies)), unrated_movies])
    movie_ratings = pd.DataFrame({'movieId': unrated_movies, 'predicted_rating': predictions.flatten()})
    top_recommendations = movie_ratings.nlargest(top_n, 'predicted_rating')
    top_recommendations = pd.merge(top_recommendations, movies, on = 'movieId')

    # print(new_frame)
    return top_recommendations

# Predict sentiment

In [19]:
def sentiment_predict(new_sentence, model, saved_model):
  # new_sentence = re.sub(r'[^a-zA-Z ]', '', new_sentence)
  # new_sentence = okt.morphs(new_sentence, stem=True) # Tokenizer
  # new_sentence = [word for word in new_sentence if not word in stopwords]

  # Create a new model instance
  sentiment_analysis_model = model

  # Load the previously saved weights
  sentiment_analysis_model.load_weights(saved_model)

  new_sentence = clean_text(new_sentence)
  encoded = tokenizer.texts_to_sequences([new_sentence]) # Encoding

  encoded = np.array(encoded)
  pad_new = pad_sequences(encoded, maxlen=max_length,
                         padding=padding_type,
                         truncating=trunc_type)
  score = float(sentiment_analysis_model.predict(pad_new)) # Prediction
  sentiment = 0

  if(score > 0.5):
    print("{:.2f}% positive review.\n".format(score * 100))
    sentiment = 1 # positive
  else:
    print("{:.2f}% negative review.\n".format((1 - score) * 100))
    sentiment = 0 # negative
  return sentiment

In [21]:
sentiment_predict('''Surrounded by the warmth of sunshine and the laughter of loved ones''', new_model, saved_model)

NameError: ignored

# Call the saved weights

In [28]:
CF_saved_model = "/content/drive/MyDrive/CF_training/ckpt-030.ckpt"
SA_saved_weights = "/content/drive/MyDrive/sentiment_training/ckpt-005.ckpt"

In [29]:
MR_model = CF_model(data)
MR_model.load_weights(CF_saved_model)
sa_model = sentiment_analysis_model()

In [35]:
recommendations = generate_recommendations(21, MR_model)
print("\nTop Movie Recommendations:")
print(recommendations[['title','genres', 'predicted_rating']].to_string(index=False))


Top Movie Recommendations:
                                                 title                           genres  predicted_rating
                       The girl With the Sulfur (2013)                 Children|Fantasy          4.419818
                              Land of No Return (1978)               Adventure|Children          4.419811
                             Tender Loving Care (1974)                            Drama          4.419807
                                          Грачи (1983)               (no genres listed)          4.419806
                                    Flying Boys (2004)                            Drama          4.419798
                               Babes In Toyland (1987) Adventure|Children|Drama|Fantasy          4.419798
                                    The Victors (1963)                        Drama|War          4.419795
Kurara: The Dazzling Life of Hokusai's Daughter (2017)                            Drama          4.419795
                  

#  Re-Rating: Categorizing

In [31]:
genere_for_negative = ['Comedy', 'Action', 'Adventure', 'Drama', 'Fantasy','Sci-Fi']
genre_for_positive = ['Comedy', 'Horror', "Children's", 'Romance', 'Musical', 'Animation']
neutral_genres = ['Documentary', 'Film-Noir','Mystery','Thriller','War','Western', '(no genres listed)']
genere_for_negative_new_lst = genere_for_negative + neutral_genres
genre_for_positive_new_lst = genere_for_negative + neutral_genres

In [32]:
def re_rank(data, new_lst):
  for i in range (len(data["genres"])):
    temp = []
    temp = data["genres"][i].split("|")
    for j in range(len(temp)):
      if temp[j] in genere_for_negative_new_lst:
        temp[j] = 1
      else:
        temp[j] = 0
    genre_point = sum(temp)
    data["predicted_rating"][i] = data["predicted_rating"][i] + genre_point
    temp = []
    sorted_data = data.sort_values(by='predicted_rating', ascending=False)

  return print(sorted_data[['title','genres', 'predicted_rating']].to_string(index=False))

# Demo

In [33]:
answer = input("How is your day?: ")
response = sentiment_predict(answer,sa_model, SA_saved_weights)
if response == 0:
  re_rank(recommendations, genere_for_negative_new_lst)
  print(0)
if response == 1:
  re_rank(recommendations, genre_for_positive_new_lst)
  print(1)

How is your day?: I feel so bad
96.19% negative review.

                                                 title                           genres  predicted_rating
                               Babes In Toyland (1987) Adventure|Children|Drama|Fantasy          7.419798
                                    The Victors (1963)                        Drama|War          6.419795
                              Framed for Murder (2007)                 Mystery|Thriller          6.419793
                       The girl With the Sulfur (2013)                 Children|Fantasy          5.419818
                              Land of No Return (1978)               Adventure|Children          5.419811
                             Tender Loving Care (1974)                            Drama          5.419807
                                          Грачи (1983)               (no genres listed)          5.419806
                                    Flying Boys (2004)                            Drama        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["predicted_rating"][i] = data["predicted_rating"][i] + genre_point


In [36]:
answer = input("How is your day?: ")
response = sentiment_predict(answer,sa_model, SA_saved_weights)
if response == 0:
  print("The following movie lists for when you are feeling bad")

  re_rank(recommendations, genere_for_negative_new_lst)
  print(0)
if response == 1:
  print("The following movie lists for when you are feeling good!")
  re_rank(recommendations, genre_for_positive_new_lst)
  print(1)

How is your day?: Surrounded by the warmth of sunshine and the laughter of loved ones, every moment becomes a cherished memory.
67.94% positive review.

The following movie lists for when you are feeling good!
                                                 title                           genres  predicted_rating
                               Babes In Toyland (1987) Adventure|Children|Drama|Fantasy          7.419798
                                    The Victors (1963)                        Drama|War          6.419795
                              Framed for Murder (2007)                 Mystery|Thriller          6.419793
                       The girl With the Sulfur (2013)                 Children|Fantasy          5.419818
                              Land of No Return (1978)               Adventure|Children          5.419811
                             Tender Loving Care (1974)                            Drama          5.419807
                                          Грачи 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["predicted_rating"][i] = data["predicted_rating"][i] + genre_point


In [None]:
Surrounded by the warmth of sunshine and the laughter of loved ones, every moment becomes a cherished memory.