# Project on:-- Movies Recommendation Engine Using Of (streamlit)

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
pd.set_option('display.max_columns',None) # display all columns

In [3]:
df = pd.read_csv('MoviesTopRated.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,genre_ids,title,overview,popularity,release_date,vote_average,vote_count
0,0,238,"[18, 80]",The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",119.438,1972-03-14,8.7,18448
1,1,278,"[18, 80]",The Shawshank Redemption,Framed in the 1940s for the double murder of h...,90.415,1994-09-23,8.7,24376
2,2,240,"[18, 80]",The Godfather Part II,In the continuing saga of the Corleone crime f...,70.637,1974-12-20,8.6,11144
3,3,424,"[18, 36, 10752]",Schindler's List,The true story of how businessman Oskar Schind...,48.096,1993-12-15,8.6,14421
4,4,19404,"[35, 18, 10749]",Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",26.588,1995-10-20,8.6,4225


In [4]:
df['title']=df['title'].apply(lambda n: n.strip())

In [5]:
print(df.columns.tolist())

['Unnamed: 0', 'id', 'genre_ids', 'title', 'overview', 'popularity', 'release_date', 'vote_average', 'vote_count']


In [6]:
subset = df.select_dtypes('object').copy()
subset

Unnamed: 0,genre_ids,title,overview,release_date
0,"[18, 80]",The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14
1,"[18, 80]",The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23
2,"[18, 80]",The Godfather Part II,In the continuing saga of the Corleone crime f...,1974-12-20
3,"[18, 36, 10752]",Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15
4,"[35, 18, 10749]",Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20
...,...,...,...,...
9995,"[35, 12, 28]",OSS 117: From Africa with Love,"1981. Hubert Bonisseur de la Bath, aka OSS 117...",2021-08-04
9996,"[878, 18]",Z for Zachariah,"In the wake of a nuclear war, a young woman su...",2015-08-13
9997,"[35, 28]",First Sunday,Durell and LeeJohn are best friends and bumbli...,2008-01-11
9998,"[35, 80, 10749]",Mickey Blue Eyes,An English auctioneer proposes to the daughter...,1999-08-16


In [7]:
print(subset.columns.tolist())

['genre_ids', 'title', 'overview', 'release_date']


In [8]:
subset.dropna(inplace=True)

In [9]:
subset['text'] = subset['title'] + subset['overview']

In [10]:
#stopword
from nltk.corpus import stopwords
#tokenize
from nltk.tokenize import word_tokenize
#Lemmatize
from nltk.stem import PorterStemmer

In [11]:
def preprocessor(text):
    # tokenize
    tokens =word_tokenize(text)
    print(tokens)
    #remove stopward
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    print(tokens)
    #stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    print(tokens)
    return " ".join(tokens)

In [12]:
combined_features=subset['text']

In [13]:
df.shape

(10000, 9)

In [14]:
subset['text'] # out final data

0       The GodfatherSpanning the years 1945 to 1955, ...
1       The Shawshank RedemptionFramed in the 1940s fo...
2       The Godfather Part IIIn the continuing saga of...
3       Schindler's ListThe true story of how business...
4       Dilwale Dulhania Le JayengeRaj is a rich, care...
                              ...                        
9995    OSS 117: From Africa with Love1981. Hubert Bon...
9996    Z for ZachariahIn the wake of a nuclear war, a...
9997    First SundayDurell and LeeJohn are best friend...
9998    Mickey Blue EyesAn English auctioneer proposes...
9999    Jay and Silent Bob RebootJay and Silent Bob em...
Name: text, Length: 9998, dtype: object

In [15]:
movies=subset[:5000].copy

In [16]:
movies

<bound method NDFrame.copy of              genre_ids                        title  \
0             [18, 80]                The Godfather   
1             [18, 80]     The Shawshank Redemption   
2             [18, 80]        The Godfather Part II   
3      [18, 36, 10752]             Schindler's List   
4      [35, 18, 10749]  Dilwale Dulhania Le Jayenge   
...                ...                          ...   
4995          [35, 18]                Human Traffic   
4996              [18]                        Goal!   
4997  [80, 28, 35, 53]                   First Love   
4998     [28, 12, 878]                    Bumblebee   
4999     [878, 28, 12]                  Superman II   

                                               overview release_date  \
0     Spanning the years 1945 to 1955, a chronicle o...   1972-03-14   
1     Framed in the 1940s for the double murder of h...   1994-09-23   
2     In the continuing saga of the Corleone crime f...   1974-12-20   
3     The true story 

In [17]:
Vectorizer = TfidfVectorizer()

In [18]:
feature_vectors = Vectorizer.fit_transform(combined_features)

In [19]:
print(feature_vectors)

  (0, 27894)	0.12122155604004882
  (0, 4123)	0.15252441156911428
  (0, 5375)	0.1646186290647232
  (0, 19141)	0.204514021386624
  (0, 18528)	0.14857626460213225
  (0, 3342)	0.07927597962773425
  (0, 36770)	0.12905658590607644
  (0, 5531)	0.14283356349016327
  (0, 32677)	0.09862263996699665
  (0, 16774)	0.038960758155327194
  (0, 31616)	0.15842641462306387
  (0, 21510)	0.15399719333975884
  (0, 30896)	0.10011809412295603
  (0, 37059)	0.177824928683404
  (0, 19526)	0.06969201906005121
  (0, 15732)	0.08962187371984055
  (0, 23688)	0.05485407331850818
  (0, 2600)	0.12536358639129436
  (0, 1719)	0.05499982593666725
  (0, 32381)	0.1740800754475442
  (0, 3154)	0.16541520728442063
  (0, 35603)	0.21436417140755562
  (0, 24640)	0.18393022546847582
  (0, 23870)	0.17526535730535225
  (0, 36225)	0.06236706053863284
  :	:
  (9996, 33580)	0.09312661627607552
  (9996, 33167)	0.041717025404352806
  (9997, 27044)	0.28012024170252514
  (9997, 27045)	0.28012024170252514
  (9997, 4173)	0.26818603218492637
 

cosine simalarity

In [20]:
similarity = cosine_similarity(feature_vectors, feature_vectors, dense_output=True)

In [21]:
print(similarity)

[[1.         0.05094557 0.40363534 ... 0.03779929 0.05451975 0.00961163]
 [0.05094557 1.         0.03304135 ... 0.03748506 0.02256622 0.00813003]
 [0.40363534 0.03304135 1.         ... 0.03603607 0.01129935 0.03249246]
 ...
 [0.03779929 0.03748506 0.03603607 ... 1.         0.01539596 0.01941468]
 [0.05451975 0.02256622 0.01129935 ... 0.01539596 1.         0.00349155]
 [0.00961163 0.00813003 0.03249246 ... 0.01941468 0.00349155 1.        ]]


In [22]:
print(similarity.shape)

(9998, 9998)


In [23]:
df.columns

Index(['Unnamed: 0', 'id', 'genre_ids', 'title', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [24]:
def get_movie_loc(name):
    try:
        # print(name.title())
        return df[df['title'] == name.title() ].index[0]
    except Exception as e:
        print(f'Error {name} not found, {e}')
        return None

In [25]:
def recommend(movie, k=5):
    if df['title'].str.contains(movie).any():
        try:
            idx = get_movie_loc(movie)
            print(idx)
            if idx != -1:
                sim_scores = list(enumerate(similarity[idx]))
                sim_scores.sort(key=lambda i:i[1], reverse=True)
                movie_idxs = [i[0] for i in sim_scores]
                return df.iloc[movie_idxs]['title'].head(k).tolist()
            else:
                return None
        except Exception as e:
            print("Error+>",e)
            return None
    else:
        print('movie not found')
        return None

In [26]:
get_movie_loc('the godfather')

0

In [27]:
recommend('Iron Man')

1013


['Iron Man',
 'Iron Man 2',
 'Iron Man 3',
 'Battle for the Planet of the Apes',
 'Accident Man']