# DATA PREPROCESSING

In [None]:
import numpy as np
import pandas as pd

In [None]:
movie = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
movie.head(1)  #using empty brackets will gives us the entire dataset

In [None]:
credit.head(1)

In [None]:
credit.head(1)['cast'].values

In [None]:
credit.head(1)['crew'].values

In [None]:
movie.merge(credit,on='title')

In [None]:
movie.shape

In [None]:
credit.shape

# Reassign the movie variable so that we know only have the merged dataset

In [None]:
movie = movie.merge(credit,on='title')

In [None]:
movie.shape

In [None]:
movie.head(1)

## For a content based recommender system we basically need to create tags and now we will look which coloumns would help us in creating tags and we will omit the remaining coloumns

### important coloumns for a recommeder system (coloumns with more numerical values can be overlooked)
### - genres
### -id (for fetching movie posters at the end of this project)
### -keywords
### -title only (not original_title)
### -overview
### -cast
### -crew

In [None]:
movie = movie[['movie_id','title','overview','genres','keywords','cast','crew']]

In [None]:
movie['original_language'].value_counts() #mostly movies are in english so we didn't need this coloumn

In [None]:
movie.info()   # helps you truncate the view

In [None]:
movie.head()

### now from this data frame we will create a new data frame that has movie title, movie id and movie tags . To make the tags coloumn we need to combine overview , genres , keywords , cast , crew . For example take an overview of the movie and add the genre terms to it and similarly adding keywords the genre and overview . Take only top 3 cast . In crew we will add only the name of the director to make the tags coloumns . This would give us a paragrapgh we need to create these paragraphs. Obviously some data is in a weird format we will do some reformatting and remove duplicates and clean the data.

In [None]:
movie.isnull().sum()  #looking for missing data

In [None]:
movie.dropna(inplace=True) #remove the rows with missing data

In [None]:
movie.isnull().sum()

In [None]:
movie.duplicated().sum()    #looking for duplicate data

### Changing data formats

In [None]:
movie.iloc[0].genres

### Create a helper funtion to extract genres only

In [None]:
import ast  #for converting the genre string to a list
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')


In [None]:
def convert(obj):
    L=[]              #For appending the names in a list
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L
    

In [None]:
movie['genres'].apply(convert)

In [None]:
movie['genres'] = movie['genres'].apply(convert)

In [None]:
movie.head(1)

In [None]:
movie['genres']

In [None]:
movie['keywords'].apply(convert)

In [None]:
movie['keywords'] = movie['keywords'].apply(convert)

In [None]:
movie.head()

### Now for Cast and Crew we would make a similar function but we need only top 3 cast members

In [None]:
movie['cast'][0] #first movie cast

In [None]:
def convert3(obj):
    L=[]              #For appending the names in a list
    counter = 0       #for only top 3 cast
    for i in ast.literal_eval(obj):
        if counter != 3:
           L.append(i['name'])
           counter +=1
        else:
             break
        
    return L
    

In [None]:
movie['cast'].apply(convert3)

In [None]:
movie['cast'] = movie['cast'].apply(convert3)

In [None]:
movie.head()

In [None]:
movie['crew'][0]

### Make another similar function for fetching the director we need to extract only the part where 'job' is director

In [None]:
def fetch_director(obj):
    L=[]              #For appending the names in a list
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
           L.append(i['name'])
           break
    return L
    

In [None]:
movie['crew'].apply(fetch_director)

In [None]:
movie['crew'] = movie['crew'].apply(fetch_director)

In [None]:
movie.head()

In [None]:
movie['overview'][0]

In [None]:
movie['overview'].apply(lambda x:x.split()) #converting the overview string into a list

In [None]:
movie['overview'] = movie['overview'].apply(lambda x:x.split())

In [None]:
movie.head()

### Now we will combine these lists or append these lists and then we will get our desired paragraphs after converting it back into the string format
### We need to also remove spaces between the words that is bcz for eg if we want to watch movies with Sam Worthington in it we will need to create separate tags for both words but their can be more sam's and recommendation can go wrong. So we will convert 'Sam Worthington' into 'SamWorthington'

In [None]:
movie['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
movie['genres'] = movie['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
movie['keywords'] = movie['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movie['cast'] = movie['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movie['crew'] = movie['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
movie.head()

### Now we create a column in the movie dataset named as "tags" which will be the concatenation of the desired coloumns

In [None]:
movie['tags'] = movie['overview'] + movie['genres'] + movie['keywords'] + movie['cast'] + movie['crew']  #creating tags coloumn

In [None]:
movie.head()

In [None]:
movie['tags'][0]

## Now we dont need other coloumns so we can remove them

In [None]:
new_df = movie[['movie_id','title','tags']]

In [None]:
new_df

### Now Converting the list back to a string

In [None]:
new_df['tags'].apply(lambda x:" ".join(x))  #Joining them with spaces

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

In [None]:
new_df['tags'][0]

In [None]:
new_df

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower()) #changing into lower case

In [None]:
new_df.head()

In [233]:
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


# VECTORIZATION

# Our goal is to make a website where the user will enter a single movie name and we will have to tell atleast 5 similar movies

# To know which movies are similar we need to find the similarities between the movie tags but we cannot use any mathematical formulas since they are strings so we use vectorization

# Every Tag would need to be converted into a vector in the vector space there would be 5000 vectors so we will need to choose the closest vectors to each other in the X-Y Plane
## so basically we need to convert text into vectors known as TEXT VECTORIZATION
## Technique that we would be using is "bag of words" , other techniques are tfidf and word2vec

In [None]:
pip install scikit-learn

In [None]:
import sklearn

### The number 5000 words are the most occuring and recurring 5000 words in the movie tags and 4806 total movies

In [None]:
from sklearn.feature_extraction.text import CountVectorizer #Used for text vectorization (its a class)
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
 cv.fit_transform(new_df['tags']).toarray().shape  #5000 words and 4806 are movies

### Now each movie is in Vector Form

In [None]:
vectors

In [None]:
vectors[0]         #first movie (if the most occuring words are present in the tags section of this movie we will get a non zero value otherwise only zeros

In [None]:
cv.get_feature_names_out()  #most occuring 5000 words display

In [None]:
len(cv.get_feature_names_out())

### Displaying those 5000 words

In [None]:

def display_features(cv):
    # Get feature names
    feature_names = cv.get_feature_names_out()
    
    # Create a DataFrame to display the features
    features_df = pd.DataFrame({'Feature': feature_names})
    
    # Display the DataFrame
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(features_df)


In [232]:
display_features(cv)

                         Feature
0                            000
1                            007
2                             10
3                            100
4                             11
5                             12
6                             13
7                             14
8                             15
9                             16
10                            17
11                          17th
12                            18
13                          18th
14                   18thcenturi
15                            19
16                          1910
17                          1920
18                          1930
19                          1940
20                          1944
21                          1950
22                         1950s
23                          1960
24                         1960s
25                          1970
26                         1970s
27                          1971
28                          1974
29        

### Now these words have some similar words that should be removed such as action/actions are same and acitivity and activities are same we will replace them with one single word

### We will use a technique called stunning

### For this purpose we need to install a NLP Library

In [None]:
pip install nltk

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer 
ps = PorterStemmer()

In [None]:
def stem(text):
    y=[]        #for creating a list

    for i in text.split(): #convert string to list then we will stem every word
        y.append(ps.stem(i))

    return " ".join(y)     #converting list back to string
    
    

In [None]:
new_df['tags'].apply(stem)

In [None]:
new_df['tags'] = new_df['tags'].apply(stem)

In [None]:
ps.stem('loving')

In [None]:
new_df['tags'][0]

In [None]:
stem('in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron')

In [231]:
new_df['tags']

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4804    el mariachi just want to play hi guitar and ca...
4805    a newlyw couple' honeymoon is upend by the arr...
4806    "signed, sealed, delivered" introduc a dedic q...
4807    when ambiti new york attorney sam is sent to s...
4808    ever sinc the second grade when he first saw h...
Name: tags, Length: 4806, dtype: object

## We wont calculate euclidian distance bw the vectors bcz it calc tip to tip distance and fails for high dimensional data in our case it is 5000 dimensions so we will use cosine distance (which is basically angle between two vectors and the smaller the angle the more similar are the movies)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity #cosine similarity lies between 0 and 1 and have inverse rel with dist 0 means no similarity and vice versa

In [None]:
cosine_similarity(vectors)

In [None]:
cosine_similarity(vectors).shape

In [None]:
similarity = cosine_similarity(vectors)

In [None]:
similarity

In [None]:
similarity[0] #the distance of first movie with 4806 movies

In [None]:
similarity[0].shape

### Now fetching movie index


In [229]:
new_df['title'] == 'Avatar'

0        True
1       False
2       False
3       False
4       False
        ...  
4804    False
4805    False
4806    False
4807    False
4808    False
Name: title, Length: 4806, dtype: bool

In [230]:
new_df[new_df['title'] == 'Batman Begins'].index[0]

119

# Main Function that would give 5 similar movies if we input 1 movie

In [None]:
# def recommend(movie):
#     movie_index = new_df[new_df['title'] == movie].index[0]
#     distances = similarity[movie_index]                     #Now we need to sort the distances

    
    
#     return

In [None]:
sorted(similarity[0] , reverse = True)     #gives similarity in descending order and we need to pick top 5 but sorting is changing the index position

## We need to hold or freeze the movie indexes as sorting is changing it

In [221]:
sorted(list(enumerate(similarity[0])), reverse = True)  #for retaining the movie indexes even after sorting

[(4805, 0.0),
 (4804, 0.0),
 (4803, 0.04499212706658476),
 (4802, 0.046829290579084706),
 (4801, 0.019252140716412975),
 (4800, 0.0),
 (4799, 0.052631578947368425),
 (4798, 0.04223886030955117),
 (4797, 0.0),
 (4796, 0.0),
 (4795, 0.0),
 (4794, 0.0),
 (4793, 0.05407380704358751),
 (4792, 0.0),
 (4791, 0.0),
 (4790, 0.0582716546748065),
 (4789, 0.060833032924035954),
 (4788, 0.0),
 (4787, 0.019672236884115842),
 (4786, 0.0),
 (4785, 0.019672236884115842),
 (4784, 0.043355498476206004),
 (4783, 0.0),
 (4782, 0.025649458802128853),
 (4781, 0.0582716546748065),
 (4780, 0.0),
 (4779, 0.0),
 (4778, 0.0),
 (4777, 0.10814761408717502),
 (4776, 0.0),
 (4775, 0.05647824947249051),
 (4774, 0.0),
 (4773, 0.029617443887954616),
 (4772, 0.023918243661746996),
 (4771, 0.039344473768231684),
 (4770, 0.0),
 (4769, 0.0),
 (4768, 0.0),
 (4767, 0.03627381250550058),
 (4766, 0.017195436249022724),
 (4765, 0.0),
 (4764, 0.0),
 (4763, 0.0),
 (4762, 0.0),
 (4761, 0.02742042485535409),
 (4760, 0.02533472959690

In [222]:
list(enumerate(similarity[0]))

[(0, 1.0000000000000002),
 (1, 0.08346223261119858),
 (2, 0.08603090020146065),
 (3, 0.0734718358370645),
 (4, 0.1892994097121204),
 (5, 0.10838874619051501),
 (6, 0.04024218182927669),
 (7, 0.14673479641335554),
 (8, 0.05923488777590923),
 (9, 0.0967301666813349),
 (10, 0.10259783520851541),
 (11, 0.09464970485606021),
 (12, 0.09037128496931669),
 (13, 0.04499212706658476),
 (14, 0.12824729401064427),
 (15, 0.06282808624375433),
 (16, 0.07894736842105264),
 (17, 0.13977653617040256),
 (18, 0.09493290614465533),
 (19, 0.0830812984794528),
 (20, 0.058038100008800934),
 (21, 0.10968169942141635),
 (22, 0.0662266178532522),
 (23, 0.08740748201220976),
 (24, 0.0533380747062665),
 (25, 0.05101627678885769),
 (26, 0.15389675281277312),
 (27, 0.18693292157876878),
 (28, 0.116543309349613),
 (29, 0.065033247714309),
 (30, 0.06684847767323797),
 (31, 0.15907119074394446),
 (32, 0.08520286456846099),
 (33, 0.09733285267845754),
 (34, 0.0),
 (35, 0.09933992677987831),
 (36, 0.17316974359835272),


In [223]:
sorted(list(enumerate(similarity[0])), reverse = True,key = lambda x:x[1])  #for retaining the movie indexes even after sorting

[(0, 1.0000000000000002),
 (1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574),
 (582, 0.24511108480187255),
 (1202, 0.24455799402225925),
 (1192, 0.2367785320221084),
 (61, 0.23179316248638276),
 (778, 0.23174488732966073),
 (4046, 0.2278389747471728),
 (1916, 0.2252817784447915),
 (2782, 0.21853668936906193),
 (172, 0.21239769762143662),
 (972, 0.2108663315950723),
 (322, 0.2105263157894737),
 (2329, 0.20443988269091456),
 (3606, 0.20437977982832192),
 (260, 0.20395079136182276),
 (151, 0.2029530274475215),
 (4190, 0.2029530274475215),
 (1440, 0.20277677641345318),
 (74, 0.2024645717996314),
 (1087, 0.2020475485519274),
 (3671, 0.1979082783981174),
 (973, 0.19767387315371682),
 (577, 0.1976738731537168),
 (47, 0.19672236884115843),
 (2969, 0.1925214071641298),
 (942, 0.19134594929397597),
 (495, 0.19088542889273336),
 (1199, 0.19088542889273336),
 (305, 0.19007487139298027),
 (4, 0.189299409712

In [224]:
sorted(list(enumerate(similarity[0])), reverse = True,key = lambda x:x[1])[1:6] #for getting 1st 5 movies

[(1214, 0.28676966733820225),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [225]:
new_df.iloc[1214]

movie_id                                                  440
title                             Aliens vs Predator: Requiem
tags        a sequel to 2004' alien vs. predator, the icon...
Name: 1214, dtype: object

In [226]:
new_df.iloc[1214].title

'Aliens vs Predator: Requiem'

In [227]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]                     #Now we need to sort the distances
    movies_list = sorted(list(enumerate(distances)), reverse = True,key = lambda x:x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)
        # print(i[0]) replacing this with the above line and running the cell below will give us the index values only of the similar movies
    
    
    

In [228]:
recommend('Avatar') #The recommended 5 movies

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


# Now to send the movies list to our website code we would use pickle library

In [None]:
import pickle

In [None]:
pickle.dump(new_df,open('movie.pkl','wb'))  #wb means write binary mode and we are creating a pickle movie file that is getting
                                            # saved in this project directory than go to your pc , copy that file and paste it
                                            # in pycharm

In [None]:
new_df['title'].values

# there was a error loading dataframe as it is in the app code using pickle so we will convert dataframe to a dictionary

In [None]:
new_df.to_dict()

In [None]:
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [None]:
pickle.dump(similarity,open('similarity.pkl','wb'))