# Movie Recommendation Project 
#### by Cosine similarity (content-based)

####

## Data Pre-Proccessing




#### Importing modules, reading data and deleting duplicates

In [95]:
import pandas as pd
import numpy as np

data = pd.read_csv('movies.csv')
data = data.drop_duplicates()
pd.set_option("display.max_columns", None)



##### Drop null values with no title/overview/release_date/poster_path


In [96]:
data = data.dropna(subset=['title', 'overview', 'release_date', 'poster_path'])

##### Keeping only Released and non pornographic content

In [97]:
data = data[data['status'] == 'Released']
data = data[data['adult'] == False]

##### Removing column which will have no impact on the vectors

In [98]:

data = data.drop(['homepage', 'status', 'backdrop_path', 'imdb_id', 'original_title', 'popularity'], axis = 1)


#### Filling important columns which will affect the vectors, will empty string instead of null value

In [99]:

data['genres'] = data['genres'].fillna("")
data['tagline'] = data['tagline'].fillna("")
data['production_companies'] = data['production_companies'].fillna("")
data['production_countries'] = data['production_countries'].fillna("")
data['spoken_languages'] = data['spoken_languages'].fillna("")
data['keywords'] = data['keywords'].fillna("")

##### Checking Null values if they are present still

In [100]:
data.isnull().sum()

id                      0
title                   0
vote_average            0
vote_count              0
release_date            0
revenue                 0
runtime                 0
adult                   0
budget                  0
original_language       0
overview                0
poster_path             0
tagline                 0
genres                  0
production_companies    0
production_countries    0
spoken_languages        0
keywords                0
dtype: int64

#### Creating the copy of top 15000 Movies

In [101]:
data_sorted = data.sort_values(by='vote_count', ascending=False) #first 15000 popular movies
data = data_sorted.iloc[:15000].copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15000 entries, 0 to 15070
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    15000 non-null  int64  
 1   title                 15000 non-null  object 
 2   vote_average          15000 non-null  float64
 3   vote_count            15000 non-null  int64  
 4   release_date          15000 non-null  object 
 5   revenue               15000 non-null  int64  
 6   runtime               15000 non-null  int64  
 7   adult                 15000 non-null  bool   
 8   budget                15000 non-null  int64  
 9   original_language     15000 non-null  object 
 10  overview              15000 non-null  object 
 11  poster_path           15000 non-null  object 
 12  tagline               15000 non-null  object 
 13  genres                15000 non-null  object 
 14  production_companies  15000 non-null  object 
 15  production_countries  15

##### Converting sentences to list of words and removing special characters and spaces

In [102]:

#removing spaces
data["genres"] = data["genres"].apply(lambda x: x.replace(" ", ""))
data["tagline"] = data["tagline"].apply(lambda x: x.replace(" ", ""))
data["production_companies"] = data["production_companies"].apply(lambda x: x.replace(" ", ""))
data["production_countries"] = data["production_countries"].apply(lambda x: x.replace(" ", ""))
data["spoken_languages"] = data["spoken_languages"].apply(lambda x: x.replace(" ", ""))
data["keywords"] = data["keywords"].apply(lambda x: x.replace(" ", ""))


#spliting by ',' to create list and storing it in dataframe again
data["genres"] = data["genres"].apply(lambda x: x.split(","))
data["tagline"] = data["tagline"].apply(lambda x: x.split(","))
data["production_companies"] = data["production_companies"].apply(lambda x: x.split(","))
data["production_countries"] = data["production_countries"].apply(lambda x: x.split(","))
data["spoken_languages"] = data["spoken_languages"].apply(lambda x: x.split(","))
data["keywords"] = data["keywords"].apply(lambda x: x.split(","))
data['overview'] = data['overview'].str.replace(r'[,.!?"]', '', regex=True)
data["overview"] = data["overview"].apply(lambda x: x.split())

data["release_date"] = data["release_date"].apply(lambda x: int(x[0:4]))




#### Scaling the numerical values

In [103]:
# Using min-max scaling to scale between 0 and 1
data['vote_average'] = (data['vote_average'] - data['vote_average'].min()) / (data['vote_average'].max() - data['vote_average'].min())
data['vote_count'] = (data['vote_count'] - data['vote_count'].min()) / (data['vote_count'].max() - data['vote_count'].min())
data['release_date'] = (data['release_date'] - data['release_date'].min()) / (data['release_date'].max() - data['release_date'].min())
data['revenue'] = (data['revenue'] - data['revenue'].min()) / (data['revenue'].max() - data['revenue'].min())
data['runtime'] = (data['runtime'] - data['runtime'].min()) / (data['runtime'].max() - data['runtime'].min())
data['budget'] = (data['budget'] - data['budget'].min()) / (data['budget'].max() - data['budget'].min())




#### Converting singular value coluns to list too

In [104]:
data["original_language"] = data["original_language"].apply(lambda x: [x])
data["adult"] = data["adult"].apply(lambda x: [x])

#### Creating a new Column 'tags'

In [105]:
data["tags"] = data["overview"]+data["genres"]+data["tagline"]+data["production_companies"]+data["production_countries"]+data["spoken_languages"]+data["keywords"]+data["original_language"]

#### Checking a random Movie

In [106]:
data.iloc[-2]

id                                                                 387399
title                                                            We Go On
vote_average                                                      0.51859
vote_count                                                            0.0
release_date                                                     0.948148
revenue                                                               0.0
runtime                                                          0.152659
adult                                                             [False]
budget                                                                0.0
original_language                                                    [en]
overview                [Paralyzed, by, his, fear, of, dying, Miles, G...
poster_path                              /dBk6ol8q7kzlU2o7C5LmNmIXCQP.jpg
tagline                                   [Somedoorsshouldneverbeopened.]
genres                                

#### Dropping columns which were used in making tags column

In [107]:
data = data.drop(['keywords', 'adult', 'original_language', 'overview', 'genres'], axis = 1)

In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15000 entries, 0 to 15070
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    15000 non-null  int64  
 1   title                 15000 non-null  object 
 2   vote_average          15000 non-null  float64
 3   vote_count            15000 non-null  float64
 4   release_date          15000 non-null  float64
 5   revenue               15000 non-null  float64
 6   runtime               15000 non-null  float64
 7   budget                15000 non-null  float64
 8   poster_path           15000 non-null  object 
 9   tagline               15000 non-null  object 
 10  production_companies  15000 non-null  object 
 11  production_countries  15000 non-null  object 
 12  spoken_languages      15000 non-null  object 
 13  tags                  15000 non-null  object 
dtypes: float64(6), int64(1), object(7)
memory usage: 1.7+ MB


#### Converting tags from list to string and converting it to all lower case

In [109]:
data["tags"] = data["tags"].apply(lambda x: " ".join(x))
data["tags"] = data["tags"].apply(lambda x: x.lower())

#### Using stem to convert words with same parent noun, as the same word
#### Eg. [Loved, Loves, Loving] will be converted to Love

In [110]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [111]:
data["tags"]=data["tags"].apply(stem)

#### Vectorizing using Term Frequency - Inverse Document Frequency to calculate word frequency

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack

#additional columns which will help sorting better movies from the similar movies
addon = data[['release_date','runtime', 'vote_average', 'revenue']]

#a total of 14000 words frequency to be calculated, stop words such as [the, is, are] are ignored
tfidf = TfidfVectorizer(max_features=14000, stop_words='english')
vec = tfidf.fit_transform(data['tags'])

#combining the matrix
combined_matrix = hstack([vec, addon])
combined_matrix = combined_matrix.tocsr()


In [113]:
combined_matrix.shape

(15000, 14004)

### Find similarity matrix using cosine distances

In [114]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(combined_matrix)



##### Writing the similarity matrix into a CSV file after sorting the rows in descending order and find top 20 similar movie indexes

In [115]:
import csv

# Open a new CSV file in write mode with UTF-8 encoding
with open('sorted_lists.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    for a in sim:
       
        ml = sorted(list(enumerate(a)), reverse=True, key=lambda x: x[1])[1:21]

        row = [index for index, similarity in ml]
        
        writer.writerow(row)

      


##### Writing Index into a CSV for accessing id, title, poster from the index, stored in similarity matrix 

In [116]:
data[['id','title','poster_path']].to_csv('movieindex.csv',index=False, header=True)