In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack, csr_matrix
import math

# IMDB and Movies merge

In [2]:
# Load the IMDB table from the CSV in the working directory.
imdb = pd.read_csv(filepath_or_buffer="imdb_top_1000.csv")

In [3]:
# Show (rows, columns) of the freshly loaded IMDB table.
imdb.shape

(1000, 16)

In [4]:
# Preview the first five rows of the raw IMDB table.
imdb.head(n=5)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [5]:
# Rename the IMDB title column to `title` so it shares a column name with the
# movies table used as the join key later on.
imdb = imdb.rename(columns={"Series_Title": "title"})

In [6]:
# Preview again to confirm the column is now called `title`.
imdb.head(n=5)

Unnamed: 0,Poster_Link,title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [7]:
# Keep only the two IMDB columns used downstream: the join key and the plot text.
imdb = imdb.loc[:, ["title", "Overview"]]

In [8]:
# Preview the trimmed IMDB table (title + Overview only).
imdb.head(n=5)

Unnamed: 0,title,Overview
0,The Shawshank Redemption,Two imprisoned men bond over a number of years...
1,The Godfather,An organized crime dynasty's aging patriarch t...
2,The Dark Knight,When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,The early life and career of Vito Corleone in ...
4,12 Angry Men,A jury holdout attempts to prevent a miscarria...


In [9]:
# Load the movies table (movieId, title, genres) from the working directory.
moviess = pd.read_csv(filepath_or_buffer="movie.csv")

In [10]:
# Show (rows, columns) of the freshly loaded movies table.
moviess.shape

(27278, 3)

In [11]:
# Remove a trailing " (YYYY)" release-year suffix so titles can be compared with
# the IMDB titles.  An anchored regex is used instead of slicing off the last
# seven characters: titles that lack the suffix (or carry trailing whitespace)
# are left intact, and re-running this cell is a no-op instead of chopping
# another seven characters off every title.
moviess["title"] = (
    moviess["title"]
    .str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
    .str.strip()
)

In [12]:
# Preview the movies table after the year suffix was stripped from `title`.
moviess.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


In [13]:
# Keep the first row for every distinct title so `title` is unique on this side
# of the upcoming join.
# NOTE(review): with the year removed, different films sharing a title collapse
# into whichever row comes first — confirm that is acceptable.
moviess = moviess.drop_duplicates(subset="title", keep="first")

In [14]:
# Show (rows, columns) after dropping repeated titles.
moviess.shape

(26216, 3)

In [15]:
# Inner-join the movies table with the IMDB table on the exact title string;
# rows without a match on both sides are discarded.
# NOTE(review): the join key is the title alone, so same-named films from
# different years would be paired — verify the matches.
Merged_movie_imdb = moviess.merge(imdb, how="inner", on="title")

In [16]:
# Show how many movies survived the title join, and the resulting column count.
Merged_movie_imdb.shape

(482, 4)

In [17]:
# Preview the joined movies/IMDB table.
Merged_movie_imdb.head(n=5)

Unnamed: 0,movieId,title,genres,Overview
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,A cowboy doll is profoundly threatened and jea...
1,6,Heat,Action|Crime|Thriller,A group of professional bank robbers start to ...
2,7,Sabrina,Comedy|Romance,A playboy becomes interested in the daughter o...
3,16,Casino,Crime|Drama,"A tale of greed, deception, money, power, and ..."
4,17,Sense and Sensibility,Drama|Romance,"Rich Mr. Dashwood dies, leaving his second wif..."


# Load the ratings data (merged with the movies below)

In [18]:
# Load the user ratings table from the working directory.
ratings = pd.read_csv(filepath_or_buffer="rating.csv")

In [19]:
# Show (rows, columns) of the freshly loaded ratings table.
ratings.shape

(20000263, 4)

In [20]:
# Preview the first five rating rows.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [21]:
# Disabled: de-duplicating the raw ratings per (userId, movieId) is not done
# here; the same de-duplication is applied later to the merged frame instead.
#ratings = ratings.drop_duplicates(subset=["userId",'movieId'], keep='last')

In [22]:
# Shape is unchanged here because the de-duplication above is commented out.
ratings.shape

(20000263, 4)

In [23]:
# Preview the ratings again (no transformation has been applied since loading).
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


# Merge the ratings with the merged movies/IMDB table

In [24]:
# Attach title, genres and Overview to every rating via an inner join on
# movieId; ratings for movies absent from the movies/IMDB table are dropped.
Final = ratings.merge(Merged_movie_imdb, how="inner", on="movieId")

In [25]:
# Preview the ratings enriched with movie metadata.
Final.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Overview
0,1,223,4.0,2005-04-02 23:46:13,Clerks,Comedy,A day in the lives of two convenience clerks n...
1,3,223,5.0,1999-12-11 13:20:44,Clerks,Comedy,A day in the lives of two convenience clerks n...
2,16,223,3.0,2001-05-27 13:26:17,Clerks,Comedy,A day in the lives of two convenience clerks n...
3,21,223,3.0,2001-06-10 16:24:31,Clerks,Comedy,A day in the lives of two convenience clerks n...
4,23,223,5.0,1998-12-24 00:05:12,Clerks,Comedy,A day in the lives of two convenience clerks n...


In [26]:
# Show (rows, columns) of the merged ratings table.
Final.shape

(4437944, 7)

In [27]:
# Keep exactly one row per (userId, movieId).
# `keep="last"` by itself retains whichever duplicate happens to be physically
# last, and the rows are not ordered by time.  Ordering by `timestamp` first
# (stable, so ties keep their original relative order) makes "last" mean the
# most recent rating.  The timestamps are "YYYY-MM-DD HH:MM:SS" strings, which
# sort chronologically as text.  `sort_index()` then restores the original row
# order so downstream cells see the same layout as before.
# NOTE(review): if the duplicates come from repeated keys in the movie table
# rather than repeated ratings, their timestamps tie and the row kept is the
# same one as before — confirm the source of the duplicate.
Final1 = (
    Final.sort_values("timestamp", kind="stable")
    .drop_duplicates(subset=["userId", "movieId"], keep="last")
    .sort_index()
)

In [28]:
# Show (rows, columns) after removing duplicate (userId, movieId) pairs.
Final1.shape

(4437943, 7)

In [29]:
# Count missing values per column to confirm the merged data is complete.
Final1.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
Overview     0
dtype: int64

In [30]:
# Order the rows by user so each user's ratings are grouped together.
Final2 = Final1.sort_values("userId", ascending=True)

# Final2 is the cleaned dataset; apply all downstream steps to it.

In [31]:
# Persist the cleaned table; the index is omitted from the output file.
Final2.to_csv(path_or_buf="cleaned.csv", index=False)