In [1]:
import pandas as pd
import numpy as np

In [3]:
import pandas as pd

# Try different parsing options
try:
    # Attempt to read the CSV file with default options
    movies = pd.read_csv('dataset.csv')
    print("CSV file read successfully with default options.")
except Exception as e:
    print("Error reading CSV file with default options:", e)

try:
    # Attempt to read the CSV file with tab delimiter
    movies_tab = pd.read_csv('dataset.csv', delimiter='\t')
    print("CSV file read successfully with tab delimiter.")
except Exception as e:
    print("Error reading CSV file with tab delimiter:", e)

try:
    # Attempt to read the CSV file with semicolon delimiter
    movies_semicolon = pd.read_csv('dataset.csv', delimiter=';')
    print("CSV file read successfully with semicolon delimiter.")
except Exception as e:
    print("Error reading CSV file with semicolon delimiter:", e)

try:
    # Attempt to read the CSV file with different quoting behavior
    movies_no_quote = pd.read_csv('dataset.csv', quoting=csv.QUOTE_NONE)
    print("CSV file read successfully with quoting=csv.QUOTE_NONE.")
except Exception as e:
    print("Error reading CSV file with quoting=csv.QUOTE_NONE:", e)


CSV file read successfully with default options.
Error reading CSV file with tab delimiter: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.

Error reading CSV file with semicolon delimiter: Error tokenizing data. C error: Expected 1 fields in line 50, saw 2

Error reading CSV file with quoting=csv.QUOTE_NONE: name 'csv' is not defined


In [5]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [6]:
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [8]:
movies['tags']=movies['genre']+movies['overview']

In [9]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,tags
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,"Drama,CrimeIn the continuing saga of the Corle..."


In [11]:
new_df=movies[['id','title','genre','overview','tags']]

In [13]:
new_df=new_df.drop(columns=['genre','overview'])

In [14]:
new_df.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer(max_features=1000, stop_words='english')

In [17]:
vec = cv.fit_transform(new_df['tags'].values.astype('U')).toarray()

In [18]:
vec.shape

(10000, 1000)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
sim = cosine_similarity(vec)

In [21]:
sim

array([[1.        , 0.11009638, 0.09534626, ..., 0.1254363 , 0.11396058,
        0.05025189],
       [0.11009638, 1.        , 0.17320508, ..., 0.        , 0.        ,
        0.        ],
       [0.09534626, 0.17320508, 1.        , ..., 0.0438529 , 0.05976143,
        0.        ],
       ...,
       [0.1254363 , 0.        , 0.0438529 , ..., 1.        , 0.05241424,
        0.04622502],
       [0.11396058, 0.        , 0.05976143, ..., 0.05241424, 1.        ,
        0.06299408],
       [0.05025189, 0.        , 0.        , ..., 0.04622502, 0.06299408,
        1.        ]])

In [22]:
new_df[new_df['title']=='The Shawshank Redemption']

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."


In [24]:
dist= sorted(list(enumerate(sim[0])),reverse=True,key=lambda vec:vec[1])

In [26]:
for i in dist[0:5]:
  print(new_df.iloc[i[0]].title)

The Shawshank Redemption
Anything for Her
The Getaway
Cool Hand Luke
Undisputed III: Redemption


In [27]:
def recommend(movies):
  index=new_df[new_df['title']==movies].index[0]
  distance=sorted(list(enumerate(sim[index])),reverse=True,key=lambda vec:vec[1])
  for i in distance[0:5]:
    print(new_df.iloc[i[0]].title)

In [28]:
recommend("Iron Man")

Iron Man
Marvel One-Shot: Item 47
Mazinger Z: Infinity
Star Wars: The Force Awakens
Insurgent
