In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

nltk.download('stopwords')
nltk.download('punkt')

In [None]:
df = pd.read_csv("imdb-movies-dataset.csv")

In [4]:
df

Unnamed: 0,Poster,Title,Year,Certificate,Duration (min),Genre,Rating,Metascore,Director,Cast,Votes,Description,Review Count,Review Title,Review,preprocessed_description
0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023.0,R,115.0,"[Comedy, Drama, Romance]",6.4,67.0,Michael Showalter,"Anne Hathaway, Nicholas Galitzine, Ella Rubin,...",28744,"Solène, a 40-year-old single mom, begins an un...",166,Hypocrisy as an idea,"This film, as well as the reaction to it, is a...",solène single mom begins unexpected romance ha...
1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023.0,PG-13,145.0,"[Action, Adventure, Sci-Fi]",7.3,66.0,Wes Ball,"Owen Teague, Freya Allan, Kevin Durand, Peter ...",22248,"Many years after the reign of Caesar, a young ...",183,A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a...",many years reign caesar young ape goes journey...
2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023.0,PG-13,97.0,"[Biography, Comedy, History]",5.5,42.0,Jerry Seinfeld,"Isaac Bae, Jerry Seinfeld, Chris Rickett, Rach...",18401,"In 1963 Michigan, business rivals Kellogg's an...",333,not funny,Pretty much the worst criticism you can lay on...,michigan business rivals kellogg post compete ...
3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023.0,PG-13,126.0,"[Action, Comedy, Drama]",7.3,73.0,David Leitch,"Ryan Gosling, Emily Blunt, Aaron Taylor-Johnso...",38953,A down-and-out stuntman must find the missing ...,384,Everything you needed and more!,Just got out of the Austin premier at SXSW and...,stuntman must find missing star blockbuster film
4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023.0,R,131.0,"[Drama, Romance, Sport]",7.7,82.0,Luca Guadagnino,"Zendaya, Mike Faist, Josh O'Connor, Darnell Ap...",32517,"Tashi, a former tennis prodigy turned coach, t...",194,"Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...,tashi former tennis prodigy turned coach turne...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,https://m.media-amazon.com/images/M/MV5BMzg5MW...,The Greatest Show on Earth,2020.0,U,152.0,"[Drama, Family, Romance]",6.5,76.0,Cecil B. DeMille,"James Stewart, Charlton Heston, Betty Hutton, ...",16078,"The dramatic lives of trapeze artists, a clown...",128,"Hey, doesn't anyone remember Last Emperor?",It constantly amazes me that people carp that ...,dramatic lives trapeze artists clown elephant ...
9996,https://m.media-amazon.com/images/M/MV5BYzA0ZG...,Berserk: Ougon Jidai-hen I - Haou no Tamago,2020.0,,76.0,"[Animation, Action, Adventure]",7.5,,Toshiyuki Kubooka,"Hiroaki Iwanaga, Carrie Keranen, Takahiro Saku...",14300,A lone sellsword named Guts gets recruited int...,12,Masterfully directed climatic epic saga,Few stories can capture your mind and soul in ...,lone sellsword named guts gets recruited merce...
9997,https://m.media-amazon.com/images/M/MV5BM2U1Mj...,Is-slottet,2020.0,,78.0,"[Mystery, Drama]",6.5,,Per Blom,"Line Storesund, Hilde Nyeggen Martinsen, Meret...",740,A couple of twelve-year-old Norwegian girls st...,4,Beautiful Film,"This film might not be to everyone's taste, it...",couple norwegian girls struggle intense taboo ...
9998,https://m.media-amazon.com/images/M/MV5BMTAwOD...,Loving Pablo,2020.0,A,123.0,"[Biography, Crime, Drama]",6.4,42.0,Fernando León de Aranoa,"Javier Bardem, Penélope Cruz, Peter Sarsgaard,...",22447,A journalist strikes up a romantic relationshi...,84,That film should be in Spanish,Why anyone (the director?) made Spanish actors...,journalist strikes romantic relationship notor...


In [None]:
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

In [None]:
def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genre'])
combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
cosine_sim = cosine_similarity(combined_features, combined_features)

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return df['Title'].iloc[movie_indices]

In [None]:
def get_title_from_imdb_link(imdb_link):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    response = requests.get(imdb_link, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('h1')
        if title_tag:
            title = title_tag.text.strip()
            return title
    return None

In [None]:
imdb_link = input('Please paste your IMDb link here: ')


movie_title = get_title_from_imdb_link(imdb_link)
print(f"Extracted Title: {movie_title}")


if movie_title:
    recommendations = get_recommendations(movie_title)
    print(f"Movies recommended for '{movie_title}':")
    for movie in recommendations:
        print(movie)
else:
    print("Invalid IMDb link or movie not found in the dataset.")


In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genre'])
combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return df.iloc[movie_indices][['Title', 'Genre', 'Description']]

def get_title_from_imdb_link(imdb_link):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        response = requests.get(imdb_link, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('h1')
        if title_tag:
            return title_tag.text.strip()
    except requests.RequestException as e:
        print(f"Error fetching IMDb link: {e}")
    return None

def search_imdb_for_movie(movie_title):
    print(movie_title)
    api_url = f"http://www.omdbapi.com/?i={movie_title}&apikey=7d9eb382"
    response = requests.get(api_url)
    print(response)
    if response.status_code == 200:
        movie_data = response.json()
        if movie_data['Response'] == 'True':
            return {
                'Title': movie_data.get('Title', 'N/A'),
                'Description': movie_data.get('Plot', 'N/A'),
                'IMDb Link': f"https://www.imdb.com/title/{movie_data.get('imdbID', '')}"
            }
    return None

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    movie_title = get_title_from_imdb_link(imdb_link)
    
    if movie_title:
        print(f"Extracted Title: {movie_title}")
        recommendations = get_recommendations(movie_title)
        print(f"Movies recommended for '{movie_title}':")
        for idx, row in recommendations.iterrows():
            movie_info = search_imdb_for_movie(row['Title'])
            if movie_info:
                print(f"Title: {movie_info['Title']}\n"
                      f"Description: {movie_info['Description']}\n"
                      f"IMDb Link: {movie_info['IMDb Link']}\n")
            else:
                print(f"Title: {row['Title']}\nDescription: {row['Description']}\nIMDb Link: N/A\n")
    else:
        print("Invalid IMDb link or movie not found in the dataset.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0144084/?ref_=nv_sr_srsg_0_tt_8_nm_0_q_american%2520ps


Extracted Title: American Psycho
Movies recommended for 'American Psycho':
Maniac
<Response [200]>
Title: Maniac
Description: A psychopathic man goes on a killing and mutilation spree in New York City.
IMDb Link: N/A

Orphan: First Kill
<Response [200]>
Title: Orphan: First Kill
Description: After orchestrating a brilliant escape from an Estonian psychiatric facility, Esther travels to America by impersonating the missing daughter of a wealthy family.
IMDb Link: N/A

La tua presenza nuda!
<Response [200]>
Title: La tua presenza nuda!
Description: A wealthy author's second wife begins to suspect that her 12-year-old stepson may have murdered his mother, who mysteriously died in a bathtub accident.
IMDb Link: N/A



In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genre'])
combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return df.iloc[movie_indices]

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    movie_title = get_title_from_imdb_link(imdb_link)
    
    if movie_title:
        print(f"Extracted Title: {movie_title}")
        recommendations = get_recommendations(movie_title)
        print(f"Movies recommended for '{movie_title}':")
        for idx, row in recommendations.iterrows():
            print(f"Title: {row['Title']}\n"
                  f"Description: {row['Description']}\n"
                  f"Director: {row['Director']}\n"
                  f"Poster Link: {row['Poster']}\n")
    else:
        print("Invalid IMDb link or movie not found in the dataset.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0144084/?ref_=nv_sr_srsg_0_tt_8_nm_0_q_american%2520ps


NameError: name 'get_title_from_imdb_link' is not defined

In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genre'])
combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
cosine_sim = cosine_similarity(combined_features, combined_features)

b

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    movie_title = get_title_from_imdb_link(imdb_link)
    
    if movie_title:
        print(f"Extracted Title: {movie_title}")
        recommendations = get_recommendations(movie_title)['Title']
        print(f"Movies recommended for '{movie_title}':")
        for movie_title in recommendations:
            movie_info = df[df['Title'] == movie_title].iloc[0]
            print(f"Title: {movie_info['Title']}\n"
                  f"Description: {movie_info['Description']}\n"
                  f"Director: {movie_info['Director']}\n"
                  f"Poster Link: {movie_info['Poster']}\n")
    else:
        print("Invalid IMDb link or movie not found in the dataset.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0073650/?ref_=nv_sr_srsg_0_tt_3_nm_5_q_salo


Extracted Title: Salò, or the 120 Days of Sodom


TypeError: list indices must be integers or slices, not str

In [10]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return df.iloc[movie_indices]

get_recommendations("Salò, or the 120 Days of Sodom")

['Movie not found in the dataset.']

In [None]:
df[df["Title"] == "Salò, or the 120 Days of Sodom"]

In [37]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genre'])
combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    if title not in df['Title'].values:
        fill_missing_info(df, get_imdb_id_from_link(title))
        title = df.at[0, 'Title']
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)
    return df.iloc[movie_indices][['Title', 'Genre', 'Description']]

def extract_imdb_id(imdb_link):
    imdb_id = re.search(r'tt\d+', imdb_link)
    if imdb_id:
        return imdb_id.group(0)
    return None


def fill_missing_info(df, imdb_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    api_url = f"http://www.omdbapi.com/?i={imdb_id}&apikey=7d9eb382"
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        movie_data = response.json()
        if movie_data['Response'] == 'True':
            df.at[0, 'Title'] = movie_data.get('Title', 'N/A')
            df.at[0, 'Description'] = movie_data.get('Plot', 'N/A')
            df.at[0, 'Genre'] = movie_data.get('Genre', 'N/A').split(', ')
    return df

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    imdb_id = extract_imdb_id(imdb_link)
    if imdb_id:
        movie_info = {'IMDb Link': imdb_link, 'Title': '', 'Description': '', 'Genre': ''}
        df = pd.DataFrame([movie_info])
        fill_missing_info(df, imdb_id)
        print("Movie Title:", df.at[0, 'Title'])
        print("DataFrame:", df)
        movie_title = df.at[0, 'Title']
        recommendations = get_recommendations(movie_title, df=df)
        if isinstance(recommendations, list):
            print(recommendations[0])
        else:
            print(f"Movies recommended for '{movie_title}':")
            for idx, row in recommendations.iterrows():
                print(f"Title: {row['Title']}\n"
                      f"Description: {row['Description']}\n"
                      f"Genre: {row['Genre']}\n")
    else:
        print("Invalid IMDb link. Please provide a valid IMDb link.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0073650/?ref_=nv_sr_srsg_0_tt_3_nm_5_q_salo


Movie Title: Salò, or the 120 Days of Sodom
DataFrame:                                            IMDb Link  \
0  https://www.imdb.com/title/tt0073650/?ref_=nv_...   

                            Title  \
0  Salò, or the 120 Days of Sodom   

                                         Description    Genre  
0  In World War II Italy, four fascist libertines...  [Drama]  
[6699, 4628, 7088]


IndexError: positional indexers are out-of-bounds

In [38]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    if title not in df['Title'].values:
        fill_missing_info(df, get_imdb_id_from_link(title))
        title = df.at[0, 'Title']
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return movie_indices

In [39]:
get_recommendations("Salò, or the 120 Days of Sodom")


NameError: name 'get_imdb_id_from_link' is not defined

In [40]:
df.iloc[11000]

IndexError: single positional indexer is out-of-bounds

In [41]:
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
api_url = f"http://www.omdbapi.com/?i=tt0073650&apikey=7d9eb382"
response = requests.get(api_url, headers=headers)
if response.status_code == 200:
    movie_data = response.json()

In [42]:
movie_data

{'Title': 'Salò, or the 120 Days of Sodom',
 'Year': '1975',
 'Rated': 'TV-MA',
 'Released': '10 Jan 1976',
 'Runtime': '117 min',
 'Genre': 'Drama',
 'Director': 'Pier Paolo Pasolini',
 'Writer': 'Pier Paolo Pasolini, Sergio Citti, Pupi Avati',
 'Actors': 'Paolo Bonacelli, Giorgio Cataldi, Uberto Paolo Quintavalle',
 'Plot': 'In World War II Italy, four fascist libertines round up nine adolescent boys and girls and subject them to 120 days of physical, mental, and sexual torture.',
 'Language': 'Italian, French, German',
 'Country': 'Italy, France',
 'Awards': '1 win',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMzljYjk2YzAtZmM1Mi00MzI2LTgyMGEtODEyNmY1OGQ2YjNmXkEyXkFqcGdeQXVyMzU4ODM5Nw@@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '5.8/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '71%'}],
 'Metascore': 'N/A',
 'imdbRating': '5.8',
 'imdbVotes': '65,787',
 'imdbID': 'tt0073650',
 'Type': 'movie',
 'DVD': 'N/A',
 'BoxOffice': 'N/A',
 'Production

In [43]:
df.iloc[[6699, 4628, 7088]][["Title","Genre","Description"]]

Unnamed: 0,Title,Genre,Description
6699,Elizabethtown,"[Comedy, Drama, Romance]",During a hometown memorial for his Kentucky-bo...
4628,Home Again,"[Comedy, Drama, Romance]",Life for a single mom in Los Angeles takes an ...
7088,Ehrengard: The Art of Seduction,"[Comedy, Drama, Romance]",When a self-appointed expert on love tries to ...


In [44]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genre'])
combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    if title not in df['Title'].values:
        fill_missing_info(df, get_imdb_id_from_link(title))
        title = df.at[0, 'Title']
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return movie_indices

def extract_imdb_id(imdb_link):
    imdb_id = re.search(r'tt\d+', imdb_link)
    if imdb_id:
        return imdb_id.group(0)
    return None


def fill_missing_info(df, imdb_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    api_url = f"http://www.omdbapi.com/?i={imdb_id}&apikey=7d9eb382"
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        movie_data = response.json()
        if movie_data['Response'] == 'True':
            df.at[0, 'Title'] = movie_data.get('Title', 'N/A')
            df.at[0, 'Description'] = movie_data.get('Plot', 'N/A')
            df.at[0, 'Genre'] = movie_data.get('Genre', 'N/A').split(', ')
    return df

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    imdb_id = extract_imdb_id(imdb_link)
    if imdb_id:
        movie_info = {'IMDb Link': imdb_link, 'Title': '', 'Description': '', 'Genre': ''}
        df = pd.DataFrame([movie_info])
        fill_missing_info(df, imdb_id)
        print("Movie Title:", df.at[0, 'Title'])
        print("DataFrame:", df)
        movie_title = df.at[0, 'Title']
        recommendations = get_recommendations(movie_title, df=df)
        if isinstance(recommendations, list):
            print(recommendations[0])
        else:
            print(f"Movies recommended for '{movie_title}':")
            for idx, row in recommendations.iterrows():
                print(f"Title: {row['Title']}\n"
                      f"Description: {row['Description']}\n"
                      f"Genre: {row['Genre']}\n")
    else:
        print("Invalid IMDb link. Please provide a valid IMDb link.")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0073650/?ref_=nv_sr_srsg_0_tt_3_nm_5_q_salo


Movie Title: Salò, or the 120 Days of Sodom
DataFrame:                                            IMDb Link  \
0  https://www.imdb.com/title/tt0073650/?ref_=nv_...   

                            Title  \
0  Salò, or the 120 Days of Sodom   

                                         Description    Genre  
0  In World War II Italy, four fascist libertines...  [Drama]  
6699


In [46]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction function
def compute_features(df):
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(df['Genre'])
    combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
    return combined_features

combined_features = compute_features(df)
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim, df):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return df.iloc[movie_indices][['Title', 'Genre', 'Description']]

def extract_imdb_id(imdb_link):
    imdb_id = re.search(r'tt\d+', imdb_link)
    if imdb_id:
        return imdb_id.group(0)
    return None

def fill_missing_info(df, imdb_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    api_url = f"http://www.omdbapi.com/?i={imdb_id}&apikey=7d9eb382"
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        movie_data = response.json()
        if movie_data['Response'] == 'True':
            new_movie = {
                'Title': movie_data.get('Title', 'N/A'),
                'Description': movie_data.get('Plot', 'N/A'),
                'Genre': movie_data.get('Genre', 'N/A').split(', '),
                'preprocessed_description': preprocess_text(movie_data.get('Plot', 'N/A'))
            }
            new_movie_df = pd.DataFrame([new_movie])
            df = pd.concat([df, new_movie_df], ignore_index=True)
    return df

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    imdb_id = extract_imdb_id(imdb_link)
    if imdb_id:
        global df, combined_features, cosine_sim
        df = fill_missing_info(df, imdb_id)
        if not df.empty:
            df['preprocessed_description'] = df['Description'].apply(preprocess_text)
            combined_features = compute_features(df)
            cosine_sim = cosine_similarity(combined_features, combined_features)
            movie_title = df.iloc[-1]['Title']
            print("Movie Title:", movie_title)
            print("DataFrame:", df.iloc[-1])
            recommendations = get_recommendations(movie_title, cosine_sim, df)
            print(f"Movies recommended for '{movie_title}':")
            for idx, row in recommendations.iterrows():
                print(f"Title: {row['Title']}\n"
                      f"Description: {row['Description']}\n"
                      f"Genre: {row['Genre']}\n")
        else:
            print("Failed to retrieve movie information from IMDb.")
    else:
        print("Invalid IMDb link. Please provide a valid IMDb link.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0073650/?ref_=nv_sr_srsg_0_tt_3_nm_5_q_salo


Movie Title: Salò, or the 120 Days of Sodom
DataFrame: Poster                                                                    NaN
Title                                          Salò, or the 120 Days of Sodom
Year                                                                      NaN
Certificate                                                               NaN
Duration (min)                                                            NaN
Genre                                                                 [Drama]
Rating                                                                    NaN
Metascore                                                                 NaN
Director                                                                  NaN
Cast                                                                      NaN
Votes                                                                     NaN
Description                 In World War II Italy, four fascist libertines...
Review Co

In [53]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction function
def compute_features(df):
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(df['Genre'])
    combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
    return combined_features

combined_features = compute_features(df)
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim, df):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [score for score in sim_scores if score[0] != idx]  # Exclude the movie itself
    sim_scores = sim_scores[:3]
    movie_indices = [i[0] for i in sim_scores]
    return df.iloc[movie_indices][['Title', 'Genre', 'Description']]

def extract_imdb_id(imdb_link):
    imdb_id = re.search(r'tt\d+', imdb_link)
    if imdb_id:
        return imdb_id.group(0)
    return None

def fill_missing_info(df, imdb_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    api_url = f"http://www.omdbapi.com/?i={imdb_id}&apikey=7d9eb382"
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        movie_data = response.json()
        if movie_data['Response'] == 'True':
            new_movie = {
                'Title': movie_data.get('Title', 'N/A'),
                'Description': movie_data.get('Plot', 'N/A'),
                'Genre': movie_data.get('Genre', 'N/A').split(', '),
                'preprocessed_description': preprocess_text(movie_data.get('Plot', 'N/A'))
            }
            new_movie_df = pd.DataFrame([new_movie])
            df = pd.concat([df, new_movie_df], ignore_index=True)
    return df

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    imdb_id = extract_imdb_id(imdb_link)
    if imdb_id:
        global df, combined_features, cosine_sim
        df = fill_missing_info(df, imdb_id)
        if not df.empty:
            df['preprocessed_description'] = df['Description'].apply(preprocess_text)
            combined_features = compute_features(df)
            cosine_sim = cosine_similarity(combined_features, combined_features)
            movie_title = df.iloc[-1]['Title']
            print("Movie Title:", movie_title)
            print("DataFrame:", df.iloc[-1])
            recommendations = get_recommendations(movie_title, cosine_sim, df)
            print(f"Movies recommended for '{movie_title}':")
            for idx, row in recommendations.iterrows():
                print(f"Title: {row['Title']}\n"
                      f"Description: {row['Description']}\n"
                      f"Genre: {row['Genre']}\n")
        else:
            print("Failed to retrieve movie information from IMDb.")
    else:
        print("Invalid IMDb link. Please provide a valid IMDb link.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0144084/?ref_=nv_sr_srsg_0_tt_8_nm_0_q_american%2520ps


Movie Title: American Psycho
DataFrame: Poster                                                                    NaN
Title                                                         American Psycho
Year                                                                      NaN
Certificate                                                               NaN
Duration (min)                                                            NaN
Genre                                                  [Crime, Drama, Horror]
Rating                                                                    NaN
Metascore                                                                 NaN
Director                                                                  NaN
Cast                                                                      NaN
Votes                                                                     NaN
Description                 A wealthy New York City investment banking exe...
Review Count            

In [58]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction function
def compute_features(df):
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(df['Genre'])
    combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
    return combined_features

combined_features = compute_features(df)
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim, df):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:5]  # Get the second to fourth highest scores
    movie_indices = [i[0] for i in sim_scores]
    return df.iloc[movie_indices][['Title', 'Genre', 'Description']]

def extract_imdb_id(imdb_link):
    imdb_id = re.search(r'tt\d+', imdb_link)
    if imdb_id:
        return imdb_id.group(0)
    return None

def fill_missing_info(df, imdb_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    api_url = f"http://www.omdbapi.com/?i={imdb_id}&apikey=7d9eb382"
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        movie_data = response.json()
        if movie_data['Response'] == 'True':
            new_movie = {
                'Title': movie_data.get('Title', 'N/A'),
                'Description': movie_data.get('Plot', 'N/A'),
                'Genre': movie_data.get('Genre', 'N/A').split(', '),
                'preprocessed_description': preprocess_text(movie_data.get('Plot', 'N/A'))
            }
            new_movie_df = pd.DataFrame([new_movie])
            df = pd.concat([df, new_movie_df], ignore_index=True)
    return df

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    imdb_id = extract_imdb_id(imdb_link)
    if imdb_id:
        global df, combined_features, cosine_sim
        df = fill_missing_info(df, imdb_id)
        if not df.empty:
            df['preprocessed_description'] = df['Description'].apply(preprocess_text)
            combined_features = compute_features(df)
            cosine_sim = cosine_similarity(combined_features, combined_features)
            movie_title = df.iloc[-1]['Title']
            print("Movie Title:", movie_title)
            print("DataFrame:", df.iloc[-1])
            recommendations = get_recommendations(movie_title, cosine_sim, df)
            print(f"Movies recommended for '{movie_title}':")
            recommended_titles = []
            for idx, row in recommendations.iterrows():
                if row['Title'] != movie_title and row['Title'] not in recommended_titles:
                    recommended_titles.append(row['Title'])
                    print(f"Title: {row['Title']}\n"
                          f"Description: {row['Description']}\n"
                          f"Genre: {row['Genre']}\n")
                    if len(recommended_titles) == 4:
                        break
        else:
            print("Failed to retrieve movie information from IMDb.")
    else:
        print("Invalid IMDb link. Please provide a valid IMDb link.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0144084/?ref_=nv_sr_srsg_0_tt_8_nm_0_q_american%2520ps


Movie Title: American Psycho
DataFrame: Poster                                                                    NaN
Title                                                         American Psycho
Year                                                                      NaN
Certificate                                                               NaN
Duration (min)                                                            NaN
Genre                                                  [Crime, Drama, Horror]
Rating                                                                    NaN
Metascore                                                                 NaN
Director                                                                  NaN
Cast                                                                      NaN
Votes                                                                     NaN
Description                 A wealthy New York City investment banking exe...
Review Count            

In [59]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# NLTK setup
nltk.download('stopwords')
nltk.download('punkt')

# Load and preprocess dataset
df = pd.read_csv("imdb-movies-dataset.csv")
df['Genre'] = df['Genre'].fillna('')
df['Genre'] = df['Genre'].astype(str)
df['Genre'] = df['Genre'].apply(lambda x: x.split(', '))

def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['preprocessed_description'] = df['Description'].apply(preprocess_text)

# Feature extraction function
def compute_features(df):
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_description'])
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(df['Genre'])
    combined_features = np.hstack((tfidf_matrix.toarray(), genre_matrix))
    return combined_features

combined_features = compute_features(df)
cosine_sim = cosine_similarity(combined_features, combined_features)

def get_recommendations(title, cosine_sim, df):
    if title not in df['Title'].values:
        return ["Movie not found in the dataset."]
    
    idx = df[df['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:5]  # Get the second to fourth highest scores
    movie_indices = [i[0] for i in sim_scores]
    return df.iloc[movie_indices][['Title', 'Genre', 'Description', 'Director', 'Poster']]

def extract_imdb_id(imdb_link):
    imdb_id = re.search(r'tt\d+', imdb_link)
    if imdb_id:
        return imdb_id.group(0)
    return None

def fill_missing_info(df, imdb_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    api_url = f"http://www.omdbapi.com/?i={imdb_id}&apikey=7d9eb382"
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        movie_data = response.json()
        if movie_data['Response'] == 'True':
            new_movie = {
                'Title': movie_data.get('Title', 'N/A'),
                'Description': movie_data.get('Plot', 'N/A'),
                'Genre': movie_data.get('Genre', 'N/A').split(', '),
                'preprocessed_description': preprocess_text(movie_data.get('Plot', 'N/A'))
            }
            new_movie_df = pd.DataFrame([new_movie])
            df = pd.concat([df, new_movie_df], ignore_index=True)
    return df

def main():
    imdb_link = input('Please paste your IMDb link here: ')
    imdb_id = extract_imdb_id(imdb_link)
    if imdb_id:
        global df, combined_features, cosine_sim
        df = fill_missing_info(df, imdb_id)
        if not df.empty:
            df['preprocessed_description'] = df['Description'].apply(preprocess_text)
            combined_features = compute_features(df)
            cosine_sim = cosine_similarity(combined_features, combined_features)
            movie_title = df.iloc[-1]['Title']
            print("Movie Title:", movie_title)
            print("DataFrame:", df.iloc[-1])
            recommendations = get_recommendations(movie_title, cosine_sim, df)
            print(f"Movies recommended for '{movie_title}':")
            recommended_titles = []
            for idx, row in recommendations.iterrows():
                if row['Title'] != movie_title and row['Title'] not in recommended_titles:
                    recommended_titles.append(row['Title'])
                    print(f"Title: {row['Title']}\n"
                          f"Description: {row['Description']}\n"
                          f"Genre: {row['Genre']}\n"
                          f"Director: {row['Director']}\n"
                          f"Poster: {row['Poster']}\n")
                    if len(recommended_titles) == 4:
                        break
        else:
            print("Failed to retrieve movie information from IMDb.")
    else:
        print("Invalid IMDb link. Please provide a valid IMDb link.")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Please paste your IMDb link here:  https://www.imdb.com/title/tt0144084/?ref_=nv_sr_srsg_0_tt_8_nm_0_q_american%2520ps


Movie Title: American Psycho
DataFrame: Poster                                                                    NaN
Title                                                         American Psycho
Year                                                                      NaN
Certificate                                                               NaN
Duration (min)                                                            NaN
Genre                                                  [Crime, Drama, Horror]
Rating                                                                    NaN
Metascore                                                                 NaN
Director                                                                  NaN
Cast                                                                      NaN
Votes                                                                     NaN
Description                 A wealthy New York City investment banking exe...
Review Count            