In [73]:
!pip install convokit
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to /root/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [74]:
corpus.print_summary_stats()


Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [75]:
!pip install scikit-surprise



In [76]:
import pandas as pd
from surprise import Dataset,Reader,SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import matplotlib.pyplot as plt

In [77]:
#Basic statistics
print("Number of conversations:", len(corpus.conversations))
print("Number of users:", len(corpus.speakers))
print("Number of utterances:", len(corpus.utterances))

Number of conversations: 83097
Number of users: 9035
Number of utterances: 304713


In [78]:
#Information about conversations
for convo in corpus.iter_conversations():
    print("Conversation ID:", convo.get_id())
    print("Metadata:", convo.meta)
    print("Number of utterances in conversation:", len(convo.get_utterance_ids()))
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Number of utterances in conversation: 2

Conversation ID: L355817
Metadata: ConvoKitMeta({'movie_idx': 'm123', 'movie_name': 'lost highway', 'release_year': '1997', 'rating': '7.60', 'votes': '42998', 'genre': "['drama', 'horror', 'mystery', 'thriller']"})
Number of utterances in conversation: 8

Conversation ID: L355856
Metadata: ConvoKitMeta({'movie_idx': 'm123', 'movie_name': 'lost highway', 'release_year': '1997', 'rating': '7.60', 'votes': '42998', 'genre': "['drama', 'horror', 'mystery', 'thriller']"})
Number of utterances in conversation: 5

Conversation ID: L355851
Metadata: ConvoKitMeta({'movie_idx': 'm123', 'movie_name': 'lost highway', 'release_year': '1997', 'rating': '7.60', 'votes': '42998', 'genre': "['drama', 'horror', 'mystery', 'thriller']"})
Number of utterances in conversation: 4

Conversation ID: L355847
Metadata: ConvoKitMeta({'movie_idx': 'm123', 'movie_name': 'lost highway', 'release_year': '1997',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Conversation ID: L655326
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thriller']"})
Number of utterances in conversation: 2

Conversation ID: L655322
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thriller']"})
Number of utterances in conversation: 4

Conversation ID: L655320
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thriller']"})
Number of utterances in conversation: 2

Conversation ID: L654917
Metadata: ConvoKitMeta({'movie_idx': 'm606', 'movie_name': 'wild things', 'release_year': '1998', 'rating': '6.60', 'votes': '40523', 'genre': "['crime', 'mystery', 'thrill

In [79]:
import pandas as pd

# Extracting conversation metadata
data = [
    {
        'Conversation ID': convo.get_id(),
        'Movie Index': convo.meta['movie_idx'],
        'Movie Name': convo.meta['movie_name'],
        'Release Year': convo.meta['release_year'],
        'Rating': convo.meta['rating'],
        'Votes': convo.meta['votes'],
        'Genres': convo.meta['genre'],
        'Number of Utterances': len(convo.get_utterance_ids())
    }
    for convo in corpus.iter_conversations()
]

# Creating a DataFrame from the extracted data
df = pd.DataFrame(data)

In [80]:
#The DataFrame
print(df)


      Conversation ID Movie Index                  Movie Name Release Year  \
0               L1044          m0  10 things i hate about you         1999   
1                L984          m0  10 things i hate about you         1999   
2                L924          m0  10 things i hate about you         1999   
3                L870          m0  10 things i hate about you         1999   
4                L866          m0  10 things i hate about you         1999   
...               ...         ...                         ...          ...   
83092         L666324        m616                   zulu dawn         1979   
83093         L666262        m616                   zulu dawn         1979   
83094         L666520        m616                   zulu dawn         1979   
83095         L666369        m616                   zulu dawn         1979   
83096         L666256        m616                   zulu dawn         1979   

      Rating  Votes                                            

In [81]:
# Creating a Surprise Reader object to specify the rating scale
reader = Reader(rating_scale=(1, 10))


In [82]:
data = Dataset.load_from_df(df[['Conversation ID', 'Movie Name', 'Rating']], reader)

In [84]:
# Spliting the dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.3, random_state=42)

In [85]:
# Initializing model
model=SVD()

In [86]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a15e075aa10>

In [87]:
predictions=model.test(testset)

In [88]:
rmse=accuracy.rmse(predictions)
print(f'RMSE:{rmse:.2f}')

RMSE: 0.0701
RMSE:0.07


    An RMSE of 0.07 indicates that, on average, the model's predicted ratings are quite close to the actual ratings.

In [93]:
# Defining a function to get movie recommendations based on the SVD model
def get_movie_recommendations(movie_name, num_recommendations=10):
    all_movie_names = df['Movie Name'].unique()

    movie_ratings = [
        (other_movie, model.predict(movie_name, other_movie).est)
        for other_movie in all_movie_names
        if other_movie != movie_name
    ]

    movie_ratings.sort(key=lambda x: x[1], reverse=True)

    top_recommendations = movie_ratings[:num_recommendations]

    return top_recommendations


In [94]:
#Getting movie recommendations for a movie
movie_to_recommend = 'xxx'
recommendations = get_movie_recommendations(movie_to_recommend)
print(f"Top 10 movie recommendations for '{movie_to_recommend}':")
for idx, (other_movie, rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie Name: '{other_movie}', Estimated Rating: {rating:.2f}")

Top 10 movie recommendations for 'xxx':
1. Movie Name: 'neuromancer', Estimated Rating: 9.16
2. Movie Name: 'the godfather', Estimated Rating: 9.11
3. Movie Name: 'the godfather: part ii', Estimated Rating: 8.91
4. Movie Name: 'one flew over the cuckoo's nest', Estimated Rating: 8.83
5. Movie Name: 'fight club', Estimated Rating: 8.73
6. Movie Name: 'schindler's list', Estimated Rating: 8.73
7. Movie Name: 'casablanca', Estimated Rating: 8.72
8. Movie Name: 'star wars', Estimated Rating: 8.72
9. Movie Name: 'goodfellas', Estimated Rating: 8.66
10. Movie Name: 'rear window', Estimated Rating: 8.64


In [91]:
#Recommendation for a User
def get_movie_recommendations_for_user(user_id, num_recommendations=3):
    user_conversations = df[df['Conversation ID'].str.contains(user_id, case=False, na=False)]
    movies_interacted = user_conversations['Movie Name'].unique()
    movies_not_interacted = df[~df['Movie Name'].isin(movies_interacted)]['Movie Name'].unique()
    movie_ratings = [
        (movie_name, model.predict(user_id, movie_name).est)
        for movie_name in movies_not_interacted
    ]

    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = movie_ratings[:num_recommendations]

    return top_recommendations


In [95]:

#Getting movie recommendations for a user with user_id
user_id_to_recommend = 'L666256'
recommendations = get_movie_recommendations_for_user(user_id_to_recommend)
print(f"Top 3 movie recommendations for user '{user_id_to_recommend}':")
for idx, (movie_name, rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie Name: '{movie_name}', Estimated Rating: {rating:.2f}")


Top 3 movie recommendations for user 'L666256':
1. Movie Name: 'neuromancer', Estimated Rating: 9.16
2. Movie Name: 'the godfather', Estimated Rating: 9.11
3. Movie Name: 'the godfather: part ii', Estimated Rating: 8.91
