In [4]:

#importing necessary libraries
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
!pip install neo4j




[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
#loading the data
genome_scores_data = pd.read_csv(r"C:\Users\ADHISH S\Desktop\Datasets\ml-25m\ml-25m\genome-scores.csv") 
movies_data = pd.read_csv(r"C:\Users\ADHISH S\Desktop\Datasets\ml-25m\ml-25m\movies.csv") 
ratings_data = pd.read_csv(r"C:\Users\ADHISH S\Desktop\Datasets\ml-25m\ml-25m\ratings.csv")

In [7]:
#data exloration
genome_scores_data.head(10)


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075
5,1,6,0.14675
6,1,7,0.0635
7,1,8,0.20375
8,1,9,0.202
9,1,10,0.03075


In [8]:
genome_scores_data.columns

Index(['movieId', 'tagId', 'relevance'], dtype='object')

In [9]:
movies_data.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [10]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [11]:

# Pivot the data to create mov_tag_df
scores_pivot = genome_scores_data.pivot_table(index=['movieId'], columns=['tagId'], values='relevance').reset_index()

scores_pivot.head()


tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


In [12]:


# Joining the two DataFrames on the 'movieId' column
combined_df = movies_data.merge(scores_pivot, left_on='movieId', right_on='movieId', how='left')

# Filling null values with a suitable default value 
combined_df = combined_df.fillna(0)

# Dropping columns that are not used (e.g., 'title', 'genres')
columns_to_drop = ['title', 'genres']
combined_df = combined_df.drop(columns=columns_to_drop)

combined_df.head()



Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


In [13]:

# Splitting the 'genres' column and create genre columns
genre_columns = movies_data['genres'].str.get_dummies(sep='|')

mov_genres_df = pd.concat([movies_data, genre_columns], axis=1)

# Drop the original 'genres' column
mov_genres_df.drop(['title','genres'], axis = 1, inplace=True)
mov_genres_df.head()



Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:


# Define a function to extract the year from the 'title' field
def extract_year(title):
    import re
    year_match = re.search(r'\(\d{4}\)', title)
    if year_match:
        year = int(year_match.group()[1:5])
        return year
    else:
        return None

# Apply the function to create a 'year' column in 'movies_data'
movies_data['year'] = movies_data['title'].apply(extract_year)

# Merge 'movies_data' with 'ratings_data' to get rating information
merged_data = pd.merge(movies_data, ratings_data, on='movieId', how='inner')




In [15]:
merged_data = merged_data.drop(columns=['genres', 'userId', 'rating', 'timestamp'])


In [16]:
merged_data.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995.0
1,1,Toy Story (1995),1995.0
2,1,Toy Story (1995),1995.0
3,1,Toy Story (1995),1995.0
4,1,Toy Story (1995),1995.0


In [17]:
# Define a function to set the "year_group"
def set_year_group(year):
    if year < 1900:
        return 0
    elif 1900 <= year <= 1975:
        return 1
    elif 1976 <= year <= 1995:
        return 2
    elif 1996 <= year <= 2003:
        return 3
    elif 2004 <= year <= 2009:
        return 4
    elif 2010 <= year:
        return 5
    else:
        return 0
    
merged_data['year_group'] = merged_data.apply(lambda x: set_year_group(x['year']), axis=1)
#no need title and year fields
merged_data.drop(['title','year'], axis = 1, inplace=True)

In [18]:
movies_rat = ratings_data.groupby(['movieId']).agg({'rating': [np.size, np.mean]}).reset_index()
movies_rat.columns = ['movieId','rating_counts', 'rating_mean']
movies_rat.head()

Unnamed: 0,movieId,rating_counts,rating_mean
0,1,57309,3.893708
1,2,24228,3.251527
2,3,11804,3.142028
3,4,2523,2.853547
4,5,11714,3.058434


In [19]:
#defining function to group rating counts
def set_rating_group(rating_counts):
    if (rating_counts <= 1): return 0
    elif (2 <= rating_counts <= 10): return 1
    elif (11 <= rating_counts <= 100): return 2
    elif (101 <= rating_counts <= 1000): return 3
    elif (1001 <= rating_counts <= 5000): return 4
    elif (5001 <= rating_counts): return 5
    else: return 0
movies_rat['rating_group'] = movies_rat.apply(lambda x: set_rating_group(x['rating_counts']), axis=1)
#no need rating_counts field
movies_rat.drop('rating_counts', axis = 1, inplace=True)
mov_rating_df = merged_data.merge(movies_rat, left_on='movieId', right_on='movieId', how='left')
mov_rating_df = mov_rating_df.fillna(0)


In [20]:
mov_rating_df.head()

Unnamed: 0,movieId,year_group,rating_mean,rating_group
0,1,2,3.893708,5
1,1,2,3.893708,5
2,1,2,3.893708,5
3,1,2,3.893708,5
4,1,2,3.893708,5


In [21]:
mov_genres_df.head()


Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
combined_df.head()



Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


In [23]:
combined_df = combined_df.set_index('movieId')
mov_genres_df = mov_genres_df.set_index('movieId')
mov_rating_df = mov_rating_df.set_index('movieId')

In [24]:
# Get 15% of the data
sample_size = int(0.15 * len(combined_df))

# Select the first 15% of the data
combined_df_sample = combined_df.head(sample_size)
mov_genres_df_sample = mov_genres_df.head(sample_size)
mov_rating_df_sample = mov_rating_df.head(sample_size)



#cosine similarity for mov_tag_df
cos_tag = cosine_similarity(combined_df_sample.values)*0.5
#cosine similarity for mov_genres_df
cos_genres = cosine_similarity(mov_genres_df_sample.values)*0.25
#cosine similarity for mov_rating_df
cos_rating = cosine_similarity(mov_rating_df_sample.values)*0.25
#mix
cos = cos_tag+cos_genres+cos_rating


In [25]:
cols = combined_df_sample.index.values
inx = combined_df_sample.index
cos_sim_df = pd.DataFrame(cos, columns=cols, index=inx)
cos_sim_df.head()


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,27800,27801,27802,27803,27805,27808,27811,27812,27815,27816
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.831725,0.674146,0.637076,0.686369,0.560834,0.655332,0.753187,0.515286,0.633894,...,0.735374,0.575723,0.578037,0.564405,0.329057,0.636987,0.329057,0.543698,0.593746,0.615282
2,0.831725,1.0,0.589896,0.572155,0.575346,0.509743,0.57457,0.834748,0.561706,0.66787,...,0.66592,0.564855,0.549356,0.500037,0.25,0.556929,0.25,0.509122,0.547729,0.604961
3,0.674146,0.589896,1.0,0.829642,0.853264,0.566662,0.886444,0.605838,0.565555,0.586682,...,0.566597,0.589296,0.626221,0.550958,0.5,0.828961,0.375,0.545596,0.554486,0.567383
4,0.637076,0.572155,0.829642,1.0,0.76434,0.553911,0.857609,0.629322,0.544953,0.551794,...,0.56349,0.56443,0.650616,0.713554,0.454124,0.906792,0.454124,0.547241,0.716125,0.637112
5,0.686369,0.575346,0.853264,0.76434,1.0,0.512084,0.803402,0.581541,0.534361,0.544867,...,0.527277,0.540635,0.574835,0.529196,0.426777,0.782594,0.426777,0.516374,0.548519,0.528083


In [26]:
def get_similar(movieId):
    df = cos_sim_df.loc[cos_sim_df.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    return df
#create empty df
movies_similarity = pd.DataFrame(columns=['movieId','sim_moveId','relevance'])


In [27]:
for x in cos_sim_df.index.tolist():
    movies_similarity = movies_similarity.append(get_similar(x))





In [28]:
movies_similarity.head()

Unnamed: 0,movieId,sim_moveId,relevance
4780,1,4886,0.978072
3021,1,3114,0.976056
2264,1,2355,0.946367
2203,1,2294,0.942616
6258,1,6377,0.938101


In [29]:
def movie_recommender(movieId):
    df = cos_sim_df.loc[cos_sim_df.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    df['sim_moveId'] = df['sim_moveId'].astype(int)
    sim_df = movies_data.merge(df, left_on='movieId', right_on='sim_moveId', how='inner'). \
                sort_values('relevance', axis=0, ascending=False). \
                loc[: , ['movieId_y','title','genres']]. \
                rename(columns={ 'movieId_y': "movieId" })
    return sim_df

In [30]:
#get recommendation for Toy Story
movie_recommender(1)

Unnamed: 0,movieId,title,genres
3,1,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
2,1,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
1,1,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
0,1,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy
4,1,Finding Nemo (2003),Adventure|Animation|Children|Comedy


In [31]:
#get recommendation for Lock, Stock & Two Smoking Barrels
movie_recommender(2542)

Unnamed: 0,movieId,title,genres
3,2542,Snatch (2000),Comedy|Crime|Thriller
0,2542,Get Shorty (1995),Comedy|Crime|Thriller
4,2542,Two Hands (1999),Comedy|Crime|Thriller
1,2542,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,2542,Go (1999),Comedy|Crime


In [32]:
#get recommendation for Eternal Sunshine of the Spotless Mind
movie_recommender(7361)

Unnamed: 0,movieId,title,genres
0,7361,Open Your Eyes (Abre los ojos) (1997),Drama|Romance|Sci-Fi|Thriller
3,7361,Solaris (2002),Drama|Romance|Sci-Fi
2,7361,"Princess and the Warrior, The (Krieger und die...",Drama|Romance
1,7361,American Beauty (1999),Drama|Romance
4,7361,Before Sunset (2004),Drama|Romance


In [33]:
# Calculate the number of rows to select for each dataset (30% of its total number of rows)
sample_size_genome = int(0.1 * len(genome_scores_data))
sample_size_movies = int(0.1 * len(movies_data))
sample_size_ratings = int(0.1 * len(ratings_data))

# Getting the first 30% of the data from each dataset
genome_scores_data = genome_scores_data.head(sample_size_genome)
movies_data = movies_data.head(sample_size_movies)
ratings_data = ratings_data.head(sample_size_ratings)

In [34]:
#using ‘ratings.csv’ data and extract unique userIds.

users_df = pd.DataFrame(ratings_data['userId'].unique(), columns=['userId'])
users_df.head()

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5


In [35]:
#create movies_df
movies_df = movies_data.drop('genres', axis = 1)
#calculate mean of ratings for each movies
agg_rating_avg = ratings_data.groupby(['movieId']).agg({'rating': np.mean}).reset_index()
agg_rating_avg.columns = ['movieId', 'rating_mean']
#merge
movies_df = movies_df.merge(agg_rating_avg, left_on='movieId', right_on='movieId', how='left')
movies_df.head()

Unnamed: 0,movieId,title,year,rating_mean
0,1,Toy Story (1995),1995.0,3.903468
1,2,Jumanji (1995),1995.0,3.273569
2,3,Grumpier Old Men (1995),1995.0,3.127759
3,4,Waiting to Exhale (1995),1995.0,2.95749
4,5,Father of the Bride Part II (1995),1995.0,3.0841


In [36]:
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"]
genres_df = pd.DataFrame(genres, columns=['genres'])
genres_df.head()

Unnamed: 0,genres
0,Action
1,Adventure
2,Animation
3,Children
4,Comedy


In [37]:
users_movies_df = ratings_data.drop('timestamp', axis = 1)
users_movies_df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [38]:
movies_genres_df = movies_data.drop('title', axis = 1)


In [39]:
#defining a function to split genres field
def get_movie_genres(movieId):
    movie = movies_genres_df[movies_genres_df['movieId']==movieId]
    genres = movie['genres'].tolist()
    df = pd.DataFrame([b for a in [i.split('|') for i in genres] for b in a], columns=['genres'])
    df.insert(loc=0, column='movieId', value=movieId)
    return df

In [40]:
#create empty df
movies_genres=pd.DataFrame(columns=['movieId','genres'])
for x in movies_genres_df['movieId'].tolist():
    movies_genres=movies_genres.append(get_movie_genres(x))
movies_genres.head()

Unnamed: 0,movieId,genres
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy


In [41]:
#join to movies data to get genre information
user_genres_df = ratings_data.merge(movies_data, left_on='movieId', right_on='movieId', how='left')
#drop columns that will not be used
user_genres_df.drop(['movieId','rating','timestamp','title'], axis = 1, inplace=True)
user_genres_df.head()

Unnamed: 0,userId,genres,year
0,1,Comedy|Crime|Drama|Thriller,1994.0
1,1,Drama,1994.0
2,1,Drama,1993.0
3,1,Comedy|Drama|War,1995.0
4,1,Comedy|Musical|Romance,1952.0


In [42]:
def get_favorite_genre(userId):
    user = user_genres_df[user_genres_df['userId']==userId]
    genres = user['genres'].tolist()
    if not genres:
        return "No suggestion"
    else:
        movie_list = [b for a in [i.split('|') for i in genres] for b in a]
        counter = Counter(movie_list)
        return counter.most_common(1)[0][0]

In [43]:
user_genres_df.dropna(inplace=True)

In [44]:
#create empty df
users_genres = pd.DataFrame(columns=['userId','genre'])
for x in users_df['userId'].tolist():
    users_genres = users_genres.append(pd.DataFrame([[x,get_favorite_genre(x)]], columns=['userId','genre']))
users_genres.head()

Unnamed: 0,userId,genre
0,1,Drama
0,2,Drama
0,3,Drama
0,4,Comedy
0,5,Comedy


In [45]:
users_df.to_csv('users.csv', sep='|', header=True, index=False)
movies_df.to_csv('movies.csv', sep='|', header=True, index=False)
genres_df.to_csv('genres.csv', sep='|', header=True, index=False)
users_movies_df.to_csv('users_movies.csv', sep='|', header=True, index=False)
movies_genres.to_csv('movies_genres.csv', sep='|', header=True, index=False)
users_genres.to_csv('users_genres.csv', sep='|', header=True, index=False)
movies_similarity.to_csv('movies_similarity.csv', sep='|', header=True, index=False)

In [46]:

# Define a connection to the database
uri = "bolt://localhost:7687"  # Replace with your database URI
username = "neo4j"     # Replace with your username
password = "12345678"     # Replace with your password

# Create a connection to the database
driver = GraphDatabase.driver(uri, auth=(username, password))

In [47]:
session=driver.session()

In [49]:
cypher_query = """
LOAD CSV WITH HEADERS FROM "file:///users.csv" AS row
FIELDTERMINATOR '|'
CREATE (:Users {userId: row.userId});
"""

# Function to execute the Cypher query
def execute_cypher_query(query):
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        with driver.session() as session:
            result = session.write_transaction(lambda tx: tx.run(query))
            # Perform any necessary processing or error handling here
            return

# Execute the Cypher query
execute_cypher_query(cypher_query)

  # This is added back by InteractiveShellApp.init_path()


In [50]:
q1='MATCH (n:Users) RETURN n LIMIT 10'
nodes=session.run(q1)
for node in nodes:
    print(node)

<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:0' labels=frozenset({'Users'}) properties={'userId': '1'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:1' labels=frozenset({'Users'}) properties={'userId': '2'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:2' labels=frozenset({'Users'}) properties={'userId': '3'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:3' labels=frozenset({'Users'}) properties={'userId': '4'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:4' labels=frozenset({'Users'}) properties={'userId': '5'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:5' labels=frozenset({'Users'}) properties={'userId': '6'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:6' labels=frozenset({'Users'}) properties={'userId': '7'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:7' labels=frozenset({'Users'}) properties={'

In [51]:
cypher_query = """
LOAD CSV WITH HEADERS FROM 'file:///movies.csv' AS row
FIELDTERMINATOR '|'
CREATE (:Movies {movieId: row.movieId, title: row.title, rating_mean: row.rating_mean});
"""

# Function to execute the Cypher query
def execute_cypher_query(query):
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        with driver.session() as session:
            result = session.write_transaction(lambda tx: tx.run(query))
            # Perform any necessary processing or error handling here
            return

# Execute the Cypher query
execute_cypher_query(cypher_query)

  # This is added back by InteractiveShellApp.init_path()


In [52]:
q1='MATCH (n:Movies) RETURN n LIMIT 10'
nodes1=session.run(q1)
for node in nodes1:
    print(node)

<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:16658' labels=frozenset({'Movies'}) properties={'rating_mean': '3.9034683068511873', 'movieId': '1', 'title': 'Toy Story (1995)'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:16659' labels=frozenset({'Movies'}) properties={'rating_mean': '3.273569370111157', 'movieId': '2', 'title': 'Jumanji (1995)'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:16660' labels=frozenset({'Movies'}) properties={'rating_mean': '3.127758913412564', 'movieId': '3', 'title': 'Grumpier Old Men (1995)'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:16661' labels=frozenset({'Movies'}) properties={'rating_mean': '2.95748987854251', 'movieId': '4', 'title': 'Waiting to Exhale (1995)'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:16662' labels=frozenset({'Movies'}) properties={'rating_mean': '3.084100418410042', 'movieId': '5', 'title': 'Father of the Brid

In [53]:
cypher_query = """
LOAD CSV WITH HEADERS FROM 'file:///genres.csv' AS row
FIELDTERMINATOR '|'
CREATE (:Genres {genres: row.genres});
"""

# Function to execute the Cypher query
def execute_cypher_query(query):
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        with driver.session() as session:
            result = session.write_transaction(lambda tx: tx.run(query))
            # Perform any necessary processing or error handling here
            return

# Execute the Cypher query
execute_cypher_query(cypher_query)

  # This is added back by InteractiveShellApp.init_path()


In [54]:
q1='MATCH (n:Genres) RETURN n LIMIT 25'
nodes2=session.run(q1)
for node in nodes2:
    print(node)

<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:22900' labels=frozenset({'Genres'}) properties={'genres': 'Action'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:22901' labels=frozenset({'Genres'}) properties={'genres': 'Adventure'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:22902' labels=frozenset({'Genres'}) properties={'genres': 'Animation'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:22903' labels=frozenset({'Genres'}) properties={'genres': 'Children'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:22904' labels=frozenset({'Genres'}) properties={'genres': 'Comedy'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:22905' labels=frozenset({'Genres'}) properties={'genres': 'Crime'}>>
<Record n=<Node element_id='4:f29a9674-2f1a-4369-a5d0-6589ae322efc:22906' labels=frozenset({'Genres'}) properties={'genres': 'Documentary'}>>
<Record n=<Node element_id='4

In [55]:
cypher_query = """
LOAD CSV WITH HEADERS FROM 'file:///users_movies.csv' AS row
FIELDTERMINATOR '|'
MATCH (user:Users {userId: row.userId})
MATCH (movie:Movies {movieId: row.movieId})
MERGE (user)-[:WATCHED {rating: row.rating}]->(movie);
"""

# Function to execute the Cypher query
def execute_cypher_query(query):
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        with driver.session() as session:
            result = session.write_transaction(lambda tx: tx.run(query))
            # Perform any necessary processing or error handling here
            return

# Execute the Cypher query
execute_cypher_query(cypher_query)

  del sys.path[0]


In [None]:
q1='MATCH p=()-[r:WATCHED]->() RETURN p LIMIT 25'
nodes3=session.run(q1)
for node in nodes3:
    print(node)

In [None]:
cypher_query = """
LOAD CSV WITH HEADERS FROM 'file:///users_genres.csv' AS row
FIELDTERMINATOR '|'
MATCH (user:Users {userId: row.userId})
MATCH (genres:Genres {genres: row.genre})
MERGE (user)-[:FAVORITE]->(genres);
"""

# Function to execute the Cypher query
def execute_cypher_query(query):
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        with driver.session() as session:
            result = session.write_transaction(lambda tx: tx.run(query))
            # Perform any necessary processing or error handling here
            return

# Execute the Cypher query
execute_cypher_query(cypher_query)

In [None]:
q1='MATCH p=()-[:FAVORITE]->() RETURN p LIMIT 25'
nodes4=session.run(q1)
for node in nodes4:
    print(node)

In [None]:
cypher_query = """
LOAD CSV WITH HEADERS FROM 'file:///movies_genres.csv' AS row
FIELDTERMINATOR '|'
MATCH (movie:Movies {movieId: row.movieId})
MATCH (genres:Genres {genres: row.genres})
MERGE (movie)-[:GENRES]->(genres);
"""

# Function to execute the Cypher query
def execute_cypher_query(query):
    with GraphDatabase.driver(uri, auth=(username, password)) as driver:
        with driver.session() as session:
            result = session.write_transaction(lambda tx: tx.run(query))
            # Perform any necessary processing or error handling here
            return

# Execute the Cypher query
execute_cypher_query(cypher_query)

In [None]:
q1='MATCH p=()-[:GENRES]->() RETURN p LIMIT 25'
nodes5=session.run(q1)
for node in nodes5:
    print(node)

In [None]:
cypher_query = """
LOAD CSV WITH HEADERS FROM 'file:///movies_similarity.csv' AS row
FIELDTERMINATOR '|'
MATCH (movie1:Movies {movieId: row.movieId})
MATCH (movie2:Movies {movieId: row.sim_moveId})
MERGE (movie1)-[:SIMILAR {relevance: row.relevance}]->(movie2);
"""
execute_cypher_query(cypher_query)

In [None]:
q1='MATCH p=()-[:SIMILAR]->() RETURN p LIMIT 25'
nodes6=session.run(q1)
for node in nodes6:
    print(node)

In [None]:
# Define your Cypher query
q2 = "MATCH (u:Users)-[:WATCHED]->(m1:Movies) WHERE u.userId =~ '4' RETURN u.userId, m1.title, m1.rating_mean"

# Establish a session and run the query
with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) as driver:
    with driver.session() as session:
        result = session.run(q2)

        # Initialize a counter
        count = 0

        # Iterate through the result and print the first 5 records
        for record in result:
            if count < 5:
                print("user:",record["u.userId"], "Movie:",record["m1.title"],"rating:",record["m1.rating_mean"])
                count += 1
            else:
                break  # Exit the loop after printing 5 records


In [None]:

# Define your Cypher query
q4 = """
MATCH (u:Users)-[:WATCHED]->(m1:Movies)-[s:SIMILAR]->(m2:Movies)
WHERE u.userId =~ '4'
RETURN u.userId, m1.title, m2.title, m2.rating_mean
"""

# Establish a session and run the query
with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) as driver:
    with driver.session() as session:
        result = session.run(q4)
        count=0
        # Iterate through the result and print each record
        for record in result:
            if count < 5:
                print(record["u.userId"], record["m1.title"], record["m2.title"], record["m2.rating_mean"])
            else:
                break


In [None]:
# Define your Cypher query
q4 = """
MATCH (u:Users)-[:WATCHED]->(m1:Movies)-[s:SIMILAR]->(m2:Movies)
WHERE u.userId =~ '4'
RETURN u.userId, m1.title, m2.title, m2.rating_mean
"""

# Establish a session and run the query
with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) as driver:
    with driver.session() as session:
        result = session.run(q4)

        # Initialize a counter
        count = 0

        # Iterate through the result and print each record, up to 5 records
        for record in result:
            print("user:",record["u.userId"],"Movie:",record["m1.title"], record["m2.title"], record["m2.rating_mean"])
            count += 1

            # Limit to the first 5 records
            if count >= 5:
                break

In [None]:
# Define your Cypher query
q5 = """
MATCH (u1:Users)-[:WATCHED]->(m3:Movies)
WHERE u1.userId =~ '5'
WITH [i in m3.movieId | i] as movies
MATCH path = (u:Users)-[:WATCHED]->(m1:Movies)-[s:SIMILAR]->(m2:Movies),
(m2)-[:GENRES]->(g:Genres),
(u)-[:FAVORITE]->(g)
WHERE u.userId =~ '5' and not m2.movieId in movies
RETURN distinct u.userId as userId, g.genres as genres, 
m2.title as title, m2.rating_mean as rating
ORDER BY m2.rating_mean DESCENDING
LIMIT 5
"""

# Establish a session and run the query
with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) as driver:
    with driver.session() as session:
        result = session.run(q5)

        # Iterate through the result and print each record
        for record in result:
            print("user:",record["userId"],'genre:', record["genres"], 'movie:',record["title"],'rating:', record["rating"])
