<a href="https://colab.research.google.com/github/aisyahkhns/Encryptix/blob/main/Recsys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the Netflix CSV under the Drive mount point; edit to match where your copy lives.
file_path = '/content/drive/My Drive/dataset1/netflix.csv'  # Update with your file path

In [None]:
# Echo the configured path to confirm it before loading.
file_path

'/content/drive/My Drive/dataset1/netflix.csv'

In [None]:
# Load the raw CSV unchanged; column and row cleanup happens in a later cell.
data = pd.read_csv(file_path)

In [None]:
# Preview the raw frame to see which columns and rows need cleaning.
data

Unnamed: 0,Name,Year,Age Rating,Duration,Category,Unnamed: 6.1,Unnamed: 6,Unnamed: 7,Unnamed: 8,Age Rating.1,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
0,The Sea Beast,2022.0,7+,1h 59m,Kids & Family Movies,,,,,7+,...,13+,,18+,,16+,,All,,Total Count of Duration,Total Sum of Year
1,Sonic the Hedgehog,2020.0,7+,1h 41m,Kids & Family Movies,,,,Duration,Count of Duration,...,Count of Duration,Sum of Year,Count of Duration,Sum of Year,Count of Duration,Sum of Year,Count of Duration,Sum of Year,,
2,Tom and Jerry,2021.0,7+,1h 41m,Kids & Family Movies,,,,1h 30m,7,...,4,8076,2,4040,,,,,13,26211
3,We Can Be Heroes,2020.0,7+,1h 40m,Kids & Family Movies,,,,1h 46m,7,...,2,4043,,,3,6054,,,12,24236
4,STAND BY ME Doraemon 2,2020.0,7+,1h 37m,Kids & Family Movies,,,,1h 59m,4,...,5,10092,1,2016,2,4032,,,12,24177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2724,Mom,2017.0,16+,2h 18m,Dramas,,,,,,...,,,,,,,,,,
2725,The Legend of Bhagat Singh,2002.0,16+,2h 35m,Movies Based on Real Life,,,,,,...,,,,,,,,,,
2726,Article 15,2019.0,16+,2h 5m,Dramas,,,,,,...,,,,,,,,,,
2727,We Are Family,2010.0,13+,1h 58m,Dramas,,,,,,...,,,,,,,,,,


In [None]:
# Step 1: Discard columns that carry no usable signal.
# A column goes when pandas auto-named it ('Unnamed: N') or when more than
# half of its values are missing.
columns_to_drop = []
for col in data.columns:
    is_auto_named = 'Unnamed' in col
    if is_auto_named or data[col].isnull().sum() / len(data) > 0.5:
        columns_to_drop.append(col)
data_cleaned = data.drop(columns=columns_to_drop)

# Step 2: Convert "Duration" to numeric (total minutes)
def duration_to_minutes(duration):
    """Convert a duration label such as ``'1h 59m'`` into total minutes.

    Accepted forms are hours and minutes (``'1h 59m'``), hours only
    (``'2h'``) and minutes only (``'45m'``). Surrounding or inner whitespace
    and letter case are ignored.

    Args:
        duration: Raw value from the ``Duration`` column.

    Returns:
        int | None: Total minutes, or ``None`` when ``duration`` is not a
        string or does not look like a duration, so the row can be removed
        with ``dropna`` instead of being recorded as 0 minutes.
    """
    import re  # local import keeps the function usable on its own

    if not isinstance(duration, str):
        return None

    # Both parts are optional individually, but at least one must be present.
    match = re.fullmatch(
        r'\s*(?:(\d+)\s*h)?\s*(?:(\d+)\s*m)?\s*',
        duration,
        flags=re.IGNORECASE,
    )
    if match is None or (match.group(1) is None and match.group(2) is None):
        return None

    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    return hours * 60 + minutes

# Derive a numeric duration column; non-string values (e.g. NaN) come back
# as missing and are removed in Step 3.
data_cleaned['Duration_Minutes'] = data_cleaned['Duration'].apply(duration_to_minutes)

# Step 3: Handle missing or irrelevant rows
data_cleaned = data_cleaned.dropna(subset=['Name', 'Year', 'Category', 'Duration_Minutes'])

# Step 4: Keep relevant columns
# .copy() gives data_final its own data, so columns added to it in later
# cells do not trigger pandas' SettingWithCopyWarning.
relevant_columns = ['Name', 'Year', 'Age Rating', 'Duration_Minutes', 'Category']
data_final = data_cleaned[relevant_columns].copy()

# Save or inspect cleaned dataset
data_final.to_csv('cleaned_netflix.csv', index=False)
print(data_final.head())

                     Name    Year Age Rating  Duration_Minutes  \
0           The Sea Beast  2022.0         7+             119.0   
1      Sonic the Hedgehog  2020.0         7+             101.0   
2           Tom and Jerry  2021.0         7+             101.0   
3        We Can Be Heroes  2020.0         7+             100.0   
4  STAND BY ME Doraemon 2  2020.0         7+              97.0   

               Category  
0  Kids & Family Movies  
1  Kids & Family Movies  
2  Kids & Family Movies  
3  Kids & Family Movies  
4  Kids & Family Movies  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import TruncatedSVD
import numpy as np

In [None]:
# Step 5: Content-Based Filtering
# Build one text document per title from its category and age rating.
# The age rating is encoded as a single tagged token ('7+' -> 'rating_7',
# 'All' -> 'rating_All') because the vectorizer's default tokenizer drops
# one-character tokens such as '7', and 'all' is an English stop word; the raw
# labels would otherwise contribute nothing to the TF-IDF features.
# Missing ratings become 'rating_unknown' so no document is NaN.
age_rating_token = 'rating_' + (
    data_final['Age Rating']
    .fillna('unknown')
    .astype(str)
    .str.replace(r'\W+', '', regex=True)
)

# Normalize numeric feature (Duration_Minutes) to the [0, 1] range.
# Duration_Scaled is stored on the frame but is not used by the similarity
# computed in this cell.
scaler = MinMaxScaler()
duration_scaled = scaler.fit_transform(data_final[['Duration_Minutes']]).ravel()

# assign() returns a new frame, so this works without SettingWithCopyWarning
# even if data_final was created as a slice of another frame.
data_final = data_final.assign(
    Combined_Features=data_final['Category'] + ' ' + age_rating_token,
    Duration_Scaled=duration_scaled,
)

# Apply TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data_final['Combined_Features'])

# Calculate content-based similarity. cosine_similarity accepts the sparse
# TF-IDF matrix directly and returns a dense (n_movies, n_movies) array, so
# there is no need to densify the input first.
content_similarity = cosine_similarity(tfidf_matrix)

# Step 6: Collaborative Filtering
# No real ratings are available, so fabricate a reproducible table of integer
# ratings in 1..5 with one row per simulated user and one column per movie.
np.random.seed(42)
n_simulated_users = 10
user_item_matrix = np.random.randint(
    low=1,
    high=6,
    size=(n_simulated_users, len(data_final)),
)

# Factorise the rating table into 5 latent dimensions.
svd_model = TruncatedSVD(n_components=5, random_state=42)
user_factors = svd_model.fit_transform(user_item_matrix)
item_factors = svd_model.components_.T

# Low-rank reconstruction of the ratings (users x movies); despite the name,
# each entry is a predicted user-movie score rather than a movie-movie similarity.
collaborative_similarity = user_factors @ item_factors.T

# Step 7: Hybrid Recommendation
num_movies = content_similarity.shape[0]  # Get the number of movies

# Movie-to-movie collaborative similarity: cosine similarity between the
# columns of the reconstructed rating matrix (one column of predicted user
# scores per movie). The result is a symmetric (num_movies, num_movies) matrix
# whose values are bounded like the content similarity, so neither term
# dominates the blend, and it is computed in one vectorised call.
collaborative_similarity_movie = cosine_similarity(collaborative_similarity.T)

# Combine content-based and collaborative similarity with equal weights.
# This must happen before any lookup so the cell runs on a fresh kernel.
hybrid_similarity = 0.5 * content_similarity + 0.5 * collaborative_similarity_movie

movie_id = 0  # You can change this to any valid (non-negative) movie index
top_n = 5     # Number of recommendations to return

# Rank every movie by hybrid score (best first), remove the query movie
# explicitly -- it is not guaranteed to rank first -- and keep the top_n rest.
ranked_indices = np.argsort(hybrid_similarity[movie_id])[::-1]
similar_movies_indices = ranked_indices[ranked_indices != movie_id][:top_n]

# Get the recommended movies (positional lookup matches the matrix row order)
recommended_movies = data_final.iloc[similar_movies_indices]

# Output the top recommendations
print("Top recommendations for:", data_final.iloc[movie_id]['Name'])
print(recommended_movies[['Name', 'Category', 'Age Rating', 'Duration_Minutes']])

Top recommendations for: The Sea Beast
                                    Name              Category Age Rating  \
71    Riverdance: The Animated Adventure  Kids & Family Movies         7+   
115  Cloudy with a Chance of Meatballs 2  Kids & Family Movies         7+   
114                        DJ Cinderella  Kids & Family Movies         7+   
197                      Back of the Net  Kids & Family Movies         7+   

     Duration_Minutes  
71               93.0  
115              94.0  
114              96.0  
197              86.0  
