In [1]:
import os
# Step up from the notebook's folder to its parent directory; later cells
# presumably rely on paths relative to it (the logs show config\config.yaml
# and artifacts/ being resolved relatively).
# NOTE: the path is relative, so re-running this cell climbs one more level
# each time -- run it exactly once per kernel session.
os.chdir("../")
# Display the resulting working directory as a sanity check.
%pwd

'c:\\Users\\abhis\\Desktop\\MLProjects\\Movie Recommender'

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ContentBasedModelConfig:
    """Immutable bundle of filesystem paths used by the content-based model stage."""
    # Directory for this stage; ConfigurationManager creates it before
    # building the config object.
    root_dir: Path
    # CSV file that ContentBasedModel reads with pandas.read_csv.
    movies_data: Path
    # Destination handed to scipy.sparse.save_npz for the similarity matrix.
    content_matrix: Path

In [3]:
from MovieRecommender.constants import *
from MovieRecommender.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_content_based_model_config(self) -> ContentBasedModelConfig:
        """
        Build the configuration for the content-based model stage.

        The stage's root directory is created before the config object is
        assembled.

        Returns:
            ContentBasedModelConfig: Paths taken from the ``content_based_model``
            section, plus the movies CSV path from the ``data_preparation``
            section.
        """
        stage = self.config.content_based_model
        create_directories([stage.root_dir])

        return ContentBasedModelConfig(
            root_dir=Path(stage.root_dir),
            movies_data=Path(self.config.data_preparation.movies_data_path),
            content_matrix=Path(stage.content_matrix),
        )

In [5]:
import pandas as pd
import json
import pickle
import numpy as np
from ast import literal_eval
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from MovieRecommender.constants import *
from MovieRecommender.logging import logger
from scipy.sparse import csr_matrix, save_npz

In [6]:
class ContentBasedModel:
    """
    Builds and saves a content-based cosine-similarity matrix for movies.

    Each movie is described by a "soup" string assembled from its top
    keywords, its director and its top genres. The soups are turned into
    token-count vectors and the pairwise cosine similarity between those
    vectors is written to disk as a sparse ``.npz`` file.
    """

    def __init__(self, config: "ContentBasedModelConfig"):
        """
        Args:
            config (ContentBasedModelConfig): Paths used by this stage;
                ``movies_data`` is read and ``content_matrix`` is written.
        """
        self.config = config

    def get_df(self, data_path):
        """
        Read a CSV file into a pandas DataFrame.

        Args:
            data_path (str): File path of the CSV file.

        Returns:
            pandas.DataFrame: DataFrame containing the data from the CSV file.
        """
        return pd.read_csv(data_path)

    def apply_literal_eval(self, data, features=('genres', 'keywords')):
        """
        Parse stringified Python literals in the given columns, in place.

        Only string values are parsed; anything else (for example lists left
        by an earlier call) is kept as is, so re-running the step is safe.

        Args:
            data (pandas.DataFrame): Frame to modify.
            features (Iterable[str]): Columns holding stringified literals.
        """
        for feature in features:
            data[feature] = data[feature].apply(
                lambda x: literal_eval(x) if isinstance(x, str) else x
            )

    def get_top_elements(self, data, features=('genres', 'keywords'), n=3):
        """
        Truncate each sequence in the given columns to its first ``n`` items, in place.

        Args:
            data (pandas.DataFrame): Frame to modify.
            features (Iterable[str]): Columns holding sequences.
            n (int): Number of leading elements to keep (default 3).
        """
        for feature in features:
            data[feature] = data[feature].apply(lambda x: x[:n])

    def clean_director(self, data):
        """
        Normalise the ``director`` column in place: lower-case, spaces -> ``_``.

        Non-string values (e.g. NaN for a missing director) become an empty
        string so they contribute nothing to the soup instead of raising.

        Args:
            data (pandas.DataFrame): Frame with a ``director`` column.
        """
        data['director'] = data['director'].apply(
            lambda x: x.lower().replace(' ', '_') if isinstance(x, str) else ''
        )

    def clean_top3(self, data, features=('genres', 'keywords')):
        """
        Lower-case every item of the list columns and replace spaces with ``_``.

        Joining words with ``_`` keeps multi-word items together as a single
        token in the space-separated soup.

        Args:
            data (pandas.DataFrame): Frame to modify in place.
            features (Iterable[str]): Columns holding lists of strings.
        """
        for feature in features:
            data[feature] = data[feature].apply(
                lambda x: [a.lower().replace(' ', '_') for a in x]
            )

    def create_soup_column(self, data):
        """
        Add a ``soup`` column: keywords, director and genres joined by spaces.

        Args:
            data (pandas.DataFrame): Frame with list-valued ``keywords`` and
                ``genres`` columns and a string ``director`` column.
        """
        keywords = data['keywords'].apply(' '.join)
        genres = data['genres'].apply(' '.join)
        data['soup'] = keywords + ' ' + data['director'] + ' ' + genres

    ## STEPS
    def load_data_from_db(self):
        """Load the movies CSV and keep only the columns used to build the soup."""
        movies = self.get_df(data_path=self.config.movies_data)
        # .copy() yields an independent frame, so the in-place column
        # assignments in process_data do not raise SettingWithCopyWarning.
        self.movies_df = movies[['genres', 'keywords', 'director']].copy()
        logger.info('Data loaded from DB')

    def process_data(self):
        """Run all cleaning steps on ``self.movies_df`` and build its ``soup`` column."""
        self.apply_literal_eval(self.movies_df)
        self.get_top_elements(self.movies_df)
        self.clean_director(self.movies_df)
        self.clean_top3(self.movies_df)
        self.create_soup_column(self.movies_df)
        logger.info('Data processed')

    def create_similarity_matrix(self):
        """
        Compute the sparse pairwise cosine similarity between movie soups.

        Returns:
            scipy.sparse matrix: Similarity between every pair of rows of
            ``self.movies_df``.
        """
        # Counts are produced directly as float32. float16 is not a dtype
        # scipy.sparse supports (recent SciPy releases reject it), and
        # scikit-learn's pairwise code promotes every non-float32 input to
        # float64, so a float16 cast brings no memory saving.
        cv = CountVectorizer(stop_words='english', dtype=np.float32)
        cv_matrix = cv.fit_transform(self.movies_df['soup'])
        cosine_sim = cosine_similarity(cv_matrix, dense_output=False)
        logger.info('Similarity matrix calculated')
        return cosine_sim

    def save_similarity_matrix(self):
        """Compute the similarity matrix and save it to ``config.content_matrix`` as ``.npz``."""
        cosine_sim = self.create_similarity_matrix()
        save_npz(self.config.content_matrix, cosine_sim)
        logger.info( f'Similarity matrix saved in {self.config.content_matrix}' )
        

In [7]:
# Run the content-based model stage end to end: build the configuration,
# load and clean the movie data, then compute and save the similarity matrix.
try:
    config = ConfigurationManager()
    content_based_model_config = config.get_content_based_model_config()
    content_based_model = ContentBasedModel(
                                config=content_based_model_config)
    content_based_model.load_data_from_db()
    content_based_model.process_data()
    content_based_model.save_similarity_matrix()  
except Exception:
    # Record the failure through the project logger, then re-raise with a
    # bare `raise` so the original traceback is preserved unchanged.
    logger.exception('Content-based model stage failed')
    raise

[2023-06-25 10:16:39,303: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-06-25 10:16:39,308: INFO: common: yaml file: params.yaml loaded successfully]
[2023-06-25 10:16:39,311: INFO: common: created directory at: artifacts]
[2023-06-25 10:16:39,313: INFO: common: created directory at: artifacts/content_based_model]
[2023-06-25 10:16:39,897: INFO: 3286974035: Data loaded from DB]
[2023-06-25 10:16:41,833: INFO: 3286974035: Data processed]
[2023-06-25 10:16:53,391: INFO: 3286974035: Similarity matrix calculated]
[2023-06-25 10:32:26,072: INFO: 3286974035: Similarity matrix saved in artifacts\content_based_model\content_matrix.npz]
