In [2]:
# Colab-only: mount Google Drive at /content/drive so the CSV files stored
# under MyDrive can be read later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing necessary libraries
import ast
import json
import re

import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Download the necessary NLTK data:
# the stop-word corpus and WordNet (used by WordNetLemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')

# Folder on the mounted Drive that holds the input CSVs. Kept in one place so
# the notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = '/content/drive/MyDrive/PBL6/data_processing/Data/data_1/content_based_filtering'

# Read dataset
c_df = pd.read_csv(f'{DATA_DIR}/credits_1.csv')
m_df = pd.read_csv(f'{DATA_DIR}/movies_1.csv')

In [None]:
# Columns to discard from the movie frame
columns_to_drop = [
    'budget', 'homepage', 'original_language', 'popularity',
    'production_companies', 'production_countries', 'release_date',
    'revenue', 'runtime', 'spoken_languages', 'status',
]

# Drop whichever of those columns exist; labels that are absent are skipped
m_df = m_df.drop(columns=columns_to_drop, errors='ignore')

# Assign the working column names to the credits frame (positional rename)
c_df.columns = ['id', 'tittle', 'cast', 'crew']

# Join the credits columns onto the movie rows, matching on the shared 'id'
m_df = pd.merge(m_df, c_df, on='id')

In [None]:
# NLTK resources shared by every preprocess_text call. They are created on
# first use instead of once per call: re-reading the stop-word corpus and
# re-instantiating the lemmatizer for each row is wasted work when the
# function is applied to a whole DataFrame column.
_STOP_WORDS = None
_LEMMATIZER = None


def _nltk_resources():
    """Return the shared (stop-word set, lemmatizer) pair, creating it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


# Text preprocessing function
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmatized tokens.

    Steps: replace non-word characters with spaces, lowercase, split on
    whitespace, drop English stop words, lemmatize each remaining token.

    Args:
        text: The raw text. Non-string values (e.g. None or a NaN coming from
            a missing DataFrame cell) are treated as empty text.

    Returns:
        str: The processed tokens joined by single spaces; '' when the input
        is not a string or contains no tokens.
    """
    # Missing cells reach this function as None/NaN; they carry no text
    if not isinstance(text, str):
        return ''

    # Replace every non-word character with a space and lowercase the result.
    # Note that \W keeps digits and underscores, so a token like "3d" survives.
    text = re.sub(r'\W', ' ', text).lower().strip()

    words = text.split()
    if not words:
        # Nothing to filter or lemmatize; skip loading the NLTK resources
        return ''

    stop_words, lemmatizer = _nltk_resources()

    # Drop stop words, then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Return the processed text as a single string
    return ' '.join(words)

# Function to convert genres JSON string to readable text
def genres_to_text(genres_json):
    """Convert a serialized list of ``{"name": ...}`` objects to plain text.

    The notebook applies this to both the 'genres' and the 'keywords' columns.

    Parsing is attempted in this order, stopping at the first success:
      1. strict JSON, exactly as given (so apostrophes inside values such as
         "women's prison" are preserved instead of breaking the document);
      2. a Python literal (single-quoted list-of-dicts repr);
      3. JSON after replacing single quotes with double quotes (the legacy
         behaviour, kept so every input that used to parse still parses).

    Args:
        genres_json: The serialized list. Non-string values (None, NaN from a
            missing cell) are treated as "no data".

    Returns:
        str: The "name" values joined by ", ". An empty string is returned
        when the input is not a string, cannot be parsed, or is not a list;
        entries that are not dicts with a string "name" are skipped.
    """
    # Missing cells reach this function as None/NaN, which have no .replace()
    if not isinstance(genres_json, str):
        return ""

    parsers = (
        json.loads,
        ast.literal_eval,
        lambda raw: json.loads(raw.replace("'", '"')),
    )
    for parse in parsers:
        try:
            data = parse(genres_json)
            break
        except (ValueError, SyntaxError):
            # json.JSONDecodeError is a ValueError; literal_eval raises
            # ValueError or SyntaxError on input it cannot evaluate
            continue
    else:
        # No parser accepted the input
        return ""

    if not isinstance(data, list):
        return ""

    # Extract and return the names as a comma-separated string
    return ", ".join(
        entry["name"]
        for entry in data
        if isinstance(entry, dict) and isinstance(entry.get("name"), str)
    )

In [None]:
# 'genres' and 'keywords' each go through the same two steps: genres_to_text
# turns the serialized list into a comma-separated string of names, then
# preprocess_text normalises that string
m_df['genres'] = m_df['genres'].apply(genres_to_text).apply(preprocess_text)
m_df['keywords'] = m_df['keywords'].apply(genres_to_text).apply(preprocess_text)

# Movies without an overview get an empty string so the concatenation below
# does not produce NaN
m_df['overview'] = m_df['overview'].fillna('')

# Build the 'combined' column: overview, genres and keywords separated by spaces
m_df['combined'] = (
    m_df['overview']
    + ' ' + m_df['genres']
    + ' ' + m_df['keywords']
)

# Show the first 5 rows as the cell output
m_df.head(5)

In [None]:
# Write the processed frame to a UTF-8 CSV (path is relative to the current
# working directory), leaving out the DataFrame index
m_df.to_csv(
    "processed_data.csv",
    encoding='utf-8',
    index=False,
)