In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Read the raw movie table from disk
df = pd.read_csv("movies.csv")

# Step 1: Data Cleaning and Feature Engineering

# Pull the four-digit "(YYYY)" token out of each title into a numeric
# column, then store the title without that token and without padding.
raw_titles = df['title']
release_year = raw_titles.str.extract(r'\((\d{4})\)')
df['year'] = release_year.astype(float)
titles_without_year = raw_titles.str.replace(r'\(\d{4}\)', '', regex=True)
df['title'] = titles_without_year.str.strip()

# Expand the pipe-delimited 'genres' string into one indicator column per
# genre, and swap those indicators in for the original text column.
genres_split = df['genres'].str.get_dummies(sep='|')
df = pd.concat([df.drop(columns=['genres']), genres_split], axis=1)

# Encode movie IDs and titles
# Each column gets its own encoder: refitting a single shared LabelEncoder
# overwrites its classes_, which would make the movieId codes impossible to
# map back to the original IDs with inverse_transform.
movie_id_encoder = LabelEncoder()
df['movieId'] = movie_id_encoder.fit_transform(df['movieId'])

# `label_encoder` is the encoder fitted on titles, so any later use of that
# name (e.g. label_encoder.inverse_transform on title codes) keeps working.
label_encoder = LabelEncoder()
df['title'] = label_encoder.fit_transform(df['title'])

# Step 2: Feature Scaling
# Fit on the raw years. StandardScaler disregards NaN while fitting and
# passes it through on transform, whereas filling the gaps with 0 first
# would treat them as "year 0" and drag the fitted mean/std far away from
# the real release years.
scaler = StandardScaler()
year_scaled = scaler.fit_transform(df[['year']]).ravel()

# Impute only after scaling: 0 is the mean of the standardized column, so
# titles without a parsable year become neutral instead of extreme outliers,
# and the column still contains no NaN.
df['year_scaled'] = pd.Series(year_scaled, index=df.index).fillna(0.0)

# Step 3: Dimensionality Reduction using PCA
# Identifiers and both forms of the year are excluded from the PCA input.
# 'year_scaled' is the prediction target below, so letting it into the
# components would leak the target into feature_1 / feature_2.
non_feature_cols = ['title', 'movieId', 'year', 'year_scaled']
pca_data = df.drop(columns=non_feature_cols).fillna(0)  # ensure no NaN
pca = PCA(n_components=2)
# NOTE(review): the PCA is fitted on all rows before the train/test split;
# fit it on the training rows only if a strictly held-out test set is needed.
reduced_features = pca.fit_transform(pca_data)
# Reuse df's index so the column-wise concat aligns row for row.
df_pca = pd.DataFrame(
    reduced_features,
    columns=['feature_1', 'feature_2'],
    index=df.index,
)
df = pd.concat([df, df_pca], axis=1)

# Step 4: Data Splitting for Training and Testing
# X keeps the genre indicators and the two PCA components; the target
# column is dropped so that y never appears among the features.
X = df.drop(columns=non_feature_cols)
y = df['year_scaled']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the first rows of the processed frame
head_rows = df.head()
print("Processed Dataset:")
print(head_rows)

# Report the shapes of the four train/test partitions on one line
split_shapes = f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}"
print("\nTraining and Testing Split:")
print(split_shapes)

Processed Dataset:
   movieId  title    year  (no genres listed)  Action  Adventure  Animation  \
0        0   8634  1995.0                   0       0          1          1   
1        1   4513  1995.0                   0       0          1          0   
2        2   3560  1995.0                   0       0          0          0   
3        3   8978  1995.0                   0       0          0          0   
4        4   2893  1995.0                   0       0          0          0   

   Children  Comedy  Crime  ...  Musical  Mystery  Romance  Sci-Fi  Thriller  \
0         1       1      0  ...        0        0        0       0         0   
1         1       0      0  ...        0        0        0       0         0   
2         0       1      0  ...        0        0        1       0         0   
3         0       1      0  ...        0        0        1       0         0   
4         0       1      0  ...        0        0        0       0         0   

   War  Western  year_sca