<a href="https://colab.research.google.com/github/ashutosh-linux/aashu/blob/main/GEN_AI_COURSEPROJECT__36.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Conv2DTranspose, BatchNormalization, ReLU, Reshape, Dropout # Import Dropout here
from tensorflow.keras.models import Model
import numpy as np
import pandas as pd
import cv2
import librosa
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, mean_squared_error, f1_score
from tensorflow.keras.optimizers import Adam # Import Adam

# Shared imports used by the cells below.

In [2]:

# 🔹 Read the movie metadata CSV from Google Drive (adjust the path for your environment)
csv_path = "/content/drive/MyDrive/HACKATHON GEN AI/movies_youtube_sentiments.csv"
df = pd.read_csv(csv_path)

# 🔹 Replace every missing value with the literal string "Unknown"
df.fillna("Unknown", inplace=True)

# 🔹 Build one space-separated description per row from the descriptive columns;
#    astype(str) makes sure ' '.join only ever sees strings.
text_columns = ['name', 'genre', 'director', 'writer', 'star']
df['combined_text'] = df[text_columns].astype(str).agg(' '.join, axis=1)



  df.fillna("Unknown", inplace=True)


In [3]:
def preprocess_text(texts, vocab_size=5000, max_length=50, tokenizer=None):
    """Tokenize and pad text input.

    Args:
        texts: Iterable of strings to encode.
        vocab_size: Passed to ``Tokenizer(num_words=...)`` when a new tokenizer
            is created. Ignored when ``tokenizer`` is supplied.
        max_length: Length every sequence is padded (with trailing zeros) or
            truncated to.
        tokenizer: Optional, already-fitted tokenizer. When given it is reused
            as is (NOT refit), so new text is encoded with the same word
            indices the model was trained on. When ``None`` a new tokenizer is
            created and fitted on ``texts``.

    Returns:
        Tuple ``(padded_sequences, tokenizer)``: the padded index matrix and
        the tokenizer that produced it.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences, tokenizer

In [4]:
def extract_video_features(video_path, frame_rate=1):
    """Extract frames from a video, sampled at roughly ``frame_rate`` frames per second.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_rate: Desired number of kept frames per second of video. The
            sampling step is derived from the FPS the capture reports. If the
            FPS is unavailable or ``frame_rate`` is not positive, every decoded
            frame is kept.

    Returns:
        ``np.ndarray`` of the kept frames, each resized to 64x64. When nothing
        could be read, an empty array of shape ``(0, 64, 64, 3)`` is returned
        so downstream shape checks still work.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        native_fps = cap.get(cv2.CAP_PROP_FPS)
        # `native_fps > 0` is False for both 0 and NaN, which some containers report.
        if frame_rate and frame_rate > 0 and native_fps and native_fps > 0:
            step = max(1, int(round(native_fps / frame_rate)))
        else:
            step = 1

        index = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if index % step == 0:
                frames.append(cv2.resize(frame, (64, 64)))  # Resize for model input
            index += 1
    finally:
        # Always free the decoder handle, even if resize/read raised.
        cap.release()

    if not frames:
        return np.empty((0, 64, 64, 3), dtype=np.uint8)
    return np.array(frames)



In [5]:
def extract_audio_features(audio_path, sr=22050, n_mfcc=13):
    """Summarise an audio file as a single vector of averaged MFCCs.

    The file is loaded with librosa at sample rate ``sr``, ``n_mfcc``
    coefficients are computed, and the result is averaged over axis 1.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=sr)
    coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)
    # Averaging over axis 1 leaves one value per coefficient row.
    return coefficients.mean(axis=1)



In [6]:
def build_text_encoder(vocab_size, embedding_dim=128, lstm_units=256):
    """Build the text encoder: word indices -> Embedding -> LSTM.

    The model accepts variable-length sequences of word indices and returns
    the output of an LSTM with ``lstm_units`` units.
    """
    token_ids = Input(shape=(None,))  # variable-length sequence of word indices
    embedded_tokens = Embedding(vocab_size, embedding_dim)(token_ids)
    sentence_vector = LSTM(lstm_units)(embedded_tokens)
    return Model(token_ids, sentence_vector, name="Text_Encoder")



In [7]:
def build_video_generator(latent_dim=256):
    """Build the generator that maps a latent vector to one 64x64x3 image.

    A Dense layer is reshaped to an 8x8x512 feature map, three stride-2
    transposed convolutions (each followed by BatchNorm + ReLU) upsample it
    8 -> 16 -> 32 -> 64, and a final stride-1 transposed convolution with
    ``tanh`` produces the 3 output channels.
    """
    latent_vector = Input(shape=(latent_dim,))
    feature_map = Dense(8*8*512, activation='relu')(latent_vector)
    feature_map = Reshape((8, 8, 512))(feature_map)

    # Each pass doubles the spatial size: (16, 16, 256) -> (32, 32, 128) -> (64, 64, 64)
    for filters in (256, 128, 64):
        feature_map = Conv2DTranspose(filters, kernel_size=3, strides=2, padding='same')(feature_map)
        feature_map = BatchNormalization()(feature_map)
        feature_map = ReLU()(feature_map)

    # strides=1 keeps the 64x64 resolution and only reduces the channels to 3
    frame = Conv2DTranspose(3, kernel_size=3, strides=1, padding='same', activation='tanh')(feature_map)
    return Model(latent_vector, frame, name="Video_Generator")



In [8]:
def build_movie_generation_model(vocab_size, latent_dim=256):
    """Complete model: text encoder followed by the video generator.

    Args:
        vocab_size: Vocabulary size forwarded to ``build_text_encoder``.
        latent_dim: Size of the vector passed from encoder to generator. It is
            used both as the encoder's LSTM width and as the generator's input
            size, so the two sub-models always agree.

    Returns:
        A Keras ``Model`` mapping a sequence of word indices to the generator
        output, named ``"Movie_Generation_Model"``.
    """
    # The encoder output feeds the generator directly, so its width must equal
    # latent_dim (previously it silently stayed at the encoder default of 256).
    text_encoder = build_text_encoder(vocab_size, lstm_units=latent_dim)
    video_generator = build_video_generator(latent_dim)

    user_input = Input(shape=(None,))
    text_features = text_encoder(user_input)
    generated_video = video_generator(text_features)

    return Model(user_input, generated_video, name="Movie_Generation_Model")

In [9]:
# 🔹 Define Model Parameters
vocab_size = 5000  # Adjust based on dataset
latent_dim = 256
movie_model = build_movie_generation_model(vocab_size, latent_dim)
movie_model.summary()

# 🔹 Preprocess dataset with updated 'combined_text'
# vocab_size is passed explicitly so the tokenizer and the model's Embedding
# layer can never drift apart if the constant above is changed.
processed_text, tokenizer = preprocess_text(df['combined_text'], vocab_size=vocab_size)
df['processed_text'] = processed_text.tolist()  # Convert 2D array to a list of lists

print("✅ Data preprocessing complete. Ready for training!")

# 🔹 Example: Generate fake labels for training (Modify based on real data)
# Use the padded matrix directly instead of round-tripping through the DataFrame.
X_train = np.asarray(processed_text)
# Seeded generator -> the dummy targets are identical on every run; float32
# halves the memory of the (N, 64, 64, 3) placeholder array.
rng = np.random.default_rng(42)
y_train = rng.random((len(X_train), 64, 64, 3), dtype=np.float32)  # Dummy video output data

# 🔹 Train the Model
movie_model.compile(optimizer='adam', loss='mse')
movie_model.fit(X_train, y_train, epochs=5, batch_size=16)

print("✅ Model training complete!")

# 🔹 Evaluation: Compute Metrics
# NOTE(review): this evaluates on the training inputs; there is no held-out split.
y_pred = movie_model.predict(X_train)

# Flatten for comparison
y_train_flat = y_train.flatten()
y_pred_flat = y_pred.flatten()

# Accuracy / precision / F1 are classification metrics: they are computed on
# pixel values rounded to integers and are only a coarse sanity check for this
# regression task. MSE is the metric that matches the training loss.
y_train_rounded = y_train_flat.round()
y_pred_rounded = y_pred_flat.round()

accuracy = accuracy_score(y_train_rounded, y_pred_rounded)
precision = precision_score(y_train_rounded, y_pred_rounded, average='macro')
mse = mean_squared_error(y_train_flat, y_pred_flat)
f1 = f1_score(y_train_rounded, y_pred_rounded, average='macro')

print(f"🔹 Accuracy: {accuracy}")
print(f"🔹 Precision: {precision}")
print(f"🔹 MSE: {mse}")
print(f"🔹 F1 Score: {f1}")

print("🎬 AI Movie Trailer Generation Pipeline Complete! 🚀")


✅ Data preprocessing complete. Ready for training!
Epoch 1/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 1s/step - loss: 0.2700
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 952ms/step - loss: 0.0873
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 955ms/step - loss: 0.0854
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 980ms/step - loss: 0.0847
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 983ms/step - loss: 0.0844
✅ Model training complete!
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 500ms/step
🔹 Accuracy: 0.5002381015507165
🔹 Precision: 0.5006407041904144
🔹 MSE: 0.08489458786530207
🔹 F1 Score: 0.4353056468473031
🎬 AI Movie Trailer Generation Pipeline Complete! 🚀


In [29]:
import os
import cv2
import numpy as np

def load_video_frames(video_path, frame_size=(64, 64), max_frames=100):
    """Load up to ``max_frames`` image frames from a directory.

    Args:
        video_path: Directory containing the frame images of one video.
        frame_size: ``(width, height)`` passed to ``cv2.resize``.
        max_frames: Maximum number of frames to load.

    Returns:
        ``float32`` array of shape ``(n_frames, height, width, 3)`` with pixel
        values divided by 255. If no frame could be loaded, a warning is
        printed and a single all-zero placeholder frame of the same layout is
        returned.
    """
    image_extensions = (".jpg", ".png")
    frames = []
    # os.listdir returns entries in arbitrary order; sort by file name so the
    # frames keep a deterministic sequence.
    for filename in sorted(os.listdir(video_path)):
        if len(frames) >= max_frames:
            break
        # Case-insensitive so files such as "0001.JPG" are not skipped.
        if not filename.lower().endswith(image_extensions):
            continue
        frame = cv2.imread(os.path.join(video_path, filename))
        if frame is None:  # unreadable or corrupt image
            continue
        frame = cv2.resize(frame, frame_size)
        # float32 keeps the dtype consistent with the placeholder below.
        frames.append(frame.astype(np.float32) / 255.0)  # Normalize

    # Handle empty frames or error
    if not frames:
        print(f"Warning: Could not load frames from {video_path}. Returning placeholder.")
        # cv2.resize takes (width, height) but arrays are (height, width, channels).
        return np.zeros((1, frame_size[1], frame_size[0], 3), dtype=np.float32)  # Return placeholder
    return np.stack(frames)

# Assumes the TRAIN folder contains one sub-folder of frame images per movie
y_train = []
video_frames_dir = "/content/drive/MyDrive/HACKATHON GEN AI/TRAIN"
# Sorted so the sample order is the same on every run (os.listdir order is arbitrary).
for movie_folder in sorted(os.listdir(video_frames_dir)):
    movie_path = os.path.join(video_frames_dir, movie_folder)
    if not os.path.isdir(movie_path):
        continue  # stray files would make load_video_frames fail in os.listdir
    frames = load_video_frames(movie_path)
    if frames.shape[0] > 0:  # Ensure there are frames loaded
        y_train.append(frames)  # Add frames for this movie

# NOTE(review): np.array needs every movie to contribute the same number of
# frames; folders with differing frame counts give a ragged result — confirm
# the data layout. Also verify that this order matches the rows of X_train.
y_train = np.array(y_train)  # Convert to NumPy array

# Check the shape of y_train
print("Shape of y_train:", y_train.shape)

Shape of y_train: (1500, 1, 64, 64, 3)


In [18]:
!pip install lpips # Install the missing 'lpips' package

Collecting lpips
  Downloading lpips-0.1.4-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=0.4.0->lpips)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=0.4.0->lpips)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=0.4.0->lpips)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=0.4.0->lpips)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=0.4.0->lpips)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=0.4.0->lpips)
  Downloading nvidia_cufft

In [19]:
import tensorflow as tf  # used by perceptual_loss; not imported by the cells visible above
import lpips  # Learned Perceptual Image Patch Similarity (PyTorch implementation)
from skimage.metrics import structural_similarity as ssim  # NumPy-based SSIM, for offline evaluation

# LPIPS model with the 'alex' backbone. This is a PyTorch module, so it cannot
# be called on TensorFlow tensors inside a Keras loss; keep it for evaluating
# saved frames outside of training.
lpips_loss = lpips.LPIPS(net='alex')

def perceptual_loss(y_true, y_pred):
    """Keras-compatible loss combining pixel-wise MSE and structural dissimilarity.

    Args:
        y_true: Batch of target images.
        y_pred: Batch of generated images with the same shape.

    Returns:
        Scalar tensor ``MSE + (1 - mean SSIM)``.

    The LPIPS term is intentionally not part of this loss: ``lpips_loss`` is a
    PyTorch model and TensorFlow cannot trace or differentiate through it.
    """
    y_true = tf.cast(y_true, y_pred.dtype)  # targets may arrive as float64
    mse_loss = tf.reduce_mean(tf.square(y_true - y_pred))
    # max_val=1.0 declares a dynamic range of 1.0.
    # NOTE(review): the generators end in tanh ([-1, 1]) — confirm targets and
    # outputs are both scaled to [0, 1] before relying on the SSIM term.
    ssim_loss = 1.0 - tf.reduce_mean(tf.image.ssim(y_true, y_pred, max_val=1.0))
    return mse_loss + ssim_loss

# Compile with improved loss
movie_model.compile(optimizer='adam', loss=perceptual_loss)

Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]


Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:01<00:00, 142MB/s]


Loading model from: /usr/local/lib/python3.11/dist-packages/lpips/weights/v0.1/alex.pth


  self.load_state_dict(torch.load(model_path, map_location='cpu'), strict=False)


In [21]:
from tensorflow.keras.layers import BatchNormalization, Dropout

def build_video_generator(latent_dim=256):
    """Build the generator variant with BatchNorm after the Dense layer and Dropout.

    Note: this notebook defines ``build_video_generator`` in an earlier cell as
    well; whichever cell was executed last is the one in effect.

    A Dense layer (BatchNorm, reshaped to 8x8x512) is followed by three
    stride-2 transposed convolutions with ReLU and BatchNorm; the first two
    are also followed by Dropout(0.3). A final stride-1 transposed convolution
    with ``tanh`` yields 3 channels.
    """
    latent_vector = Input(shape=(latent_dim,))
    feature_map = Dense(8*8*512, activation='relu')(latent_vector)
    feature_map = BatchNormalization()(feature_map)
    feature_map = Reshape((8, 8, 512))(feature_map)

    # (filters, dropout rate or None) for each upsampling stage
    upsampling_stages = ((256, 0.3), (128, 0.3), (64, None))
    for filters, dropout_rate in upsampling_stages:
        feature_map = Conv2DTranspose(filters, kernel_size=3, strides=2, padding='same', activation='relu')(feature_map)
        feature_map = BatchNormalization()(feature_map)
        if dropout_rate is not None:
            feature_map = Dropout(dropout_rate)(feature_map)

    frame = Conv2DTranspose(3, kernel_size=3, strides=1, padding='same', activation='tanh')(feature_map)
    return Model(latent_vector, frame, name="Video_Generator")


In [24]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    rotation_range=10,
    zoom_range=0.2,
    horizontal_flip=True
)

# Augment frames
# random_transform works on ONE (height, width, channels) image at a time.
# y_train may carry extra leading axes (e.g. movie x frame), so flatten those
# into a plain list of images, augment each, then restore the original shape.
y_train_frames = np.asarray(y_train)
flat_frames = y_train_frames.reshape((-1,) + y_train_frames.shape[-3:])
y_train_augmented = np.array(
    [datagen.random_transform(frame) for frame in flat_frames]
).reshape(y_train_frames.shape)


In [20]:
def build_optimized_video_generator(latent_dim=256):
    """Build the generator with Dropout(0.3) after the first two upsampling stages.

    A Dense layer is reshaped to 8x8x512, three stride-2 transposed
    convolutions (each followed by BatchNorm + ReLU) upsample it, and a final
    stride-1 transposed convolution with ``tanh`` outputs 3 channels.
    """
    latent_vector = Input(shape=(latent_dim,))
    feature_map = Dense(8*8*512, activation='relu')(latent_vector)
    feature_map = Reshape((8, 8, 512))(feature_map)

    # (filters, apply dropout afterwards?) for each upsampling stage
    for filters, with_dropout in ((256, True), (128, True), (64, False)):
        feature_map = Conv2DTranspose(filters, kernel_size=3, strides=2, padding='same')(feature_map)
        feature_map = BatchNormalization()(feature_map)
        feature_map = ReLU()(feature_map)
        if with_dropout:
            feature_map = Dropout(0.3)(feature_map)  # Prevent Overfitting

    # strides=1 keeps the spatial size so the output matches the (64, 64, 3) targets
    frame = Conv2DTranspose(3, kernel_size=3, strides=1, padding='same', activation='tanh')(feature_map)
    return Model(latent_vector, frame, name="Optimized_Video_Generator")


In [13]:
# 🔹 Update Movie Generation Model
def build_optimized_movie_model(vocab_size, latent_dim=256):
    """Text encoder followed by the optimized video generator.

    Args:
        vocab_size: Vocabulary size forwarded to ``build_text_encoder``.
        latent_dim: Size of the vector passed from encoder to generator; used
            for both the encoder's LSTM width and the generator's input size.

    Returns:
        A Keras ``Model`` named ``"Optimized_Movie_Generation_Model"``.
    """
    # Keep the encoder output width equal to the generator's input width;
    # otherwise any latent_dim other than the encoder default (256) would fail.
    text_encoder = build_text_encoder(vocab_size, lstm_units=latent_dim)
    video_generator = build_optimized_video_generator(latent_dim)

    user_input = Input(shape=(None,))
    text_features = text_encoder(user_input)
    generated_video = video_generator(text_features)

    return Model(user_input, generated_video, name="Optimized_Movie_Generation_Model")

# 🔹 Build, compile and fit the optimized model
optimized_movie_model = build_optimized_movie_model(vocab_size)

# Adam with a small learning rate and gradient clipping (clipnorm=1.0)
clipped_adam = Adam(learning_rate=0.0001, clipnorm=1.0)
optimized_movie_model.compile(optimizer=clipped_adam, loss='mse')
optimized_movie_model.fit(X_train, y_train, epochs=10, batch_size=16)

print("✅ Model Optimization Complete!")


Epoch 1/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 1s/step - loss: 0.3583
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 1s/step - loss: 0.1007
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1s/step - loss: 0.0948
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1s/step - loss: 0.0915
Epoch 5/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - loss: 0.0894
Epoch 6/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - loss: 0.0884
Epoch 7/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - loss: 0.0877
Epoch 8/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 1s/step - loss: 0.0872
Epoch 9/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 1s/step - loss: 0.0869
Epoch 10/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 1s/step - loss: 0.0865
✅ Model O

In [32]:
def generate_movie_trailer(user_input_text):
    """Generate frames for a text prompt and save them as ``generated_trailer.mp4``.

    Args:
        user_input_text: Free-text movie description.

    Returns:
        The file name of the written video (``"generated_trailer.mp4"``).

    Raises:
        RuntimeError: If the video writer could not be opened.

    Relies on the notebook-level ``tokenizer`` (fitted on the training text)
    and ``optimized_movie_model``.
    """
    # 🔹 Convert user input into model-friendly format
    # Encode with the tokenizer fitted on the training corpus. Fitting a new
    # tokenizer on this single sentence would assign word indices unrelated to
    # the ones the model was trained on.
    max_length = 50  # same padded length preprocess_text uses by default
    user_sequence = pad_sequences(
        tokenizer.texts_to_sequences([user_input_text]),
        maxlen=max_length,
        padding='post',
    )

    # 🔹 Generate video using AI model
    generated_video = optimized_movie_model.predict(user_sequence)

    # Collapse any leading axes so we always iterate over whole
    # (height, width, channels) frames, never over the rows of one image.
    frames = np.asarray(generated_video)
    frames = frames.reshape((-1,) + frames.shape[-3:])
    height, width = frames.shape[1:3]

    # 🔹 Save the generated video
    video_filename = "generated_trailer.mp4"
    out = cv2.VideoWriter(video_filename, cv2.VideoWriter_fourcc(*'mp4v'), 10, (width, height))
    try:
        if not out.isOpened():
            raise RuntimeError(f"Could not open video writer for {video_filename}")
        for frame in frames:
            # The generator ends in tanh, so values can be negative; clip to
            # [0, 1] before scaling so the uint8 cast cannot wrap around.
            frame = (np.clip(frame, 0.0, 1.0) * 255).astype(np.uint8)  # Convert to valid image format
            out.write(frame)
    finally:
        out.release()

    print(f"🎬 Trailer Generated! Saved as {video_filename}")
    return video_filename

# 🔹 Demo: generate a trailer from a sample description
user_input = "A sci-fi thriller about an AI revolution in 2099."
generate_movie_trailer(user_input_text=user_input)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step
🎬 Trailer Generated! Saved as generated_trailer.mp4


In [33]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/generate_trailer', methods=['POST'])
def generate_trailer():
    """POST endpoint: generate a trailer from a JSON body ``{"description": "..."}``.

    Returns:
        400 with an ``error`` message when the body is not a JSON object or
        the description is missing, empty or not a string; otherwise a JSON
        message containing the output file name.
    """
    # silent=True returns None instead of raising on a missing or invalid JSON
    # body, so malformed requests get the 400 below rather than a server error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    user_input = data.get("description", "")

    if not isinstance(user_input, str) or not user_input.strip():
        return jsonify({"error": "Please provide a movie description"}), 400

    # NOTE(review): every request writes the same output file, so concurrent
    # requests would overwrite each other's result.
    generate_movie_trailer(user_input)

    return jsonify({"message": "Trailer generated!", "file": "generated_trailer.mp4"})

if __name__ == '__main__':
    # NOTE(review): host='0.0.0.0' exposes this unauthenticated endpoint on
    # every network interface — confirm that is intended outside a sandbox.
    app.run(host='0.0.0.0', port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
