# Basic GAN for Generating Synthetic Customer Feedback Data

This notebook implements a basic Generative Adversarial Network (GAN) to generate synthetic customer feedback data for analysis. The dataset `balanced_df.csv` contains columns like `reviewer_id`, `store_location`, `latitude`, `longitude`, `date`, `month`, `year`, `title`, `review`, and `review-label`. The GAN focuses on generating synthetic `review-label` values for simplicity.

In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam

# Load the dataset
data = pd.read_csv('balanced_df.csv')

# Focus on relevant columns
reviews = data['review'].astype(str)  # Convert to string to avoid errors
labels = data['review-label']

# Handle missing values
reviews = reviews.fillna('')  # Fill NaNs with an empty string

# Encode labels
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

# Calculate review lengths
review_lengths = reviews.apply(len).values  # A basic feature for representation

# Define the data to generate (reshape if necessary)
input_data = np.column_stack((encoded_labels, review_lengths))
input_data = input_data.reshape(-1, 2)  # Shape: (num_samples, 2)

## Build the GAN

In [11]:
# Adjust the Generator model to accept two features
def build_generator():
    model = Sequential()
    model.add(Dense(128, input_dim=100))  # Noise input
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(2, activation='tanh'))  # Output two values: encoded label and review length
    return model

# Adjust the Discriminator model to accept two features
def build_discriminator():
    model = Sequential()
    model.add(Dense(512, input_dim=2))  # Input is two features
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))  # Output is a binary classification (real/fake)
    return model

## Compile the GAN

In [12]:
# Compile GAN
def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    return model

# Initialize and compile
generator = build_generator()
discriminator = build_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy'])
gan = build_gan(generator, discriminator)
gan.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Train the GAN

In [13]:
# Train GAN function with adjusted batch size
def train_gan(epochs, batch_size):
    for epoch in range(epochs):
        idx = np.random.randint(0, input_data.shape[0], batch_size)
        real_data = input_data[idx]

        noise = np.random.normal(0, 1, (batch_size, 100))
        generated_data = generator.predict(noise)

        real_labels = np.ones((batch_size, 1))
        fake_labels = np.zeros((batch_size, 1))

        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        noise = np.random.normal(0, 1, (batch_size, 100))
        valid_labels = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, valid_labels)

        if epoch % 100 == 0:
            print(f"{epoch} [D loss: {d_loss[0]} | D accuracy: {100 * d_loss[1]}] [G loss: {g_loss}]")

# Start training
train_gan(epochs=200, batch_size=8)  # Reduced epochs for quicker testing

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step




0 [D loss: 0.4042819142341614 | D accuracy: 81.25] [G loss: [array(0.5019628, dtype=float32), array(0.5019628, dtype=float32), array(0.625, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

## Generate Synthetic Feedback

In [15]:
# Generate synthetic customer feedback data
def generate_synthetic_data(num_samples):
    noise = np.random.normal(0, 1, (num_samples, 100))  # Generate random noise
    generated_data = generator.predict(noise)  # Use generator to create synthetic data

    # Post-process the generated data
    generated_labels = np.clip(np.round(generated_data[:, 0] * (encoder.classes_.size - 1)).astype(int), 0, encoder.classes_.size - 1)  # Map back to label space, clip to valid range
    generated_lengths = np.clip(np.round(generated_data[:, 1] * max(review_lengths)).astype(int), 1, max(review_lengths))  # Clip lengths to valid range

    # Inverse transform the labels to original label space
    real_labels = encoder.inverse_transform(generated_labels)

    return real_labels, generated_lengths

# Generate 10 synthetic samples (adjust as needed)
synthetic_labels, synthetic_lengths = generate_synthetic_data(10)

# Now, display the data in a format similar to your CSV file
import random

def generate_dummy_row(i, label, length):
    return {
        'reviewer_id': random.randint(10000, 99999),  # Random reviewer_id
        'store_location': random.choice(['US', 'CA', 'UK']),  # Random store location
        'latitude': round(random.uniform(-90.0, 90.0), 6),  # Random latitude
        'longitude': round(random.uniform(-180.0, 180.0), 6),  # Random longitude
        'date': f"2023-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}",  # Random date
        'month': random.randint(1, 12),  # Random month
        'year': random.randint(2021, 2023),  # Random year
        'title': f"Generated Review Title {i+1}",  # Placeholder title
        'review': f"This is a synthetic review of approximately {length} characters.",  # Placeholder review text
        'review-label': label  # Synthetic review label
    }

# Create a list of rows with the generated synthetic data
synthetic_data = [generate_dummy_row(i, synthetic_labels[i], synthetic_lengths[i]) for i in range(10)]

# Display the data as a DataFrame to make it look like the original CSV structure
synthetic_df = pd.DataFrame(synthetic_data)

# Show the synthetic data
synthetic_df


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


Unnamed: 0,reviewer_id,store_location,latitude,longitude,date,month,year,title,review,review-label
0,49725,UK,-43.272991,30.254786,2023-06-07,12,2022,Generated Review Title 1,This is a synthetic review of approximately 52...,1
1,79581,UK,55.473214,-143.322155,2023-08-06,11,2023,Generated Review Title 2,This is a synthetic review of approximately 52...,1
2,11330,UK,-47.154838,-77.224144,2023-05-27,12,2022,Generated Review Title 3,This is a synthetic review of approximately 52...,1
3,71008,US,-71.892272,-84.702139,2023-05-05,12,2021,Generated Review Title 4,This is a synthetic review of approximately 52...,1
4,66262,US,84.74084,9.564867,2023-11-12,8,2021,Generated Review Title 5,This is a synthetic review of approximately 52...,1
5,16076,US,10.698682,35.369985,2023-06-10,10,2021,Generated Review Title 6,This is a synthetic review of approximately 52...,1
6,64998,US,61.468941,-51.493211,2023-08-09,4,2023,Generated Review Title 7,This is a synthetic review of approximately 52...,1
7,74134,CA,-59.515921,-55.909587,2023-04-13,10,2023,Generated Review Title 8,This is a synthetic review of approximately 52...,1
8,78909,UK,83.137867,27.666779,2023-12-04,3,2021,Generated Review Title 9,This is a synthetic review of approximately 52...,1
9,65407,US,-70.702279,68.254637,2023-02-23,4,2021,Generated Review Title 10,This is a synthetic review of approximately 52...,1
