In [59]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [61]:
df=pd.read_csv("Codecademy_Complaints_Dataset.csv")

In [63]:
df.shape

(5000, 3)

In [65]:
df.head()

Unnamed: 0,Complaint ID,Complaint Text,Category
0,1,Double payment processed for one subscription ...,Payment Issue
1,2,Course progress is not being saved properly on...,Course Access Issue
2,3,Payment failed but amount debited from my acco...,Payment Issue
3,4,Codecademy instructor provided outdated inform...,Instructor Issue
4,5,Course syllabus skipped key topics on Codecademy.,Misleading Course Description


In [67]:
df.duplicated().sum()

0

In [69]:
df["Complaint Text"].duplicated().sum()

4910

In [71]:
df["Complaint Text"].unique()

array(['Double payment processed for one subscription on Codecademy.',
       'Course progress is not being saved properly on Codecademy.',
       'Payment failed but amount debited from my account on Codecademy.',
       'Codecademy instructor provided outdated information.',
       'Course syllabus skipped key topics on Codecademy.',
       'Customer care at Codecademy could not verify my payment successfully.',
       'Codecademy app is not showing the purchased course.',
       'Course instructions on Codecademy are not clear.',
       'Instructor stopped updating the Codecademy course content.',
       'Instructor ignored multiple questions from learners on Codecademy.',
       'The Codecademy course content is outdated and not useful.',
       'The free course on Codecademy required payment midway.',
       'Notifications for project deadlines are not working on Codecademy.',
       'Instructor profile information on Codecademy was misleading.',
       'Support ticket was closed 

In [73]:
(df["Complaint Text"]=='Double payment processed for one subscription on Codecademy.').sum()

54

In [75]:
df[["Complaint Text"]].nunique()

Complaint Text    90
dtype: int64

In [77]:
# dataset contains lot of duplicate complaintes(4910) 

In [79]:
df["Category"].value_counts()

Category
Payment Issue                    580
Misleading Course Description    573
Certificate Issue                565
Customer Support Issue           564
Refund Issue                     564
Technical Glitch                 559
Course Access Issue              550
Instructor Issue                 525
Content Quality Issue            520
Name: count, dtype: int64

In [87]:
df.drop_duplicates(subset='Complaint Text', keep='first',inplace=True)

In [91]:
print("After removing duplicates:")
print(df.shape)

After removing duplicates:
(90, 3)


In [93]:
df[["Complaint Text"]].nunique()

Complaint Text    90
dtype: int64

<h3>Generating 5 realistic variations for each customer complaint.</h3>

In [95]:
import google.generativeai as genai


In [97]:

genai.configure(api_key="AIzaSyAWiXXPwjpe5s2JGsIqBbKVEQymcbh5txU") 

In [99]:
complaint_variation_function = {
        "name": "generate_complaint_variations",
        "description": "Generates 5 realistic and distinct variations of a given customer complaint.",
        "parameters": {
            "type": "OBJECT",
            "properties": {
                "variations": {
                    "type": "ARRAY",
                    "description": "A list of 5 unique variations of the original complaint.",
                    "items": {
                        "type": "STRING",
                        "description": "A single complaint variation."
                    }
                }
            },
            "required": ["variations"]
        }
    }

In [101]:
 model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        tools=[complaint_variation_function]
    )

In [103]:
chat = model.start_chat()

In [105]:
category='Payment Issue'

In [107]:
original_complaint="Double payment processed for one subscription on Codecademy."

In [109]:
prompt = (
                    f"Generate 5 realistic and distinct variations for the following customer complaint "
                    f"in the '{category}' category: \"{original_complaint}\""
                )

In [111]:
response = chat.send_message(
                    prompt,
                    # Force the model to call the function.
                    # 'any' requires the model to call a function. Since we only have one,
                    # it will always be called.
                    tool_config={'function_calling_config': 'any'}
                )

In [113]:
function_call = response.candidates[0].content.parts[0].function_call

In [115]:
function_call

name: "generate_complaint_variations"
args {
  fields {
    key: "variations"
    value {
      list_value {
        values {
          string_value: "I\'ve been charged twice for my Codecademy subscription this month."
        }
        values {
          string_value: "My Codecademy account shows two payments for the same subscription."
        }
        values {
          string_value: "There is a duplicate payment on my credit card statement for Codecademy."
        }
        values {
          string_value: "I was double-billed for my Codecademy subscription. Please refund one payment."
        }
        values {
          string_value: "I noticed two identical charges from Codecademy on my bank statement."
        }
      }
    }
  }
}

In [117]:
args = function_call.args
variations = args.get('variations', [])

In [119]:
variations

["I've been charged twice for my Codecademy subscription this month.", 'My Codecademy account shows two payments for the same subscription.', 'There is a duplicate payment on my credit card statement for Codecademy.', 'I was double-billed for my Codecademy subscription. Please refund one payment.', 'I noticed two identical charges from Codecademy on my bank statement.']

In [121]:
for i in variations:
    print(i)

I've been charged twice for my Codecademy subscription this month.
My Codecademy account shows two payments for the same subscription.
There is a duplicate payment on my credit card statement for Codecademy.
I was double-billed for my Codecademy subscription. Please refund one payment.
I noticed two identical charges from Codecademy on my bank statement.


<h3>creating new dataset</h3>

In [124]:
augmented_data = []

In [126]:
batch_size = 10
for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i + batch_size]

    prompt = "Generate 5 realistic and distinct variations for each of the following customer complaints.\n\n"
    for idx, row in batch.iterrows():
        prompt += f"{idx + 1}. Category: {row['Category']}\n"
        prompt += f"   Complaint: {row['Complaint Text']}\n\n"
    
    prompt += (
        "Return the output as a JSON list of objects with keys: "
        "`category`, `original_complaint`, `variations` (a list of 5 strings)."
    )

    try:
        response = chat.send_message(prompt)
        json_str = response.text.strip()
        json_data = eval(json_str)  # If Gemini returns valid Python-style list; else use json.loads()

        for item in json_data:
            category = item['category']
            original = item['original_complaint']
            variations = item['variations']

            # Add original
            augmented_data.append({
                "category": category,
                "complaint": original
            })

            # Add variations
            for v in variations:
                augmented_data.append({
                    "category": category,
                    "complaint": v
                })

    except Exception as e:
        print(f"Error processing batch {i}-{i+batch_size}")
        print(e)

 11%|█         | 1/9 [00:07<00:56,  7.02s/it]

Error processing batch 0-10
whichOneof


 22%|██▏       | 2/9 [00:14<00:49,  7.05s/it]

Error processing batch 10-20
whichOneof


 33%|███▎      | 3/9 [00:15<00:26,  4.49s/it]

Error processing batch 20-30
whichOneof


 44%|████▍     | 4/9 [00:16<00:16,  3.29s/it]

Error processing batch 30-40
whichOneof


 56%|█████▌    | 5/9 [00:18<00:10,  2.65s/it]

Error processing batch 40-50
whichOneof


 67%|██████▋   | 6/9 [00:19<00:06,  2.25s/it]

Error processing batch 50-60
whichOneof


 78%|███████▊  | 7/9 [00:21<00:03,  1.98s/it]

Error processing batch 60-70
whichOneof


 89%|████████▉ | 8/9 [00:22<00:01,  1.86s/it]

Error processing batch 70-80
whichOneof


100%|██████████| 9/9 [00:24<00:00,  2.72s/it]

Error processing batch 80-90
whichOneof





In [None]:
# Save final augmented dataset
augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('augmented_complaints_dataset.csv', index=False)

In [130]:
augmented_data

[]

In [141]:
df.drop("Complaint ID",inplace=True,axis=1)

In [150]:
df.head()

Unnamed: 0,Complaint Text,Category
0,Double payment processed for one subscription ...,Payment Issue
1,Course progress is not being saved properly on...,Course Access Issue
2,Payment failed but amount debited from my acco...,Payment Issue
3,Codecademy instructor provided outdated inform...,Instructor Issue
4,Course syllabus skipped key topics on Codecademy.,Misleading Course Description


In [152]:
df.to_csv("unique_vlaue_dataset.csv",index=False)

In [154]:
df.shape

(90, 2)