In [1]:
# Installations
# pip install -q -U google-generativeai
# pip install pandas
# pip install seaborn
# pip install nltk

In [1]:
# Importing Dependencies
import google.generativeai as genai
import json
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# User input: Getting table description
# table_description = input('Table/Data description')
table_description = 'Data consists of customer level info about different Amazon products along with their customer reviews and ratings.'

In [3]:
# Fetching credentials
f = open('credentials.json', 'r')
creds = json.load(f)
gemini_token = creds['gemini_api']

# Reading data
urls = ['https://drive.google.com/file/d/1Fl8kBzjXvDTcbr0DaUEjdf4lqUfRnK2Z/view?usp=sharing', 'https://drive.google.com/file/d/19jHTwx1iaN5MUftPCltXnna9cJI8C3uU/view?usp=sharing',
        'https://drive.google.com/file/d/1_LWBLQX0EbR1i9Xgbe8RlTRR5Bk6DgtN/view?usp=sharing']

df = pd.DataFrame()
for url in urls[:1]:
    url='https://drive.google.com/uc?id=' + url.split('/')[-2]
    df = pd.concat([df, pd.read_csv(url, low_memory=False)])

df = df.reset_index(drop=True)
print(df.shape)
df.head()

(34660, 21)


Unnamed: 0,id,name,asins,brand,categories,keys,manufacturer,reviews.date,reviews.dateAdded,reviews.dateSeen,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username
0,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,This product so far has not disappointed. My c...,Kindle,,,Adapter
1,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,great for beginner or experienced person. Boug...,very fast,,,truman
2,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,,,DaveZ
3,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,4.0,http://reviews.bestbuy.com/3545/5620406/review...,I've had my Fire HD 8 two weeks now and I love...,Good!!!,,,Shacks
4,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-12T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,I bought this for my grand daughter when she c...,Fantastic Tablet for kids,,,explore42


In [4]:
generation_config = {
      "temperature": 0.1,
      "top_p": 1,
      "top_k": 1,
    #   "max_output_tokens": max_tokens,
    }
safety_settings = [
    {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    },
    {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    },
    {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    },
    {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
    },
]
genai.configure(api_key=gemini_token)
model = genai.GenerativeModel('gemini-pro')
model = genai.GenerativeModel(model_name="gemini-1.0-pro",
                                  generation_config=generation_config,
                                  safety_settings=safety_settings)

def generate_response(prompt):
    convo = model.start_chat(history=[])
    convo.send_message(prompt)
    return re.sub(r"\*\*([^*]+)\*\*", r"\1", convo.last.text)

In [5]:
# creating data dictionary/documentation of the table
create_data_dict = f'''Table description: {table_description}
Columns: {df.columns}
Data (First 5 columns): {df.head()}

Instruction:
1. Based on the above mentioned details create a data dictionary which a small description of table, each column and the data type of each column.
2. Don't generate anything else. Be concrete and concise in your response.'''

data_dict = generate_response(create_data_dict)
print(data_dict)

Data Dictionary

Table: Customer Reviews

Description: This table contains customer-level information about different Amazon products along with their customer reviews and ratings.

Columns:

* id: Unique identifier for each row
* name: Name of the product
* asins: ASINs of the product
* brand: Brand of the product
* categories: Categories of the product
* keys: Unique keys for the product
* manufacturer: Manufacturer of the product
* reviews.date: Date of the review
* reviews.dateAdded: Date the review was added
* reviews.dateSeen: Date the review was seen
* reviews.didPurchase: Whether the reviewer purchased the product
* reviews.doRecommend: Whether the reviewer recommends the product
* reviews.id: Unique identifier for each review
* reviews.numHelpful: Number of helpful votes for the review
* reviews.rating: Rating of the product
* reviews.sourceURLs: URLs of the sources where the reviews were found
* reviews.text: Text of the review
* reviews.title: Title of the review
* reviews.u

In [6]:
# preprocessing steps
types_of_preprocessing = f'''Using {data_dict} identify all challenges like missing values, data inconsistency (including data not following the same format), categorical data etc. 
Propose relevant methods to fix them on the columns where necessary.

Instructions:
1. Keep your response consise and concrete
2. Represent each challenge with ">>" at all costs
3. Follow the format shown in sample output at all costs

Sample output:
>> Missing Values 
1. If very less missing values then impute with mean, median or mode
2. Else drop the column(s)

>> Data Incosistency
1. If column: text, bring the data to a uniform format
2. If column: datetime, bring the data to a uniform format
.
.
etc'''

preprocessing_steps = generate_response(types_of_preprocessing)
print(preprocessing_steps)

>> Missing Values
1. Drop the column(s)

>> Data Inconsistency
1. If column: reviews.date, bring the data to a uniform format (e.g., YYYY-MM-DD)
2. If column: reviews.dateAdded, bring the data to a uniform format (e.g., YYYY-MM-DD)
3. If column: reviews.dateSeen, bring the data to a uniform format (e.g., YYYY-MM-DD)

>> Categorical Data
1. If column: categories, create dummy variables for each category


In [7]:
list_methods_prep = preprocessing_steps.split('>>')[1:]
list_methods_prep

[' Missing Values\n1. Drop the column(s)\n\n',
 ' Data Inconsistency\n1. If column: reviews.date, bring the data to a uniform format (e.g., YYYY-MM-DD)\n2. If column: reviews.dateAdded, bring the data to a uniform format (e.g., YYYY-MM-DD)\n3. If column: reviews.dateSeen, bring the data to a uniform format (e.g., YYYY-MM-DD)\n\n',
 ' Categorical Data\n1. If column: categories, create dummy variables for each category']

In [9]:
print('Shape of df before pre-processing: ', df.shape)
for method in list_methods_prep:
    prep_step = f''' Write a python code to perform the preprocessing step by following the actions mentioned with it: "{method}"
    Instructions:
    1. Keep your response consise and concrete
    2. Assume a dataframe with the name "df" already exists
    3. Dataframe df has the following columns: {df.columns}. Use the column names for your refernece while generating the code.
    4. Don't include the code to read the file. Write the code assuming the dataframe is already exists
    5. Add exception handling in the code to make sure it caters to errors and edge cases

    Sharing first 10 columns of the Dataframe for reference: "{df.head(10)}"
    '''
    prep_code = generate_response(prep_step)
    prep_code = prep_code.replace('python', '')
    prep_code = prep_code.replace('`','')
    exec(prep_code)
print('Shape of df after pre-processing: ', df.shape)

Shape of df before pre-processing:  (34660, 7)
Error converting 'reviews.date' column: 'reviews.date'
Error converting 'reviews.dateAdded' column: 'reviews.dateAdded'
Error converting 'reviews.dateSeen' column: Unknown datetime string format, unable to parse: 2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z, at position 0




Shape of df after pre-processing:  (34660, 47)


In [12]:
usecase = '''Use case: Marketing'''
prompt = f'''{usecase}\n Refer {df.columns} and tell me top 5 different analysis can be done from the available columns keeping the given use case in mind.

Instructions:
1. Keep your response consise and concrete
2. Give your suggestions in bullet points
3. Don't bold the text in your response
4. Mention which column names can help in the completion of that analysis.
5. Make the analysis rich by including as many relevant columns as possible.
6. Every new type of analysis in the response should be represented with ">>" at all costs!

Expected output:
>> Analysis 1
>> Analysis 2 etc.
'''

types_of_analysis = generate_response(prompt)
print(types_of_analysis)

>> Analysis 1: Identify popular brands and their market share
- brand
- reviews.dateSeen

>> Analysis 2: Determine the most popular categories for marketing campaigns
- categories_Amazon Device Accessories,Kindle Store,Kindle Touch (4th Generation) Accessories,Kindle E-Reader Accessories,Covers,Kindle Touch (4th Generation) Covers
- categories_Amazon Devices & Accessories,Amazon Device Accessories,Power Adapters & Cables,Kindle Store,Kindle E-Reader Accessories,Kindle Paperwhite Accessories
- categories_Back To College,College Electronics,College Tvs & Home Theater,Electronics,Tvs & Home Theater,Streaming Devices,Featured Brands,Amazon Devices,Holiday Shop,Ways To Shop,TV & Home Theater,Streaming Media Players,All Streaming Media Players,TVs Entertainment,Video Games,Kindle Store,Electronics Features,Kids & Family,Fire TV

>> Analysis 3: Analyze customer reviews to identify common pain points and areas for improvement
- reviews.dateSeen
- reviews.sourceURLs

>> Analysis 4: Track the pe

In [13]:
list_analysis = types_of_analysis.split('>>')[1:]
list_analysis

[' Analysis 1: Identify popular brands and their market share\n- brand\n- reviews.dateSeen\n\n',
 ' Analysis 2: Determine the most popular categories for marketing campaigns\n- categories_Amazon Device Accessories,Kindle Store,Kindle Touch (4th Generation) Accessories,Kindle E-Reader Accessories,Covers,Kindle Touch (4th Generation) Covers\n- categories_Amazon Devices & Accessories,Amazon Device Accessories,Power Adapters & Cables,Kindle Store,Kindle E-Reader Accessories,Kindle Paperwhite Accessories\n- categories_Back To College,College Electronics,College Tvs & Home Theater,Electronics,Tvs & Home Theater,Streaming Devices,Featured Brands,Amazon Devices,Holiday Shop,Ways To Shop,TV & Home Theater,Streaming Media Players,All Streaming Media Players,TVs Entertainment,Video Games,Kindle Store,Electronics Features,Kids & Family,Fire TV\n\n',
 ' Analysis 3: Analyze customer reviews to identify common pain points and areas for improvement\n- reviews.dateSeen\n- reviews.sourceURLs\n\n',
 ' An

In [14]:
task = list_analysis[3]
print(task)

query = f'''
Task: {task}

Instructions:
1. Write code in python to only execute the task.
2. Assume a dataframe with the name "df" already exists. 
3. Dataframe df has the following columns: {df.columns}. Use the column names for your refernece while generating the code.
4. Don't include the code to read the file. Write the code assuming the dataframe is already exists.
'''

query_response = generate_response(query)
query_response = query_response.replace('python', '')
query_response = query_response.replace('`','')
print(query_response)

 Analysis 4: Track the performance of different marketing campaigns over time
- reviews.dateSeen



# Group the data by campaign and calculate the average date seen
df_grouped = df.groupby('brand')['reviews.dateSeen'].mean().reset_index()

# Create a line plot of the average date seen over time
sns.lineplot(data=df_grouped, x='brand', y='reviews.dateSeen')
plt.xlabel('Campaign')
plt.ylabel('Average Date Seen')
plt.title('Performance of Marketing Campaigns Over Time')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()



In [15]:
exec(query_response)

TypeError: agg function failed [how->mean,dtype->object]