In [6]:
import pandas as pd
import psycopg2
from sql import engine

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

## Google keywords

In [7]:
# read test-data from csv-file
df = pd.read_csv('keyword_data/google_ads_keyword_list.csv', skiprows=2)
df.head()

Unnamed: 0,Keyword,Currency,Segmentation,Avg. monthly searches,Three month change,YoY change,Competition,Competition (indexed value),Top of page bid (low range),Top of page bid (high range),Ad impression share,Organic average position,Organic impression share,In Account,Searches: Mar 2021,Searches: Apr 2021,Searches: May 2021,Searches: Jun 2021,Searches: Jul 2021,Searches: Aug 2021,Searches: Sep 2021,Searches: Oct 2021,Searches: Nov 2021,Searches: Dec 2021,Searches: Jan 2022,Searches: Feb 2022
0,,,Alle,3735310.0,,,,,,,,,,,,,,,,,,,,,,
1,,,Deutschland,3735310.0,,,,,,,,,,,,,,,,,,,,,,
2,1.27 mm pitch ribbon cable,EUR,,10.0,0%,-100%,,,,,,,,,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0
3,10 conductor ribbon cable,EUR,,0.0,0%,0%,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10 pin flat cable,EUR,,10.0,0%,0%,,,,,,,,,0.0,10.0,10.0,10.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0


In [8]:
# rename columns
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   keyword                       992 non-null    object 
 1   currency                      992 non-null    object 
 2   segmentation                  2 non-null      object 
 3   avg._monthly_searches         994 non-null    float64
 4   three_month_change            992 non-null    object 
 5   yoy_change                    992 non-null    object 
 6   competition                   910 non-null    object 
 7   competition_(indexed_value)   910 non-null    float64
 8   top_of_page_bid_(low_range)   758 non-null    object 
 9   top_of_page_bid_(high_range)  758 non-null    object 
 10  ad_impression_share           0 non-null      float64
 11  organic_average_position      0 non-null      float64
 12  organic_impression_share      0 non-null      float64
 13  in_ac

In [14]:
# drop rows with NaN value at keyword column
df = df.dropna(subset=['keyword']).reset_index(drop=True)
df.head()

Unnamed: 0,keyword,currency,segmentation,avg._monthly_searches,three_month_change,yoy_change,competition,competition_(indexed_value),top_of_page_bid_(low_range),top_of_page_bid_(high_range),ad_impression_share,organic_average_position,organic_impression_share,in_account,searches:_mar_2021,searches:_apr_2021,searches:_may_2021,searches:_jun_2021,searches:_jul_2021,searches:_aug_2021,searches:_sep_2021,searches:_oct_2021,searches:_nov_2021,searches:_dec_2021,searches:_jan_2022,searches:_feb_2022
0,1.27 mm pitch ribbon cable,EUR,,10.0,0%,-100%,,,,,,,,,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0
1,10 conductor ribbon cable,EUR,,0.0,0%,0%,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10 pin flat cable,EUR,,10.0,0%,0%,,,,,,,,,0.0,10.0,10.0,10.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0
3,10 pin flat ribbon cable,EUR,,10.0,-100%,-100%,,,,,,,,,0.0,0.0,10.0,10.0,0.0,10.0,10.0,10.0,0.0,10.0,0.0,0.0
4,12v dc dc converter,EUR,,140.0,22%,0%,Hoch,100.0,42.0,104.0,,,,,140.0,140.0,140.0,140.0,140.0,140.0,90.0,140.0,140.0,90.0,140.0,110.0


In [24]:
# keep only columns we need
df = df[['keyword', 'avg._monthly_searches', 'competition', 'competition_(indexed_value)', 'top_of_page_bid_(low_range)', 'top_of_page_bid_(high_range)']]
df.head()

Unnamed: 0,keyword,avg._monthly_searches,competition,competition_(indexed_value),top_of_page_bid_(low_range),top_of_page_bid_(high_range)
0,1.27 mm pitch ribbon cable,10.0,,,,
1,10 conductor ribbon cable,0.0,,,,
2,10 pin flat cable,10.0,,,,
3,10 pin flat ribbon cable,10.0,,,,
4,12v dc dc converter,140.0,Hoch,100.0,42.0,104.0


In [27]:
# rename columns
df.columns = ['keyword', 'avg_monthly_searches', 'competition', 'competition_indexed', 'bid_low_range', 'bid_high_range']
df.head()

Unnamed: 0,keyword,avg_monthly_searches,competition,competition_indexed,bid_low_range,bid_high_range
0,1.27 mm pitch ribbon cable,10.0,,,,
1,10 conductor ribbon cable,0.0,,,,
2,10 pin flat cable,10.0,,,,
3,10 pin flat ribbon cable,10.0,,,,
4,12v dc dc converter,140.0,Hoch,100.0,42.0,104.0


In [48]:
# change object to float for two columns
df[['bid_low_range', 'bid_high_range']] = df[['bid_low_range', 'bid_high_range']].replace(',','.', regex=True)
df = df.astype({'bid_low_range':'float','bid_high_range':'float'})
df.info()

In [77]:
# add platform name
df['platform'] = 'google'

In [87]:
# calculate average of bid low range and high range per row
df['suggested_bid'] = (df['bid_low_range'] + df['bid_high_range']) / 2
df['suggested_bid'] = df['suggested_bid'].round(decimals=2)

In [88]:
df.head(10)

Unnamed: 0,keyword,avg_monthly_searches,competition,competition_indexed,bid_low_range,bid_high_range,platform,suggested_bid
0,1.27 mm pitch ribbon cable,10.0,,,,,google,
1,10 conductor ribbon cable,0.0,,,,,google,
2,10 pin flat cable,10.0,,,,,google,
3,10 pin flat ribbon cable,10.0,,,,,google,
4,12v dc dc converter,140.0,Hoch,100.0,0.42,1.04,google,0.73
5,16 pin flat cable,10.0,,,,,google,
6,16 pin flat ribbon cable,10.0,Hoch,100.0,,,google,
7,16 pin flat ribbon cable connector,10.0,Hoch,100.0,,,google,
8,16 pin ribbon connector,10.0,Hoch,100.0,,,google,
9,1hs01g,10.0,,,,,google,


## Bing keywords

In [61]:
# read bing data from csv
df1 = pd.read_csv('keyword_data/bing_keyword_list.csv', na_values=['-'])
df1.head()

Unnamed: 0,Ad group,Keyword,Average monthly searches,Competition,Suggested Bid,Ad impr. share
0,Ribbon Cable,1.27 mm pitch ribbon cable,,,,
1,Ribbon Cable,10 conductor ribbon cable,,,,
2,Ribbon Cable,1mm pitch ribbon cable,,,,
3,Ribbon Cable,4 conductor ribbon cable,,,,
4,Ribbon Cable,4 wire ribbon cable,,,,


In [62]:
# rename columns
df1.columns = df1.columns.str.lower()
df1.columns = df1.columns.str.replace(' ', '_')
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ad_group                  883 non-null    object 
 1   keyword                   883 non-null    object 
 2   average_monthly_searches  683 non-null    object 
 3   competition               414 non-null    float64
 4   suggested_bid             272 non-null    float64
 5   ad_impr._share            0 non-null      float64
dtypes: float64(3), object(3)
memory usage: 41.5+ KB


In [67]:
# change object to float for two columns
df1['average_monthly_searches'] = df1['average_monthly_searches'].replace(',','.', regex=True)
df1 = df1.astype({'average_monthly_searches':'float'})
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ad_group                  883 non-null    object 
 1   keyword                   883 non-null    object 
 2   average_monthly_searches  683 non-null    float64
 3   competition               414 non-null    float64
 4   suggested_bid             272 non-null    float64
 5   ad_impr._share            0 non-null      float64
dtypes: float64(4), object(2)
memory usage: 41.5+ KB


In [72]:
# drop columns we don't need
df1.drop('ad_impr._share', axis=1, inplace=True)

In [73]:
# rename columns
df1.rename(columns={'competition': 'competition_indexed', 'average_monthly_searches': 'avg_monthly_searches'}, inplace=True)

In [75]:
# add platform name
df1['platform'] = 'bing'

In [76]:
df1.head(10)

Unnamed: 0,ad_group,keyword,avg_monthly_searches,competition_indexed,suggested_bid,platform
0,Ribbon Cable,1.27 mm pitch ribbon cable,,,,bing
1,Ribbon Cable,10 conductor ribbon cable,,,,bing
2,Ribbon Cable,1mm pitch ribbon cable,,,,bing
3,Ribbon Cable,4 conductor ribbon cable,,,,bing
4,Ribbon Cable,4 wire ribbon cable,,,,bing
5,Ribbon Cable,cable ribbon,,,,bing
6,Ribbon Cable,idc ribbon cable,10.0,0.51,,bing
7,Ribbon Cable,ribbon cable 2.54 mm pitch,,,,bing
8,Ribbon Cable,ribbon cable adapter,,,,bing
9,Ribbon Cable,ribbon cable pitch,10.0,,,bing


## concat google and bing dataframes and write to sql

In [89]:
df_all = pd.concat([df, df1])
df_all.head()

Unnamed: 0,keyword,avg_monthly_searches,competition,competition_indexed,bid_low_range,bid_high_range,platform,suggested_bid,ad_group
0,1.27 mm pitch ribbon cable,10.0,,,,,google,,
1,10 conductor ribbon cable,0.0,,,,,google,,
2,10 pin flat cable,10.0,,,,,google,,
3,10 pin flat ribbon cable,10.0,,,,,google,,
4,12v dc dc converter,140.0,Hoch,100.0,0.42,1.04,google,0.73,


In [90]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1875 entries, 0 to 882
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   keyword               1875 non-null   object 
 1   avg_monthly_searches  1675 non-null   float64
 2   competition           910 non-null    object 
 3   competition_indexed   1324 non-null   float64
 4   bid_low_range         758 non-null    float64
 5   bid_high_range        758 non-null    float64
 6   platform              1875 non-null   object 
 7   suggested_bid         1030 non-null   float64
 8   ad_group              883 non-null    object 
dtypes: float64(5), object(4)
memory usage: 146.5+ KB


In [91]:
# write data to sql
table_name = 'keyword_data'
if engine!=None:
    try:
        df_all.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema='capstone_group2', # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The keyword_data table was imported successfully.
