In [6]:
import pandas as pd
import psycopg2
from sql import engine

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [7]:
# load the keyword list from the csv-file; the first two lines of the file
# are skipped so that the third line is used as the header row
keyword_csv = 'keyword_data/google_ads_keyword_list.csv'
df = pd.read_csv(keyword_csv, skiprows=2)
df.head()

Unnamed: 0,Keyword,Currency,Segmentation,Avg. monthly searches,Three month change,YoY change,Competition,Competition (indexed value),Top of page bid (low range),Top of page bid (high range),Ad impression share,Organic average position,Organic impression share,In Account,Searches: Mar 2021,Searches: Apr 2021,Searches: May 2021,Searches: Jun 2021,Searches: Jul 2021,Searches: Aug 2021,Searches: Sep 2021,Searches: Oct 2021,Searches: Nov 2021,Searches: Dec 2021,Searches: Jan 2022,Searches: Feb 2022
0,,,Alle,3735310.0,,,,,,,,,,,,,,,,,,,,,,
1,,,Deutschland,3735310.0,,,,,,,,,,,,,,,,,,,,,,
2,1.27 mm pitch ribbon cable,EUR,,10.0,0%,-100%,,,,,,,,,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0
3,10 conductor ribbon cable,EUR,,0.0,0%,0%,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10 pin flat cable,EUR,,10.0,0%,0%,,,,,,,,,0.0,10.0,10.0,10.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0


In [8]:
# normalise the headers: lower-case them and use underscores instead of spaces
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   keyword                       992 non-null    object 
 1   currency                      992 non-null    object 
 2   segmentation                  2 non-null      object 
 3   avg._monthly_searches         994 non-null    float64
 4   three_month_change            992 non-null    object 
 5   yoy_change                    992 non-null    object 
 6   competition                   910 non-null    object 
 7   competition_(indexed_value)   910 non-null    float64
 8   top_of_page_bid_(low_range)   758 non-null    object 
 9   top_of_page_bid_(high_range)  758 non-null    object 
 10  ad_impression_share           0 non-null      float64
 11  organic_average_position      0 non-null      float64
 12  organic_impression_share      0 non-null      float64
 13  in_ac

In [14]:
# keep only the rows that have a value in the keyword column,
# then renumber the index starting from 0
has_keyword = df['keyword'].notna()
df = df[has_keyword].reset_index(drop=True)
df.head()

Unnamed: 0,keyword,currency,segmentation,avg._monthly_searches,three_month_change,yoy_change,competition,competition_(indexed_value),top_of_page_bid_(low_range),top_of_page_bid_(high_range),ad_impression_share,organic_average_position,organic_impression_share,in_account,searches:_mar_2021,searches:_apr_2021,searches:_may_2021,searches:_jun_2021,searches:_jul_2021,searches:_aug_2021,searches:_sep_2021,searches:_oct_2021,searches:_nov_2021,searches:_dec_2021,searches:_jan_2022,searches:_feb_2022
0,1.27 mm pitch ribbon cable,EUR,,10.0,0%,-100%,,,,,,,,,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0
1,10 conductor ribbon cable,EUR,,0.0,0%,0%,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10 pin flat cable,EUR,,10.0,0%,0%,,,,,,,,,0.0,10.0,10.0,10.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0
3,10 pin flat ribbon cable,EUR,,10.0,-100%,-100%,,,,,,,,,0.0,0.0,10.0,10.0,0.0,10.0,10.0,10.0,0.0,10.0,0.0,0.0
4,12v dc dc converter,EUR,,140.0,22%,0%,Hoch,100.0,42.0,104.0,,,,,140.0,140.0,140.0,140.0,140.0,140.0,90.0,140.0,140.0,90.0,140.0,110.0


In [24]:
# keep only columns we need
needed_columns = [
    'keyword',
    'avg._monthly_searches',
    'competition',
    'competition_(indexed_value)',
    'top_of_page_bid_(low_range)',
    'top_of_page_bid_(high_range)',
]
# .copy() makes the selection an independent frame, so assigning to its
# columns afterwards cannot trigger pandas' SettingWithCopyWarning
df = df[needed_columns].copy()
df.head()

Unnamed: 0,keyword,avg._monthly_searches,competition,competition_(indexed_value),top_of_page_bid_(low_range),top_of_page_bid_(high_range)
0,1.27 mm pitch ribbon cable,10.0,,,,
1,10 conductor ribbon cable,0.0,,,,
2,10 pin flat cable,10.0,,,,
3,10 pin flat ribbon cable,10.0,,,,
4,12v dc dc converter,140.0,Hoch,100.0,42.0,104.0


In [27]:
# give the remaining columns short names; the list is applied by position,
# so it must contain exactly one name per existing column, in column order
short_column_names = [
    'keyword',
    'avg_monthly_searches',
    'competition',
    'competition_indexed',
    'bid_low_range',
    'bid_high_range',
]
df.columns = short_column_names
df.head()

Unnamed: 0,keyword,avg_monthly_searches,competition,competition_indexed,bid_low_range,bid_high_range
0,1.27 mm pitch ribbon cable,10.0,,,,
1,10 conductor ribbon cable,0.0,,,,
2,10 pin flat cable,10.0,,,,
3,10 pin flat ribbon cable,10.0,,,,
4,12v dc dc converter,140.0,Hoch,100.0,42.0,104.0


In [48]:
# convert the two bid columns from object to float:
# first swap the decimal comma for a decimal point, then cast
bid_columns = ['bid_low_range', 'bid_high_range']
df[bid_columns] = df[bid_columns].replace(',', '.', regex=True)
df = df.astype({column: 'float' for column in bid_columns})
df.info()

In [52]:
# preview the first rows of the frame to check the converted bid columns
df.head()

Unnamed: 0,keyword,avg_monthly_searches,competition,competition_indexed,bid_low_range,bid_high_range
0,1.27 mm pitch ribbon cable,10.0,,,,
1,10 conductor ribbon cable,0.0,,,,
2,10 pin flat cable,10.0,,,,
3,10 pin flat ribbon cable,10.0,,,,
4,12v dc dc converter,140.0,Hoch,100.0,0.42,1.04


In [53]:
# write data to sql
table_name = 'keyword_data'
if engine is not None:
    try:
        df.to_sql(
            name=table_name,           # Name of SQL table
            con=engine,                # Engine or connection
            if_exists='replace',       # Drop the table before inserting new values
            schema='capstone_group2',  # Schema the table is written to
            index=False,               # Do NOT write the DataFrame index as a column
            chunksize=5000,            # Number of rows in each batch written at a time
            method='multi',            # Pass multiple values in a single INSERT clause
        )
        print(f"The {table_name} table was imported successfully.")
    # Error handling: best-effort, report the problem instead of raising.
    # `Exception` already covers psycopg2.DatabaseError, so one clause suffices.
    except Exception as error:
        print(error)
        # mark the engine as unusable for any later `engine is not None` check
        engine = None
else:
    # make the skipped write visible instead of silently doing nothing
    print(f"No database engine available; the {table_name} table was not written.")

The keyword_data table was imported successfully.
