In [1]:
import os
from datetime import datetime
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Location of the raw web-analytics CSV export, relative to the working directory
Web_Analytics = "data/raw/web_analytics.csv"

In [3]:
# Database configuration: every setting is read from the environment and falls
# back to the literal default shown here when the variable is not set.
# NOTE(review): the "root"/"root" fallbacks are only suitable for a local
# development database — confirm they are never relied on elsewhere.
DB_USER = os.environ.get("DB_USER", "root")
DB_PWD = os.environ.get("DB_PWD", "root")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "3306")
DB_NAME = os.environ.get("DB_NAME", "customer_segmentationdb")

In [4]:
# SQLAlchemy connection string for MySQL (using PyMySQL driver).
# The user name and password are percent-encoded before being interpolated so
# that reserved characters in a credential ("@", ":", "/", "%", ...) cannot be
# mistaken for URL delimiters. safe="" makes quote() encode "/" as well.
DATABASE_URL = (
    f"mysql+pymysql://{quote(DB_USER, safe='')}:{quote(DB_PWD, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

In [5]:
# Build the SQLAlchemy engine for the URL above; the charset option is handed
# to the database driver through connect_args.
engine = create_engine(
    DATABASE_URL,
    connect_args={"charset": "utf8mb4"},
)

In [6]:
# Load the raw web-analytics export into a DataFrame
df = pd.read_csv(filepath_or_buffer=Web_Analytics)

In [7]:
# Inspect the dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Source / Medium        250 non-null    object
 1   Year                   250 non-null    int64 
 2   Month of the year      250 non-null    int64 
 3   Users                  250 non-null    object
 4   New Users              250 non-null    object
 5   Sessions               250 non-null    object
 6   Bounce Rate            250 non-null    object
 7   Pageviews              250 non-null    object
 8   Avg. Session Duration  250 non-null    object
 9   Conversion Rate (%)    250 non-null    object
 10  Transactions           250 non-null    object
 11  Revenue                250 non-null    object
 12  Quantity Sold          250 non-null    object
dtypes: int64(2), object(11)
memory usage: 25.5+ KB


In [8]:
# Show the raw column names before they are normalized
df.columns

Index(['Source / Medium', 'Year', 'Month of the year', 'Users', 'New Users',
       'Sessions', 'Bounce Rate', 'Pageviews', 'Avg. Session Duration',
       'Conversion Rate (%)', 'Transactions', 'Revenue', 'Quantity Sold'],
      dtype='object')

In [9]:
# Normalize column names in three passes: trim surrounding whitespace, turn
# spaces into underscores, then delete every "." and "%" character.
# Parentheses are not removed here, so "Conversion Rate (%)" ends up as
# "Conversion_Rate_()".
trimmed_names = df.columns.str.strip()
underscored_names = trimmed_names.str.replace(" ", "_")
df.columns = underscored_names.str.replace(r"[\.%]", "", regex=True)
df.columns

Index(['Source_/_Medium', 'Year', 'Month_of_the_year', 'Users', 'New_Users',
       'Sessions', 'Bounce_Rate', 'Pageviews', 'Avg_Session_Duration',
       'Conversion_Rate_()', 'Transactions', 'Revenue', 'Quantity_Sold'],
      dtype='object')

In [10]:
# Replace the three awkward normalized names with shorter ones
column_aliases = {
    "Source_/_Medium": "Source",
    "Month_of_the_year": "Month_num",
    "Conversion_Rate_()": "Conversion_Rate",
}
df = df.rename(columns=column_aliases)
df.columns

Index(['Source', 'Year', 'Month_num', 'Users', 'New_Users', 'Sessions',
       'Bounce_Rate', 'Pageviews', 'Avg_Session_Duration', 'Conversion_Rate',
       'Transactions', 'Revenue', 'Quantity_Sold'],
      dtype='object')

In [11]:
# Re-check the schema now that the columns have been renamed
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Source                250 non-null    object
 1   Year                  250 non-null    int64 
 2   Month_num             250 non-null    int64 
 3   Users                 250 non-null    object
 4   New_Users             250 non-null    object
 5   Sessions              250 non-null    object
 6   Bounce_Rate           250 non-null    object
 7   Pageviews             250 non-null    object
 8   Avg_Session_Duration  250 non-null    object
 9   Conversion_Rate       250 non-null    object
 10  Transactions          250 non-null    object
 11  Revenue               250 non-null    object
 12  Quantity_Sold         250 non-null    object
dtypes: int64(2), object(11)
memory usage: 25.5+ KB


In [12]:
# Count the missing values in each column
df.isna().sum()

Source                  0
Year                    0
Month_num               0
Users                   0
New_Users               0
Sessions                0
Bounce_Rate             0
Pageviews               0
Avg_Session_Duration    0
Conversion_Rate         0
Transactions            0
Revenue                 0
Quantity_Sold           0
dtype: int64

In [13]:
# Count rows that are exact duplicates (across all columns) of an earlier row
df.duplicated().sum()

np.int64(0)

In [14]:
def _strip_numeric_formatting(values):
    """Return `values` as text with ",", "%" and "<" removed and whitespace trimmed.

    Entries that are left as an empty string are replaced with None.
    """
    text = values.astype(str)
    for symbol in (",", "%", "<"):
        text = text.str.replace(symbol, "", regex=False)
    return text.str.strip().replace({"": None})


# Columns that hold numbers formatted as text in the export
numeric_cols = [
    "Users",
    "New_Users",
    "Sessions",
    "Pageviews",
    "Transactions",
    "Revenue",
    "Quantity_Sold",
    "Bounce_Rate",
    "Conversion_Rate",
]
for col in numeric_cols:
    if col not in df.columns:
        # Skip any listed column that this frame does not contain
        continue
    df[col] = _strip_numeric_formatting(df[col])

In [15]:
# Cast the two rate columns to floats. Bounce_Rate is additionally divided by
# 100, while Conversion_Rate keeps the scale it had in the source column.
# Note: re-running this cell divides Bounce_Rate by 100 again.
df["Bounce_Rate"] = df["Bounce_Rate"].astype(float).div(100)
df["Conversion_Rate"] = df["Conversion_Rate"].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Source                250 non-null    object 
 1   Year                  250 non-null    int64  
 2   Month_num             250 non-null    int64  
 3   Users                 250 non-null    object 
 4   New_Users             250 non-null    object 
 5   Sessions              250 non-null    object 
 6   Bounce_Rate           250 non-null    float64
 7   Pageviews             250 non-null    object 
 8   Avg_Session_Duration  250 non-null    object 
 9   Conversion_Rate       250 non-null    float64
 10  Transactions          250 non-null    object 
 11  Revenue               250 non-null    object 
 12  Quantity_Sold         250 non-null    object 
dtypes: float64(2), int64(2), object(9)
memory usage: 25.5+ KB


In [16]:
# Merge Year and Month_num into one "Date" column pinned to the first day of
# the month, then drop the two source columns. Combinations that cannot be
# parsed become NaT because of errors="coerce".
df = (
    df.astype({"Year": int, "Month_num": int})
    .assign(
        Date=lambda frame: pd.to_datetime(
            frame["Year"].astype(str) + "-" + frame["Month_num"].astype(str) + "-01",
            format="%Y-%m-%d",
            errors="coerce",
        )
    )
    .drop(columns=["Year", "Month_num"])
)

In [17]:
# Convert "Avg_Session_Duration" into a number of seconds stored in
# "Avg_Session_Duration_secs" (values that cannot be parsed as a timedelta are
# coerced to missing), then drop the original text column.
session_length = pd.to_timedelta(df["Avg_Session_Duration"], errors="coerce")
df = df.assign(
    Avg_Session_Duration_secs=session_length.dt.total_seconds()
).drop(columns=["Avg_Session_Duration"])

In [18]:
# Remove every row whose Source is exactly "euromessage / push".
# The frame is rebound rather than mutated with inplace=True, and the index is
# rebuilt so it remains a gap-free 0..n-1 range after the rows are removed.
df = df.drop(index=df.index[df["Source"] == "euromessage / push"]).reset_index(drop=True)

In [19]:
# Cast the cleaned count/amount columns to integers in a single astype call.
# An explicit "int64" is requested because plain `int` maps to a
# platform-dependent width (32-bit on Windows with NumPy < 2), which could
# overflow on large values.
cols = ["Users", "New_Users", "Sessions", "Pageviews", "Transactions", "Revenue", "Quantity_Sold"]
df = df.astype({col: "int64" for col in cols})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Source                     249 non-null    object        
 1   Users                      249 non-null    int64         
 2   New_Users                  249 non-null    int64         
 3   Sessions                   249 non-null    int64         
 4   Bounce_Rate                249 non-null    float64       
 5   Pageviews                  249 non-null    int64         
 6   Conversion_Rate            249 non-null    float64       
 7   Transactions               249 non-null    int64         
 8   Revenue                    249 non-null    int64         
 9   Quantity_Sold              249 non-null    int64         
 10  Date                       249 non-null    datetime64[ns]
 11  Avg_Session_Duration_secs  249 non-null    float64       
dtypes: datet

In [20]:
# Preview the first 40 rows of the cleaned frame
df.head(40)

Unnamed: 0,Source,Users,New_Users,Sessions,Bounce_Rate,Pageviews,Conversion_Rate,Transactions,Revenue,Quantity_Sold,Date,Avg_Session_Duration_secs
0,A,126870,104020,194667,0.7159,455159,0.2,394,83244,482,2019-11-01,71.0
1,A,120625,98574,194114,0.6456,559509,0.69,1347,203552,2402,2020-05-01,92.0
2,A,123361,104308,181175,0.4191,368907,0.26,477,94282,599,2019-10-01,65.0
3,A,106551,88428,170329,0.7592,368803,0.18,311,54971,415,2019-09-01,61.0
4,A,102123,82461,163446,0.671,425410,0.7,1138,167113,1987,2020-06-01,80.0
5,A,91043,70326,142637,0.6706,370798,0.34,486,103001,607,2019-12-01,80.0
6,A,83031,64103,133736,0.6946,373356,0.45,601,128552,777,2020-01-01,83.0
7,A,84343,73239,125423,0.7116,292263,0.58,730,98161,1360,2020-07-01,72.0
8,A,82626,68145,125318,0.7006,328822,0.52,657,126137,981,2020-02-01,84.0
9,A,73844,61557,110546,0.7219,266187,0.48,531,96195,843,2020-03-01,75.0


In [21]:
# Summary statistics for the cleaned columns
df.describe()

Unnamed: 0,Users,New_Users,Sessions,Bounce_Rate,Pageviews,Conversion_Rate,Transactions,Revenue,Quantity_Sold,Date,Avg_Session_Duration_secs
count,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249,249.0
mean,11340.522088,8778.967871,16155.417671,0.492649,44287.710843,3.150602,85.393574,14832.104418,150.706827,2020-02-07 22:15:54.216867584,152.534137
min,41.0,1.0,127.0,0.0,237.0,0.0,0.0,0.0,0.0,2019-09-01 00:00:00,2.0
25%,318.0,145.0,399.0,0.3477,1388.0,0.14,2.0,224.0,2.0,2019-11-01 00:00:00,91.0
50%,865.0,493.0,1146.0,0.5254,4457.0,0.42,8.0,1416.0,14.0,2020-02-01 00:00:00,133.0
75%,7869.0,3721.0,10252.0,0.6185,31980.0,0.78,84.0,14964.0,129.0,2020-05-01 00:00:00,205.0
max,126870.0,104308.0,194667.0,0.9863,559509.0,42.08,1347.0,203552.0,2402.0,2020-08-01 00:00:00,409.0
std,24555.407501,20618.100558,35672.225385,0.202619,94040.501564,8.138685,178.167978,29800.642723,328.732252,,78.123402


In [22]:
# Save the cleaned and processed data to a CSV file.
# The target directory is created first (no error if it already exists) so the
# write does not fail when data/processed is missing.
output_dir = "data/processed"
os.makedirs(output_dir, exist_ok=True)
output_csv = os.path.join(output_dir, "processed_web_analytics.csv")
df.to_csv(output_csv, index=False)
print(f"Saved processed web_analytics to:{output_csv}")

Saved processed web_analytics to:data/processed\processed_web_analytics.csv


In [23]:
# Write the processed frame to the MySQL table `web_analytics`.
# if_exists="replace" drops and recreates the table when it already exists;
# rows are sent as multi-row INSERT statements in chunks of up to 5000.
df.to_sql(
    name="web_analytics",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=5000,
    method="multi",
)
print("Loaded data into MySQL table `web_analytics`\n")

Loaded data into MySQL table `web_analytics`

