In [68]:
import pandas as pd
import pymongo
from datetime import datetime

# client = pymongo.MongoClient("mongodb://172.31.99.238:27017")
client = pymongo.MongoClient("mongodb://127.0.0.1:27017") # for without docker
# clinet = pymongo.MongoClient("mongodb://host.docker.internal:27017") # for docker on windows
# client = pymongo.MongoClient("mongodb://172.17.0.1:27017") # for docker on linux (ubuntu)


In [69]:
# Connect to the database
db = client["ad_response_analysis_tf"]

# Retrieve the list of collections (tables)
collections = db.list_collection_names()

# Print the list of collections
print("Collections in the database:")
for collection in collections:
    print(collection)


Collections in the database:
survey_respondents
ad_metrics
advertisement_info
demographic_data
purchase_info
responses_to_ads
ad_demographic_link


In [70]:
# Collections to retrieve
collections_to_retrieve = ["advertisement_info", "ad_metrics"]
data_collection= []
# Retrieve data from specified collections and convert to DataFrame
for collection_name in collections_to_retrieve:
    collection = db[collection_name]
    data = list(collection.find())
    data_collection.append(data)
    df = pd.DataFrame(data)
    print(f"Data from collection '{collection_name}':")
    print(df.head(1))


Data from collection 'advertisement_info':
                        _id  AdID  AdCost  AdDuration AdPlatformName  \
0  67345fc899a5b4008cab4ef3     1     725         445       Facebook   

  AdPlatformType AdTopic AdType  PurchaseAmount  
0   Social Media  Sports  Video             680  
Data from collection 'ad_metrics':
                        _id  AdID  Click_Through_Rate  Conversion_Rate  \
0  67345fc899a5b4008cab5039     1            0.468085         0.031915   

  Mode_Engagement_Time  
0                  180  


In [71]:
advertisement_info = pd.DataFrame(data_collection[0])
ad_metrics = pd.DataFrame(data_collection[1])

In [72]:
print(advertisement_info.head())
print(ad_metrics.head())

                        _id  AdID  AdCost  AdDuration AdPlatformName  \
0  67345fc899a5b4008cab4ef3     1     725         445       Facebook   
1  67345fc899a5b4008cab4ef4     2     466         544        Netflix   
2  67345fc899a5b4008cab4ef5     3     143           0          Yahoo   
3  67345fc899a5b4008cab4ef6     4     265           0        Threads   
4  67345fc899a5b4008cab4ef7     5     113         573        Twitter   

  AdPlatformType     AdTopic  AdType  PurchaseAmount  
0   Social Media      Sports   Video             680  
1      Streaming  Healthcare   Video            2403  
2  Search Engine  Technology  Banner             510  
3   Social Media     Fashion    Text            2626  
4   Social Media   Political   Video            2428  
                        _id  AdID  Click_Through_Rate  Conversion_Rate  \
0  67345fc899a5b4008cab5039     1            0.468085         0.031915   
1  67345fc899a5b4008cab503b     2            0.489583         0.052083   
2  67345fc899a5

In [73]:
advertisement_info = advertisement_info.drop(columns=['_id', 'AdDuration'])
ad_metrics = ad_metrics.drop(columns=['_id', 'Mode_Engagement_Time'])



In [74]:
# Print the length of the DataFrames
print("Length of advertisement_info DataFrame:", len(advertisement_info))
print("Length of ad_metrics DataFrame:", len(ad_metrics))

# Check for null values
print("Null values in advertisement_info DataFrame:")
print(advertisement_info.isnull().sum())

print("Null values in ad_metrics DataFrame:")
print(ad_metrics.isnull().sum())

# Print the data types of each column
print("Data types in advertisement_info DataFrame:")
print(advertisement_info.dtypes)

print("Data types in ad_metrics DataFrame:")
print(ad_metrics.dtypes)

Length of advertisement_info DataFrame: 100
Length of ad_metrics DataFrame: 110
Null values in advertisement_info DataFrame:
AdID              0
AdCost            0
AdPlatformName    0
AdPlatformType    0
AdTopic           0
AdType            0
PurchaseAmount    0
dtype: int64
Null values in ad_metrics DataFrame:
AdID                  0
Click_Through_Rate    0
Conversion_Rate       0
dtype: int64
Data types in advertisement_info DataFrame:
AdID               int64
AdCost             int64
AdPlatformName    object
AdPlatformType    object
AdTopic           object
AdType            object
PurchaseAmount     int64
dtype: object
Data types in ad_metrics DataFrame:
AdID                    int64
Click_Through_Rate    float64
Conversion_Rate       float64
dtype: object


In [75]:
# Convert object columns to string
advertisement_info['AdPlatformName'] = advertisement_info['AdPlatformName'].astype(str)
advertisement_info['AdPlatformType'] = advertisement_info['AdPlatformType'].astype(str)
advertisement_info['AdTopic'] = advertisement_info['AdTopic'].astype(str)
advertisement_info['AdType'] = advertisement_info['AdType'].astype(str)


In [76]:
# Print the data types of each column
print("Data types in advertisement_info DataFrame:")
print(advertisement_info.dtypes)

print("Data types in ad_metrics DataFrame:")
print(ad_metrics.dtypes)

Data types in advertisement_info DataFrame:
AdID               int64
AdCost             int64
AdPlatformName    object
AdPlatformType    object
AdTopic           object
AdType            object
PurchaseAmount     int64
dtype: object
Data types in ad_metrics DataFrame:
AdID                    int64
Click_Through_Rate    float64
Conversion_Rate       float64
dtype: object


In [77]:
print(advertisement_info.head())

   AdID  AdCost AdPlatformName AdPlatformType     AdTopic  AdType  \
0     1     725       Facebook   Social Media      Sports   Video   
1     2     466        Netflix      Streaming  Healthcare   Video   
2     3     143          Yahoo  Search Engine  Technology  Banner   
3     4     265        Threads   Social Media     Fashion    Text   
4     5     113        Twitter   Social Media   Political   Video   

   PurchaseAmount  
0             680  
1            2403  
2             510  
3            2626  
4            2428  


In [78]:
# Merge the two DataFrames on AdID
merged_df = pd.merge(advertisement_info, ad_metrics, on='AdID')
print(merged_df.head())

   AdID  AdCost AdPlatformName AdPlatformType     AdTopic  AdType  \
0     1     725       Facebook   Social Media      Sports   Video   
1     2     466        Netflix      Streaming  Healthcare   Video   
2     3     143          Yahoo  Search Engine  Technology  Banner   
3     4     265        Threads   Social Media     Fashion    Text   
4     5     113        Twitter   Social Media   Political   Video   

   PurchaseAmount  Click_Through_Rate  Conversion_Rate  
0             680            0.468085         0.031915  
1            2403            0.489583         0.052083  
2             510            0.494845         0.082474  
3            2626            0.584906         0.113208  
4            2428            0.478992         0.058824  


In [79]:
print("Data types in merged DataFrame:")
print(merged_df.dtypes)

Data types in merged DataFrame:
AdID                    int64
AdCost                  int64
AdPlatformName         object
AdPlatformType         object
AdTopic                object
AdType                 object
PurchaseAmount          int64
Click_Through_Rate    float64
Conversion_Rate       float64
dtype: object


In [80]:
# Store Click_Through_Rate and Conversion_Rate into separate variables
click_through_rate = merged_df['Click_Through_Rate']
conversion_rate = merged_df['Conversion_Rate']

# Drop AdID, Click_Through_Rate, and Conversion_Rate columns from merged_df
merged_df = merged_df.drop(columns=['AdID', 'Click_Through_Rate', 'Conversion_Rate'])


In [81]:
from sklearn.preprocessing import OneHotEncoder
import joblib

# Columns to be one-hot encoded
categorical_columns = ['AdPlatformName', 'AdPlatformType', 'AdTopic', 'AdType']

# Create an instance of OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the categorical columns
encoded_array = encoder.fit_transform(merged_df[categorical_columns])

# Save the fitted OneHotEncoder
joblib.dump(encoder, '/mnt/c/de/project/Advertisement-Response-Analysis/Backend/api/models/onehot_encoder.joblib')


['/mnt/c/de/project/Advertisement-Response-Analysis/Backend/api/models/onehot_encoder.joblib']

In [82]:
# Create a DataFrame with the encoded columns
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns and concatenate the encoded columns
merged_df = merged_df.drop(columns=categorical_columns)
encoded_df = pd.concat([merged_df, encoded_df], axis=1)

In [83]:
# Print all column names of the encoded DataFrame
print("Column names of the encoded DataFrame:")
print(encoded_df.columns)


Column names of the encoded DataFrame:
Index(['AdCost', 'PurchaseAmount', 'AdPlatformName_Amazon Prime',
       'AdPlatformName_Bing', 'AdPlatformName_Facebook',
       'AdPlatformName_Google', 'AdPlatformName_Hotstar',
       'AdPlatformName_Instagram', 'AdPlatformName_JioTV',
       'AdPlatformName_Kids Channel', 'AdPlatformName_LinkedIn',
       'AdPlatformName_Movie Channel', 'AdPlatformName_Music Channel',
       'AdPlatformName_Netflix', 'AdPlatformName_News Channel',
       'AdPlatformName_Snapchat', 'AdPlatformName_Sports Channel',
       'AdPlatformName_Threads', 'AdPlatformName_Twitter',
       'AdPlatformName_Yahoo', 'AdPlatformName_YouTube', 'AdPlatformName_Zee5',
       'AdPlatformType_Search Engine', 'AdPlatformType_Social Media',
       'AdPlatformType_Streaming', 'AdPlatformType_TV', 'AdTopic_Automobile',
       'AdTopic_Education', 'AdTopic_Entertainment', 'AdTopic_Fashion',
       'AdTopic_Finance', 'AdTopic_Food', 'AdTopic_Healthcare',
       'AdTopic_Political', 'Ad

In [84]:

# List of columns to transform
columns_to_transform = [
    'AdPlatformName_Amazon Prime', 'AdPlatformName_Bing', 'AdPlatformName_Facebook', 
    'AdPlatformName_Google', 'AdPlatformName_Hotstar', 'AdPlatformName_Instagram', 
    'AdPlatformName_JioTV', 'AdPlatformName_Kids Channel', 'AdPlatformName_LinkedIn', 
    'AdPlatformName_Movie Channel', 'AdPlatformName_Music Channel', 'AdPlatformName_Netflix', 
    'AdPlatformName_News Channel', 'AdPlatformName_Snapchat', 'AdPlatformName_Sports Channel', 
    'AdPlatformName_Threads', 'AdPlatformName_Twitter', 'AdPlatformName_Yahoo', 
    'AdPlatformName_YouTube', 'AdPlatformName_Zee5', 'AdPlatformType_Search Engine', 
    'AdPlatformType_Social Media', 'AdPlatformType_Streaming', 'AdPlatformType_TV', 
    'AdTopic_Automobile', 'AdTopic_Education', 'AdTopic_Entertainment', 'AdTopic_Fashion', 
    'AdTopic_Finance', 'AdTopic_Food', 'AdTopic_Healthcare', 'AdTopic_Political', 
    'AdTopic_Real Estate', 'AdTopic_Sports', 'AdTopic_Technology', 'AdTopic_Travel', 
    'AdType_Banner', 'AdType_Text', 'AdType_Video'
]

# Convert True to 1 and False to 0 for the specified columns
for column in columns_to_transform:
    encoded_df[column] = encoded_df[column].astype(int)

In [85]:
print("Data types in merged DataFrame:")
print(encoded_df.dtypes)

Data types in merged DataFrame:
AdCost                           int64
PurchaseAmount                   int64
AdPlatformName_Amazon Prime      int64
AdPlatformName_Bing              int64
AdPlatformName_Facebook          int64
AdPlatformName_Google            int64
AdPlatformName_Hotstar           int64
AdPlatformName_Instagram         int64
AdPlatformName_JioTV             int64
AdPlatformName_Kids Channel      int64
AdPlatformName_LinkedIn          int64
AdPlatformName_Movie Channel     int64
AdPlatformName_Music Channel     int64
AdPlatformName_Netflix           int64
AdPlatformName_News Channel      int64
AdPlatformName_Snapchat          int64
AdPlatformName_Sports Channel    int64
AdPlatformName_Threads           int64
AdPlatformName_Twitter           int64
AdPlatformName_Yahoo             int64
AdPlatformName_YouTube           int64
AdPlatformName_Zee5              int64
AdPlatformType_Search Engine     int64
AdPlatformType_Social Media      int64
AdPlatformType_Streaming        

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define the features and target variables
X = encoded_df.copy()
y_click_through_rate = click_through_rate
y_conversion_rate = conversion_rate

# Split the data into training and testing sets for click_through_rate
X_train_ctr, X_test_ctr, y_train_ctr, y_test_ctr = train_test_split(X, y_click_through_rate, test_size=0.2, random_state=42)

# Split the data into training and testing sets for conversion_rate
X_train_cr, X_test_cr, y_train_cr, y_test_cr = train_test_split(X, y_conversion_rate, test_size=0.2, random_state=42)

# Train the linear regression model for click_through_rate
model_ctr = LinearRegression()
model_ctr.fit(X_train_ctr, y_train_ctr)

# Train the linear regression model for conversion_rate
model_cr = LinearRegression()
model_cr.fit(X_train_cr, y_train_cr)

# Predict and evaluate the model for click_through_rate
y_pred_ctr = model_ctr.predict(X_test_ctr)
mse_ctr = mean_squared_error(y_test_ctr, y_pred_ctr)
r2_ctr = r2_score(y_test_ctr, y_pred_ctr)

print(f'Click Through Rate Model - Mean Squared Error: {mse_ctr}, R^2 Score: {r2_ctr}')

# Predict and evaluate the model for conversion_rate
y_pred_cr = model_cr.predict(X_test_cr)
mse_cr = mean_squared_error(y_test_cr, y_pred_cr)
r2_cr = r2_score(y_test_cr, y_pred_cr)

print(f'Conversion Rate Model - Mean Squared Error: {mse_cr}, R^2 Score: {r2_cr}')

Click Through Rate Model - Mean Squared Error: 0.005307639814641411, R^2 Score: -1.0341188360900682
Conversion Rate Model - Mean Squared Error: 0.002667069726033916, R^2 Score: -1.5116533301379267


In [87]:
from sklearn.linear_model import Lasso, Ridge

# Train the Lasso regression model for click_through_rate
lasso_ctr = Lasso()
lasso_ctr.fit(X_train_ctr, y_train_ctr)

# Train the Lasso regression model for conversion_rate
lasso_cr = Lasso()
lasso_cr.fit(X_train_cr, y_train_cr)

# Train the Ridge regression model for click_through_rate
ridge_ctr = Ridge()
ridge_ctr.fit(X_train_ctr, y_train_ctr)

# Train the Ridge regression model for conversion_rate
ridge_cr = Ridge()
ridge_cr.fit(X_train_cr, y_train_cr)

# Predict and evaluate the Lasso model for click_through_rate
y_pred_lasso_ctr = lasso_ctr.predict(X_test_ctr)
lasso_ctr_score = lasso_ctr.score(X_test_ctr, y_test_ctr) * 100
print(f'Lasso Click Through Rate Model Accuracy: {lasso_ctr_score:.2f}%')

# Predict and evaluate the Lasso model for conversion_rate
y_pred_lasso_cr = lasso_cr.predict(X_test_cr)
lasso_cr_score = lasso_cr.score(X_test_cr, y_test_cr) * 100
print(f'Lasso Conversion Rate Model Accuracy: {lasso_cr_score:.2f}%')

# Predict and evaluate the Ridge model for click_through_rate
y_pred_ridge_ctr = ridge_ctr.predict(X_test_ctr)
ridge_ctr_score = ridge_ctr.score(X_test_ctr, y_test_ctr) * 100
print(f'Ridge Click Through Rate Model Accuracy: {ridge_ctr_score:.2f}%')

# Predict and evaluate the Ridge model for conversion_rate
y_pred_ridge_cr = ridge_cr.predict(X_test_cr)
ridge_cr_score = ridge_cr.score(X_test_cr, y_test_cr) * 100
print(f'Ridge Conversion Rate Model Accuracy: {ridge_cr_score:.2f}%')

Lasso Click Through Rate Model Accuracy: 0.68%
Lasso Conversion Rate Model Accuracy: -17.60%
Ridge Click Through Rate Model Accuracy: -68.03%
Ridge Conversion Rate Model Accuracy: -98.93%


In [88]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Define the degree of the polynomial
degree = 5

# Create a pipeline for polynomial regression for click_through_rate
poly_model_ctr = make_pipeline(PolynomialFeatures(degree), LinearRegression())
poly_model_ctr.fit(X_train_ctr, y_train_ctr)

# Create a pipeline for polynomial regression for conversion_rate
poly_model_cr = make_pipeline(PolynomialFeatures(degree), LinearRegression())
poly_model_cr.fit(X_train_cr, y_train_cr)

# Predict and evaluate the polynomial model for click_through_rate
y_pred_poly_ctr = poly_model_ctr.predict(X_test_ctr)
poly_ctr_score = poly_model_ctr.score(X_test_ctr, y_test_ctr) * 100
print(f'Polynomial Click Through Rate Model Accuracy: {poly_ctr_score:.2f}%')

# Predict and evaluate the polynomial model for conversion_rate
y_pred_poly_cr = poly_model_cr.predict(X_test_cr)
poly_cr_score = poly_model_cr.score(X_test_cr, y_test_cr) * 100
print(f'Polynomial Conversion Rate Model Accuracy: {poly_cr_score:.2f}%')

Polynomial Click Through Rate Model Accuracy: -152689.76%
Polynomial Conversion Rate Model Accuracy: -698619.23%


In [89]:
from sklearn.neural_network import MLPRegressor

# Train the neural network model for click_through_rate
nn_ctr = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
nn_ctr.fit(X_train_ctr, y_train_ctr)

# Train the neural network model for conversion_rate
nn_cr = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
nn_cr.fit(X_train_cr, y_train_cr)

# Predict and evaluate the neural network model for click_through_rate
y_pred_nn_ctr = nn_ctr.predict(X_test_ctr)
nn_ctr_score = nn_ctr.score(X_test_ctr, y_test_ctr) * 100
print(f'Neural Network Click Through Rate Model Accuracy: {nn_ctr_score:.2f}%')

# Predict and evaluate the neural network model for conversion_rate
y_pred_nn_cr = nn_cr.predict(X_test_cr)
nn_cr_score = nn_cr.score(X_test_cr, y_test_cr) * 100
print(f'Neural Network Conversion Rate Model Accuracy: {nn_cr_score:.2f}%')

Neural Network Click Through Rate Model Accuracy: -294658607.99%
Neural Network Conversion Rate Model Accuracy: -724651998.93%


In [90]:
import pandas as pd
import numpy as np

# Get the coefficients of the linear regression model for click_through_rate
coefficients_ctr = model_ctr.coef_

# Get the coefficients of the linear regression model for conversion_rate
coefficients_cr = model_cr.coef_

# Create a DataFrame to store the feature names and their relevance for click_through_rate
relevance_ctr = pd.DataFrame({
    'Feature': X.columns,
    'Relevance': np.abs(coefficients_ctr)
})

# Create a DataFrame to store the feature names and their relevance for conversion_rate
relevance_cr = pd.DataFrame({
    'Feature': X.columns,
    'Relevance': np.abs(coefficients_cr)
})

# Sort the DataFrames by relevance in descending order
relevance_ctr = relevance_ctr.sort_values(by='Relevance', ascending=False)
relevance_cr = relevance_cr.sort_values(by='Relevance', ascending=False)

# Print the relevance of each column for predicting click_through_rate
print("Relevance of each column for predicting Click Through Rate:")
print(relevance_ctr)

# Print the relevance of each column for predicting conversion_rate
print("Relevance of each column for predicting Conversion Rate:")
print(relevance_cr)

Relevance of each column for predicting Click Through Rate:
                          Feature     Relevance
26             AdTopic_Automobile  5.940383e-02
29                AdTopic_Fashion  5.326989e-02
15        AdPlatformName_Snapchat  3.805990e-02
3             AdPlatformName_Bing  3.726038e-02
31                   AdTopic_Food  3.539511e-02
8            AdPlatformName_JioTV  3.529130e-02
5           AdPlatformName_Google  3.131084e-02
28          AdTopic_Entertainment  2.707958e-02
4         AdPlatformName_Facebook  2.683423e-02
16  AdPlatformName_Sports Channel  2.552322e-02
27              AdTopic_Education  2.534501e-02
13         AdPlatformName_Netflix  2.261111e-02
12   AdPlatformName_Music Channel  2.149003e-02
20         AdPlatformName_YouTube  2.098853e-02
32             AdTopic_Healthcare  1.874538e-02
38                  AdType_Banner  1.770093e-02
35                 AdTopic_Sports  1.735190e-02
21            AdPlatformName_Zee5  1.678884e-02
2     AdPlatformName_Amazon 

In [91]:
import joblib

# Train the linear regression model for click_through_rate
model_ctr = LinearRegression()
model_ctr.fit(X_train_ctr, y_train_ctr)

# Train the linear regression model for conversion_rate
model_cr = LinearRegression()
model_cr.fit(X_train_cr, y_train_cr)

# Save the models using joblib
joblib.dump(model_ctr, '/mnt/c/de/project/Advertisement-Response-Analysis/Backend/api/models/model_ctr.joblib')
joblib.dump(model_cr, '/mnt/c/de/project/Advertisement-Response-Analysis/Backend/api/models/model_cr.joblib')

print("Models saved successfully.")

Models saved successfully.
