In [70]:
import numpy as np 
import pandas as pd


In [71]:
# Load the customer acquisition dataset; the path is relative to the working directory.
df = pd.read_csv('customer_acquisition_data.csv')

In [72]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,customer_id,channel,cost,conversion_rate,revenue
0,1,referral,8.320327,0.123145,4199
1,2,paid advertising,30.450327,0.016341,3410
2,3,email marketing,5.246263,0.043822,3164
3,4,social media,9.546326,0.167592,1520
4,5,referral,8.320327,0.123145,2419


In [73]:
# One-hot encode the categorical `channel` column into integer indicator
# columns named `channel_<value>`.
one_hot_encoded = pd.get_dummies(
    df['channel'],
    prefix='channel',
    dtype=int,
)

In [74]:
# Display the indicator columns produced by the encoding step.
one_hot_encoded


Unnamed: 0,channel_email marketing,channel_paid advertising,channel_referral,channel_social media
0,0,0,1,0
1,0,1,0,0
2,1,0,0,0
3,0,0,0,1
4,0,0,1,0
...,...,...,...,...
795,0,0,0,1
796,1,0,0,0
797,0,0,0,1
798,0,1,0,0


In [75]:
# Attach the indicator columns to the original frame side by side (aligned on the index).
df_encoded = pd.concat([df, one_hot_encoded], axis="columns")

In [76]:
# Display the combined frame: original columns followed by the channel indicators.
df_encoded


Unnamed: 0,customer_id,channel,cost,conversion_rate,revenue,channel_email marketing,channel_paid advertising,channel_referral,channel_social media
0,1,referral,8.320327,0.123145,4199,0,0,1,0
1,2,paid advertising,30.450327,0.016341,3410,0,1,0,0
2,3,email marketing,5.246263,0.043822,3164,1,0,0,0
3,4,social media,9.546326,0.167592,1520,0,0,0,1
4,5,referral,8.320327,0.123145,2419,0,0,1,0
...,...,...,...,...,...,...,...,...,...
795,796,social media,9.546326,0.167592,2813,0,0,0,1
796,797,email marketing,5.246263,0.043822,3439,1,0,0,0
797,798,social media,9.546326,0.167592,2101,0,0,0,1
798,799,paid advertising,30.450327,0.016341,813,0,1,0,0


In [77]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [78]:

# Move the `revenue` column to the last position of df_encoded.
if 'revenue' not in df_encoded.columns:
    print("'revenue' column not found in DataFrame.")
else:
    # pop() removes the column from the frame; assigning it back appends it at the end.
    revenue_column = df_encoded.pop('revenue')
    df_encoded['revenue'] = revenue_column


In [79]:
# Display the frame again to confirm `revenue` is now the final column.
df_encoded

Unnamed: 0,customer_id,channel,cost,conversion_rate,channel_email marketing,channel_paid advertising,channel_referral,channel_social media,revenue
0,1,referral,8.320327,0.123145,0,0,1,0,4199
1,2,paid advertising,30.450327,0.016341,0,1,0,0,3410
2,3,email marketing,5.246263,0.043822,1,0,0,0,3164
3,4,social media,9.546326,0.167592,0,0,0,1,1520
4,5,referral,8.320327,0.123145,0,0,1,0,2419
...,...,...,...,...,...,...,...,...,...
795,796,social media,9.546326,0.167592,0,0,0,1,2813
796,797,email marketing,5.246263,0.043822,1,0,0,0,3439
797,798,social media,9.546326,0.167592,0,0,0,1,2101
798,799,paid advertising,30.450327,0.016341,0,1,0,0,813


In [80]:
# Feature matrix: the two numeric columns plus all four channel indicators.
# NOTE(review): keeping every indicator column presumably makes them sum to 1
# per row (collinear with the model intercept) — consider dropping one; confirm.
X = df_encoded[
    [
        'cost',
        'conversion_rate',
        'channel_email marketing',
        'channel_paid advertising',
        'channel_referral',
        'channel_social media',
    ]
]

# Target vector: the `revenue` column.
y = df_encoded['revenue']


In [81]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [82]:
# Create a linear regression estimator with its default settings.
model = LinearRegression()

# Train on the training split only, leaving the test split unseen.
model.fit(X=X_train, y=y_train)

In [83]:
# Score the held-out split with the fitted model.
y_pred = model.predict(X_test)

# Report the mean squared error first, then the coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)
# The error is very high, so a linear regression model is not well suited to this prediction.

Mean Squared Error: 1682937.4653210626
R-squared: -0.021139505374697


In [84]:
# Two hand-written example observations, one record per row.
# Each record carries example cost / conversion-rate values and a one-hot
# encoded channel (exactly one channel_* indicator set to 1).
test_input = pd.DataFrame([
    {
        # Observation 1: email marketing
        'cost': 15.2,
        'conversion_rate': 0.07,
        'channel_email marketing': 1,
        'channel_paid advertising': 0,
        'channel_referral': 0,
        'channel_social media': 0,
    },
    {
        # Observation 2: paid advertising
        'cost': 25.8,
        'conversion_rate': 0.03,
        'channel_email marketing': 0,
        'channel_paid advertising': 1,
        'channel_referral': 0,
        'channel_social media': 0,
    },
])

In [85]:
# Predict revenue for the hand-built example rows.
predicted_revenue = model.predict(test_input)

print("Predicted Revenue:")
# Number the observations from 1 for display and format each value as dollars.
for obs_number, amount in enumerate(predicted_revenue, start=1):
    print(f"Test Observation {obs_number}: ${amount:.2f}")

Predicted Revenue:
Test Observation 1: $2913.77
Test Observation 2: $2863.90
