In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn import tree
import plotly.express as px
from sklearn.tree import DecisionTreeRegressor, plot_tree
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
!pip install Faker
from faker import Faker
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures



In [None]:
# Seed NumPy's global RNG so every draw below is repeatable
np.random.seed(0)

# Population size and the inclusive upper bound of each simulated counter
num_users = 1000
max_posts, max_friends, max_events, max_points = 10, 15, 5, 50

# One integer per user for each activity counter; randint's upper limit is
# exclusive, hence the `+ 1` to make the configured maximum reachable
posts_counts = np.random.randint(0, max_posts + 1, size=num_users)
friends_counts = np.random.randint(0, max_friends + 1, size=num_users)
events_counts = np.random.randint(0, max_events + 1, size=num_users)

# Points total per user, drawn the same way
points_values = np.random.randint(0, max_points + 1, size=num_users)

# Assemble the raw columns, derive the weighted score and the two engineered
# features, then shuffle the rows and renumber the index from zero
synthetic_data = (
    pd.DataFrame({
        'UserID': range(1, num_users + 1),
        'DistinctPostIDCount': posts_counts,
        'DistinctFriendIDCount': friends_counts,
        'DistinctEventIDCount': events_counts,
        'SumPointsValue': points_values
    })
    .assign(
        # Weighted linear engagement score (weights 0.4 / 0.3 / 0.2 / 0.1)
        CalculatedEngagementScore=lambda df: (
            0.4 * df['DistinctPostIDCount'] +
            0.3 * df['DistinctFriendIDCount'] +
            0.2 * df['DistinctEventIDCount'] +
            0.1 * df['SumPointsValue']
        ),
        # `+ 1` in the denominator avoids division by zero for users with no friends
        PostsPerFriend=lambda df: df['DistinctPostIDCount'] / (df['DistinctFriendIDCount'] + 1),
        EventsTimesPoints=lambda df: df['DistinctEventIDCount'] * df['SumPointsValue'],
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

# Generating the complex engagement score
def generate_complex_engagement_score(row):
    """Return a noisy, non-linear engagement score for a single user record.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'DistinctPostIDCount',
        'DistinctFriendIDCount', 'DistinctEventIDCount' and 'SumPointsValue'
        (e.g. a DataFrame row passed in by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    number
        posts ** 1.5 + 2 * sqrt(friends) + 3 * log1p(events)
        + 1.2 * sqrt(points) + Gaussian noise (mean 0, std 2),
        clamped so it is never below 0.

    Notes
    -----
    The noise is drawn from NumPy's global RNG, so the result depends on the
    current ``np.random`` state.
    """
    posts_term = row['DistinctPostIDCount'] ** 1.5
    friends_term = np.sqrt(row['DistinctFriendIDCount']) * 2
    events_term = np.log1p(row['DistinctEventIDCount']) * 3
    points_term = np.sqrt(row['SumPointsValue']) * 1.2
    noise = np.random.normal(0, 2)  # random perturbation, std dev 2

    raw_score = posts_term + friends_term + events_term + points_term + noise

    # Large negative noise could push the sum below zero; floor it at 0
    return max(0, raw_score)

# Score every user one row at a time (axis=1 hands each row to the scorer),
# then attach the resulting Series as the regression target column
complex_scores = synthetic_data.apply(generate_complex_engagement_score, axis=1)
synthetic_data['ComplexEngagementScore'] = complex_scores

# Persist the full table (without the positional index) next to the notebook
synthetic_data.to_csv('synthetic_data_with_engagement_score.csv', index=False)

# Preview the first rows as the cell output
synthetic_data.head()

In [None]:
# Baseline: ordinary least squares on the four raw activity counters,
# predicting the noisy non-linear ComplexEngagementScore
X = synthetic_data[['DistinctPostIDCount', 'DistinctFriendIDCount', 'DistinctEventIDCount', 'SumPointsValue']]
y = synthetic_data['ComplexEngagementScore']

# Hold out 20% of the rows for evaluation; fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Creating and fitting the model
new_model = LinearRegression()
new_model.fit(X_train, y_train)

# Making predictions on the held-out rows
y_pred_new = new_model.predict(X_test)

# Evaluating the model.
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, whereas np.sqrt works
# on every scikit-learn version.
rmse_new = np.sqrt(mean_squared_error(y_test, y_pred_new))
r2_new = r2_score(y_test, y_pred_new)

# Coefficients (one per column of X, in column order) and intercept
coefficients_new = new_model.coef_
intercept_new = new_model.intercept_

rmse_new, r2_new, coefficients_new, intercept_new

In [None]:
# Plotting the actual vs predicted values for the baseline linear model

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_new, color='red')
# Dashed identity line = perfect prediction. It spans the range of the plotted
# test targets (y_test), matching the Ridge and Lasso plots, rather than the
# full target `y`, which is not what the scatter shows.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted Complex Engagement Scores')
plt.show()

In [None]:
# Degree-2 polynomial expansion of the four raw features
# (bias column, linear terms, squares and pairwise products)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Same test_size and random_state as the baseline split, so the same rows land
# in train and test; y_train / y_test are re-assigned here with that split
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=0)

# Training the model on the expanded features
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Making predictions
y_pred_poly = poly_model.predict(X_test_poly)

# Evaluating the model.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, while np.sqrt works on every version.
rmse_poly = np.sqrt(mean_squared_error(y_test, y_pred_poly))
r2_poly = r2_score(y_test, y_pred_poly)

# Results
rmse_poly, r2_poly

In [None]:
# One panel per raw feature: scatter of the feature against the target with a
# single-feature polynomial fit drawn on top
features = ['DistinctPostIDCount', 'DistinctFriendIDCount', 'DistinctEventIDCount', 'SumPointsValue']

# Polynomial degree used for every per-feature fit. It is defined here because
# the legend label below interpolates `degree`; without it the cell raises NameError.
degree = 2

# Creating a 2x2 grid and flattening it so panels can be indexed 0..3
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()

# Polynomial transformation (re-fitted per feature inside the loop)
poly = PolynomialFeatures(degree=degree)

# The target is the same for every panel, so read it once. A dedicated name is
# used so the notebook-level `X` / `y` from the modelling cells are not overwritten.
target_values = synthetic_data['ComplexEngagementScore'].values

for ax, feature in zip(axes, features):
    # Fresh model for each feature
    model = LinearRegression()

    # Single feature as an (n_samples, 1) column, as scikit-learn expects
    feature_values = synthetic_data[feature].values.reshape(-1, 1)

    # Expand to polynomial terms and fit
    feature_poly = poly.fit_transform(feature_values)
    model.fit(feature_poly, target_values)

    # Predictions on an evenly spaced grid across the observed feature range
    x_range = np.linspace(feature_values.min(), feature_values.max(), 100).reshape(-1, 1)
    x_range_poly = poly.transform(x_range)
    predictions = model.predict(x_range_poly)

    # Plotting
    ax.scatter(feature_values, target_values, color='blue', label='Original Data')
    ax.plot(x_range, predictions, color='red', label=f'Polynomial Degree {degree}')
    ax.set_xlabel(feature)
    ax.set_ylabel('ComplexEngagementScore')
    ax.set_title(f'Effect of Polynomial Transformation on {feature}')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Regularised counterparts of the baseline: Ridge (L2 penalty) and Lasso (L1 penalty)
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=0.1)

# Fitting the models on the raw-feature training split
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# Making predictions on the held-out rows
y_pred_ridge = ridge_model.predict(X_test)
y_pred_lasso = lasso_model.predict(X_test)

# Evaluating the models.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, while np.sqrt works on every version.
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

# Coefficients and intercepts
coefficients_ridge = ridge_model.coef_
intercept_ridge = ridge_model.intercept_
coefficients_lasso = lasso_model.coef_
intercept_lasso = lasso_model.intercept_

# Side-by-side table; the "Linear Regression" row reuses the metrics computed
# in the baseline cell (rmse_new, r2_new, coefficients_new, intercept_new)
results = {
    "Model": ["Linear Regression", "Ridge Regression", "Lasso Regression"],
    "RMSE": [rmse_new, rmse_ridge, rmse_lasso],
    "R2": [r2_new, r2_ridge, r2_lasso],
    "Coefficients": [coefficients_new, coefficients_ridge, coefficients_lasso],
    "Intercept": [intercept_new, intercept_ridge, intercept_lasso]
}

pd.DataFrame(results)

In [None]:
# Ridge Regression: held-out targets against the model's predictions
ridge_fig, ridge_ax = plt.subplots(figsize=(10, 6))
ridge_ax.scatter(y_test, y_pred_ridge, color='green')

# Dashed identity line (perfect prediction) across the test-target range
ridge_span = [y_test.min(), y_test.max()]
ridge_ax.plot(ridge_span, ridge_span, 'k--', lw=2)

ridge_ax.set_xlabel('Actual')
ridge_ax.set_ylabel('Predicted by Ridge')
ridge_ax.set_title('Ridge Regression: Actual vs Predicted')
plt.show()

In [None]:
# Lasso Regression: held-out targets against the model's predictions
lasso_fig, lasso_ax = plt.subplots(figsize=(10, 6))
lasso_ax.scatter(y_test, y_pred_lasso, color='blue')

# Dashed identity line (perfect prediction) across the test-target range
lasso_span = [y_test.min(), y_test.max()]
lasso_ax.plot(lasso_span, lasso_span, 'k--', lw=2)

lasso_ax.set_xlabel('Actual')
lasso_ax.set_ylabel('Predicted by Lasso')
lasso_ax.set_title('Lasso Regression: Actual vs Predicted')
plt.show()

In [None]:
# Regularisation strengths to sweep: nine powers of ten from 1e-4 to 1e4
alphas = np.logspace(-4, 4, 9)

# alpha -> fitted coefficient vector, one mapping per model family
ridge_coefficients, lasso_coefficients = {}, {}

# Fit both models at every alpha in a single pass; the fits are independent,
# so handling them together gives the same coefficients as separate loops
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    ridge_coefficients[alpha] = ridge.coef_

    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    lasso_coefficients[alpha] = lasso.coef_

# Tabulate: build with features as rows, then transpose so each row is an
# alpha and each column a feature
ridge_coefs = pd.DataFrame(ridge_coefficients, index=X.columns).T
lasso_coefs = pd.DataFrame(lasso_coefficients, index=X.columns).T

ridge_coefs, lasso_coefs

In [None]:
# Regularisation strengths for the Ridge sweep: 1e-4 ... 1e4, one per decade
alphas = np.logspace(-4, 4, 9)

# alpha -> coefficient vector of the Ridge model fitted at that alpha
ridge_coefficients = {}
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    ridge_coefficients[alpha] = ridge.coef_

# Rows = alpha, columns = feature (built feature-major, then transposed)
ridge_coefs = pd.DataFrame(ridge_coefficients, index=X_train.columns).T

# One line per feature showing how its coefficient changes as alpha grows
plt.figure(figsize=(10, 6))
for column, coef_path in ridge_coefs.items():
    plt.plot(coef_path.index, coef_path, label=column)
plt.xscale('log')  # alphas span eight orders of magnitude
plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.title('Ridge Coefficients as a Function of the Regularization')
plt.legend()
plt.show()

In [None]:
# alpha -> coefficient vector of the Lasso model fitted at that alpha
# (`alphas` is the grid defined in the Ridge sweep cell)
lasso_coefficients = {}
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    lasso_coefficients[alpha] = lasso.coef_

# Rows = alpha, columns = feature (built feature-major, then transposed)
lasso_coefs = pd.DataFrame(lasso_coefficients, index=X_train.columns).T

# One line per feature showing how its coefficient changes as alpha grows
plt.figure(figsize=(10, 6))
for column, coef_path in lasso_coefs.items():
    plt.plot(coef_path.index, coef_path, label=column)
plt.xscale('log')  # alphas span eight orders of magnitude
plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.title('Lasso Coefficients as a Function of the Regularization')
plt.legend()
plt.show()