<a href="https://colab.research.google.com/github/alainrafiki/datasciencedemos/blob/main/ridge_and_lasso_regression_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# **1. Synthetic Dataset Creation**


def generate_synthetic_translation_data(num_rows=10000, num_features=30):
    """
    Generates a synthetic dataset with features related to Bible translation.

    Args:
        num_rows: Number of rows in the dataset.
        num_features: Number of features in the dataset.

    Returns:
        pandas DataFrame: Synthetic dataset with features and target.
    """

    data = pd.DataFrame()

    # Generate some linguistic features (example)
    data['lexical_diversity'] = np.random.uniform(0.5, 0.8, num_rows)
    data['avg_sentence_length'] = np.random.randint(10, 30, num_rows)
    data['noun_ratio'] = np.random.uniform(0.2, 0.4, num_rows)

    # Generate some cross-lingual features (example)
    data['word_alignment_score'] = np.random.uniform(0.7, 0.95, num_rows)
    data['lexical_overlap'] = np.random.uniform(0.5, 0.8, num_rows)

    # Generate some translation process features (example)
    data['translator_experience'] = np.random.randint(1, 10, num_rows)  # Simulate experience levels
    data['translation_time'] = np.random.randint(30, 180, num_rows)  # Simulate translation time (in minutes)

    # Generate a synthetic quality score (replace with your actual quality assessment method)
    data['quality_score'] = (
        0.5 * data['word_alignment_score']
        + 0.3 * data['lexical_diversity']
        + 0.2 * (1 / data['translation_time'])  # Inversely proportional to time
        + np.random.normal(0, 0.1, num_rows)  # Add some noise
    )

    # Generate remaining features randomly
    for i in range(num_features - len(data.columns)):
        data[f'feature_{i}'] = np.random.rand(num_rows)

    return data

# Generate the synthetic dataset
df = generate_synthetic_translation_data()

In [3]:
# **2. Data Preparation**

X = df.drop('quality_score', axis=1)  # Features
y = df['quality_score']             # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# **3. Ridge Regression**

# Define a parameter grid for alpha values
ridge_param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

# Create a Ridge Regression model
ridge_model = Ridge()

# Perform grid search with cross-validation
ridge_grid_search = GridSearchCV(estimator=ridge_model,
                                param_grid=ridge_param_grid,
                                cv=5,
                                scoring='neg_mean_squared_error')
ridge_grid_search.fit(X_train, y_train)

# Get the best alpha value
best_ridge_alpha = ridge_grid_search.best_params_['alpha']
print(f"Best Ridge Alpha: {best_ridge_alpha}")

# Train the Ridge model with the best alpha
best_ridge_model = Ridge(alpha=best_ridge_alpha)
best_ridge_model.fit(X_train, y_train)

# Make predictions
ridge_predictions = best_ridge_model.predict(X_test)
print(f"Ridge Predictions: {ridge_predictions}")

# Evaluate the model
ridge_mse = mean_squared_error(y_test, ridge_predictions)
print(f"Ridge Regression MSE: {ridge_mse}")


Best Ridge Alpha: 0.1
Ridge Predictions: [0.56789046 0.60626402 0.65480948 ... 0.57536661 0.56820461 0.57932912]
Ridge Regression MSE: 0.010152787550912245


In [5]:
# **4. Lasso Regression**

# Define a parameter grid for alpha values
lasso_param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0]}

# Create a Lasso Regression model
lasso_model = Lasso()

# Perform grid search with cross-validation
lasso_grid_search = GridSearchCV(estimator=lasso_model,
                                param_grid=lasso_param_grid,
                                cv=5,
                                scoring='neg_mean_squared_error')
lasso_grid_search.fit(X_train, y_train)

# Get the best alpha value
best_lasso_alpha = lasso_grid_search.best_params_['alpha']
print(f"Best Lasso Alpha: {best_lasso_alpha}")

# Train the Lasso model with the best alpha
best_lasso_model = Lasso(alpha=best_lasso_alpha)
best_lasso_model.fit(X_train, y_train)

# Make predictions
lasso_predictions = best_lasso_model.predict(X_test)
print(f"Lasso Predictions: {lasso_predictions}")

# Evaluate the model
lasso_mse = mean_squared_error(y_test, lasso_predictions)
print(f"Lasso Regression MSE: {lasso_mse}")

Best Lasso Alpha: 0.001
Lasso Predictions: [0.57801794 0.60777354 0.63668328 ... 0.58976468 0.58639185 0.59599511]
Lasso Regression MSE: 0.01045077282599916


In [6]:
# **5. Feature Selection (Lasso)**

selected_features = X.columns[best_lasso_model.coef_ != 0]
print("Features selected by Lasso:\n", list(selected_features))
print("That is %.1f features" % len(selected_features) )

Features selected by Lasso:
 ['lexical_diversity', 'avg_sentence_length', 'word_alignment_score', 'translation_time']
That is 4.0 features


In [7]:
# Preprare a little function for pretty printing a long list of features
import textwrap

def print_wrapped_list(title, my_list, width=6):
    print(title)
    wrapped_list = textwrap.wrap(', '.join(map(str, my_list)), width=width*12)
    for line in wrapped_list:
      print(line)
    print("That is %.1f features" % len(my_list))

In [8]:
# **6. Feature Section (Ridge)**

selected_features_ridge = X.columns[best_ridge_model.coef_ != 0]
print_wrapped_list("Features selected by Ridge:\n", list(selected_features_ridge))

Features selected by Ridge:

lexical_diversity, avg_sentence_length, noun_ratio,
word_alignment_score, lexical_overlap, translator_experience,
translation_time, feature_0, feature_1, feature_2, feature_3, feature_4,
feature_5, feature_6, feature_7, feature_8, feature_9, feature_10,
feature_11, feature_12, feature_13, feature_14, feature_15, feature_16,
feature_17, feature_18, feature_19, feature_20, feature_21
That is 29.0 features
