In [21]:
# Import necessary libraries
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Function to preprocess the data
def preprocess_data(df):
    """
    Encode the categorical columns 'sex', 'smoker', 'day' and 'time' as integers.

    The input frame is not modified: a copy is encoded and returned. Mapping
    in place would make a second call on the same frame turn every value into
    NaN, because the string labels are already gone after the first pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'sex', 'smoker', 'day' and 'time'.

    Returns
    -------
    pandas.DataFrame
        A new frame with those four columns replaced by their integer codes.
        Any label that is absent from the mapping becomes NaN.

    Raises
    ------
    KeyError
        If one of the four expected columns is missing.
    """
    # Label -> code tables, applied in the order the columns are listed here.
    encodings = {
        'sex': {'Female': 0, 'Male': 1},
        'smoker': {'No': 0, 'Yes': 1},
        'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3},
        'time': {'Lunch': 0, 'Dinner': 1},
    }
    encoded = df.copy()  # keep the caller's frame intact
    for column, mapping in encodings.items():
        encoded[column] = encoded[column].map(mapping)
    return encoded

# Function to split the data into training and testing sets
def split_data(df, *, test_size=0.2, random_state=42):
    """
    Separate the 'tip' target from the features and split into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'tip' column; every other column is used as a feature.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.2, the value that was
        previously hard-coded.
    random_state : int or None, optional
        Forwarded to ``train_test_split`` so the shuffle is reproducible.
        Defaults to 42, the value that was previously hard-coded.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by ``train_test_split``.
    """
    features = df.drop(columns='tip')  # everything except the target
    target = df['tip']
    return train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )

# Function to train a Linear Regression model
def train_model(X_train, y_train):
    """
    Fit a LinearRegression estimator on the training features and target.

    Returns the fitted estimator.
    """
    # fit() hands back the estimator itself, so creation and fitting chain.
    return LinearRegression().fit(X_train, y_train)

# Function to evaluate the model using Mean Squared Error (MSE)
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data.

    Predicts on ``X_test`` with ``model.predict`` and returns the Mean Squared
    Error between ``y_test`` and those predictions.
    """
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

# Load seaborn's 'tips' example dataset into a module-level variable.
# NOTE(review): none of the visible cells read `tips` (each test loads its own
# copy), and load_dataset may need network access on first use - confirm this
# global is still required.
tips = sns.load_dataset("tips")


In [22]:
# Import pytest for testing
import pytest

# Test for preprocess_data function
def test_preprocess_data():
    """
    Check that preprocess_data encodes every categorical column completely.

    Merely asserting that one string label is absent would also pass if the
    mapping silently produced NaN, so each column is checked for both missing
    values and for containing nothing but its expected integer codes.
    """
    df = preprocess_data(sns.load_dataset("tips"))
    expected_codes = {
        'sex': {0, 1},
        'smoker': {0, 1},
        'day': {0, 1, 2, 3},
        'time': {0, 1},
    }
    for column, codes in expected_codes.items():
        values = df[column]
        # An unmapped label shows up as NaN after Series.map.
        assert values.notna().all(), f"{column.capitalize()} column contains unmapped values"
        assert set(values.unique()) <= codes, f"{column.capitalize()} column not processed correctly"

# Test for split_data function
def test_split_data():
    """Check that split_data produces non-empty training and test feature sets."""
    prepared = preprocess_data(sns.load_dataset("tips"))
    train_features, test_features, _train_target, _test_target = split_data(prepared)
    # Only the feature partitions are inspected; the targets are unpacked and ignored.
    assert len(train_features) > 0, "Training set is empty"
    assert len(test_features) > 0, "Test set is empty"

# Test for train_model function
def test_train_model():
    """Check that train_model returns a LinearRegression instance."""
    prepared = preprocess_data(sns.load_dataset("tips"))
    # The test partitions are not needed here, only the training pair.
    train_features, _test_features, train_target, _test_target = split_data(prepared)
    fitted = train_model(train_features, train_target)
    assert isinstance(fitted, LinearRegression), "Model is not a Linear Regression model"

# Test for evaluate_model function
def test_evaluate_model():
    """Check that evaluate_model reports a non-negative Mean Squared Error."""
    prepared = preprocess_data(sns.load_dataset("tips"))
    train_features, test_features, train_target, test_target = split_data(prepared)
    fitted = train_model(train_features, train_target)
    mse = evaluate_model(fitted, test_features, test_target)
    # A mean of squared residuals can never be below zero.
    assert mse >= 0, "Mean Squared Error should be non-negative"

# Call each test function directly, in definition order. A failing assert
# raises and stops the run before the success message is printed.
for run_check in (
    test_preprocess_data,
    test_split_data,
    test_train_model,
    test_evaluate_model,
):
    run_check()

print("All tests passed!")


All tests passed!
