In [None]:
# 2.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Synthetic data: 100 points scattered around the line y = 4 + 3x
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares fit on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out targets
y_pred = model.predict(X_test)

# Error metrics on the test split; RMSE is derived from MSE
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report every metric with four decimals
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

In [None]:
3.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Two-feature synthetic problem: y = 4 + 3*x1 + 2*x2 + Gaussian noise
np.random.seed(0)
X1 = 2 * np.random.rand(100, 1)
X2 = 2 * np.random.rand(100, 1)
y = 4 + 3 * X1 + 2 * X2 + np.random.randn(100, 1)

# Put both features and the target side by side in one labelled table
data = pd.DataFrame(np.hstack((X1, X2, y)), columns=['X1', 'X2', 'y'])

# Separate predictors from the response and hold out 20% for testing
X = data[['X1', 'X2']]
y = data['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, predict the test split
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Residual = observed minus predicted, one value per test row
residuals = y_test - y_pred

# 1. Linearity: actual vs. predicted target against each feature, one panel per feature
plt.figure(figsize=(12, 5))
for panel, feature in enumerate(['X1', 'X2'], start=1):
    plt.subplot(1, 2, panel)
    plt.scatter(X_test[feature], y_test, color='blue', label='Actual')
    plt.scatter(X_test[feature], y_pred, color='red', label='Predicted')
    plt.title('Linearity Check')
    plt.xlabel(feature)
    plt.ylabel('y')
    plt.legend()
plt.tight_layout()
plt.show()

# 2. Homoscedasticity: residuals should scatter evenly around the zero line
plt.figure(figsize=(6, 5))
plt.scatter(y_pred, residuals)
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs Predicted Values')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()

# 3. Multicollinearity: pairwise correlations of all columns (target included)
correlation_matrix = data.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Test-set error metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

In [None]:
4.

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Generate synthetic regression data (seeded, so the dataset is reproducible)
X, y = make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42)

# Create a DataFrame with one column per feature plus the target
data = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
data['target'] = y

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.drop('target', axis=1), data['target'], test_size=0.2, random_state=42)

# Models to evaluate. The tree-based estimators are randomized, so they get a
# fixed random_state; without it the reported metrics change on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

# Maps model name -> {'Mean Squared Error': ..., 'Mean Absolute Error': ...}
results = {}

# Evaluate each model inside the same scaling pipeline
for model_name, model in models.items():
    # The scaler is fitted on the training split only and re-applied at predict time
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = pipeline.predict(X_test)

    # Calculate error metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Store the results
    results[model_name] = {
        'Mean Squared Error': mse,
        'Mean Absolute Error': mae
    }

# Print the results, one paragraph per model
for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  Mean Squared Error (MSE): {metrics['Mean Squared Error']:.4f}")
    print(f"  Mean Absolute Error (MAE): {metrics['Mean Absolute Error']:.4f}")
    print()

In [None]:
5.

import numpy as np

import pandas as pd

from sklearn.datasets import make_regression

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score

# Single-feature synthetic regression problem with noisy targets
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)

# Wrap the arrays in a labelled table
data = pd.DataFrame(X, columns=['Feature'])
data['Target'] = y

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    data[['Feature']],
    data['Target'],
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares and predict the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Learned parameters: one slope per feature plus the intercept
intercept = model.intercept_
coefficients = model.coef_

# Goodness of fit on the test split
r_squared = r2_score(y_test, y_pred)

# Report slope, intercept and R-squared
print(f"Coefficients: {coefficients[0]:.4f}")
print(f"Intercept: {intercept:.4f}")
print(f"R-squared score: {r_squared:.4f}")

In [None]:
6.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Fetch seaborn's bundled "tips" example dataset
tips = sns.load_dataset('tips')

# Quick look at the first rows
print(tips.head())

# Predict the tip from the bill: one predictor column, one response column
X = tips[['total_bill']]
y = tips['tip']

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the model and score it on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r_squared = r2_score(y_test, y_pred)

# Report slope, intercept and goodness of fit
print(f"Coefficient: {model.coef_[0]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
print(f"R-squared score: {r_squared:.4f}")

# Figure: every observation as a point, test-set predictions as the fitted line
plt.figure(figsize=(10, 6))
sns.scatterplot(data=tips, x='total_bill', y='tip', color='blue', label='Data Points')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Regression Line')

# Titles, axis labels, legend and grid
plt.title('Total Bill vs Tip')
plt.xlabel('Total Bill ($)')
plt.ylabel('Tip ($)')
plt.legend()
plt.grid()
plt.show()

In [None]:
7.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(42)
X = 2 * np.random.rand(100, 1)  # 100 samples, 1 feature
y = 4 + 3 * X + np.random.randn(100, 1)  # y = 4 + 3x + noise

# Create a DataFrame
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['Feature']], data['Target'], test_size=0.2, random_state=42)

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Generate values for the regression line. The grid is wrapped in a DataFrame
# carrying the training column name so predict() sees the same feature names
# the model was fitted with (a bare ndarray triggers a scikit-learn warning).
X_line = pd.DataFrame({'Feature': np.linspace(0, 2, 100)})  # 100 points from 0 to 2
y_line = model.predict(X_line)

# Plot the data points and the regression line
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature'], data['Target'], color='blue', label='Data Points')
plt.plot(X_line['Feature'], y_line, color='red', linewidth=2, label='Regression Line')
plt.title('Linear Regression on Synthetic Data')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid()
plt.show()

# Print model coefficients and intercept.
# The target is a 1-D Series, so coef_ has shape (n_features,) and intercept_
# is a scalar; indexing them as 2-D / 1-D arrays would raise IndexError.
print(f"Coefficient: {model.coef_[0]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")

In [None]:
9.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(42)
X = 2 * np.random.rand(100, 1)  # 100 samples, 1 feature
y = 4 + 3 * X + 1.5 * X**2 + np.random.randn(100, 1)  # Quadratic relationship with noise

# Create a DataFrame
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['Feature']], data['Target'], test_size=0.2, random_state=42)

# Create polynomial features (bias column, x, x^2)
poly_features = PolynomialFeatures(degree=2)
X_poly_train = poly_features.fit_transform(X_train)

# Create and fit the polynomial regression model
model = LinearRegression()
model.fit(X_poly_train, y_train)

# Generate values for the regression curve. The grid is a DataFrame with the
# training column name because the transformer was fitted on a DataFrame;
# passing a bare ndarray makes scikit-learn warn about missing feature names.
X_line = pd.DataFrame({'Feature': np.linspace(0, 2, 100)})  # 100 points from 0 to 2
X_line_poly = poly_features.transform(X_line)  # Transform to polynomial features
y_line = model.predict(X_line_poly)

# Plot the data points and the regression curve
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature'], data['Target'], color='blue', label='Data Points')
plt.plot(X_line['Feature'], y_line, color='red', linewidth=2, label='Polynomial Regression Curve (Degree 2)')
plt.title('Polynomial Regression (Degree 2)')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid()
plt.show()

# Print model coefficients (one per polynomial term) and intercept
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

In [None]:
10.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(42)  # For reproducibility
X = np.random.rand(100, 1) * 10  # 100 random values for X between 0 and 10
y = 2.5 * X + np.random.randn(100, 1) * 2  # y = 2.5 * X + noise

# Create a DataFrame
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['Feature']], data['Target'], test_size=0.2, random_state=42)

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Get the model's coefficient and intercept.
# The target is a 1-D Series, so coef_ has shape (n_features,) and intercept_
# is a plain scalar; the previous coef_[0][0] / intercept_[0] raised IndexError.
coefficient = model.coef_[0]
intercept = model.intercept_

# Print the results
print(f"Coefficient: {coefficient:.4f}")
print(f"Intercept: {intercept:.4f}")

In [None]:
11.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Noisy quadratic data: y = 4 + 3x + 1.5x^2 + Gaussian noise
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + 1.5 * X**2 + np.random.randn(100, 1)

# Labelled table holding the feature and the target
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    data[['Feature']],
    data['Target'],
    test_size=0.2,
    random_state=42,
)

# Candidate polynomial degrees, with per-degree fitted models and test scores
degrees = [1, 2, 3]
models = {}
r2_scores = {}

# One fit per degree: expand the feature, fit OLS, score on the test split
for degree in degrees:
    poly_features = PolynomialFeatures(degree=degree)
    X_poly_train = poly_features.fit_transform(X_train)
    X_poly_test = poly_features.transform(X_test)

    model = LinearRegression()
    model.fit(X_poly_train, y_train)

    y_pred = model.predict(X_poly_test)

    r2 = r2_score(y_test, y_pred)
    models[degree] = model
    r2_scores[degree] = r2

    print(f"Degree {degree} Polynomial Regression R-squared: {r2:.4f}")

# Figure: one fitted curve per degree over an evenly spaced grid on [0, 2]
plt.figure(figsize=(12, 8))
X_line = np.linspace(0, 2, 100).reshape(-1, 1)

for degree in degrees:
    # Expand the plotting grid to the same polynomial terms the model was trained on
    poly_features = PolynomialFeatures(degree=degree)
    X_line_poly = poly_features.fit_transform(X_line)
    y_line = models[degree].predict(X_line_poly)
    plt.plot(X_line, y_line, label=f'Degree {degree} Polynomial')

# Overlay every observation, semi-transparent so the curves stay visible
plt.scatter(data['Feature'], data['Target'], color='blue', label='Data Points', alpha=0.5)

plt.title('Polynomial Regression of Different Degrees')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid()
plt.show()

In [None]:
12.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Two-feature synthetic data: y = 3 + 2*X1 + 1.5*X2 + Gaussian noise
np.random.seed(42)
X1 = np.random.rand(100, 1) * 10  # values in [0, 10)
X2 = np.random.rand(100, 1) * 5   # values in [0, 5)
y = 3 + 2 * X1 + 1.5 * X2 + np.random.randn(100, 1)

# Labelled table with both features and the target
data = pd.DataFrame(
    np.hstack((X1, X2, y)),
    columns=['Feature1', 'Feature2', 'Target'],
)

# Predictors / response, then an 80/20 train/test split
X = data[['Feature1', 'Feature2']]
y = data['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares and predict the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Learned parameters and test-set goodness of fit
intercept = model.intercept_
coefficients = model.coef_
r_squared = r2_score(y_test, y_pred)

# Report the fitted parameters and the score
print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept:.4f}")
print(f"R-squared score: {r_squared:.4f}")

In [None]:
13.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(42)  # For reproducibility
X = 2 * np.random.rand(100, 1)  # 100 random values for X between 0 and 2
y = 4 + 3 * X + np.random.randn(100, 1)  # y = 4 + 3x + noise

# Create a DataFrame
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['Feature']], data['Target'], test_size=0.2, random_state=42)

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Generate values for the regression line. The grid is wrapped in a DataFrame
# carrying the training column name so predict() sees the same feature names
# the model was fitted with (a bare ndarray triggers a scikit-learn warning).
X_line = pd.DataFrame({'Feature': np.linspace(0, 2, 100)})  # 100 points from 0 to 2
y_line = model.predict(X_line)

# Plot the data points and the regression line
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature'], data['Target'], color='blue', label='Data Points')
plt.plot(X_line['Feature'], y_line, color='red', linewidth=2, label='Regression Line')
plt.title('Linear Regression on Synthetic Data')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid()
plt.show()

# Print model coefficients and intercept.
# The target is a 1-D Series, so coef_ has shape (n_features,) and intercept_
# is a scalar; indexing them as 2-D / 1-D arrays would raise IndexError.
print(f"Coefficient: {model.coef_[0]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")

In [None]:
14.

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Load your dataset
data = pd.read_csv('your_dataset.csv')  # Replace with your dataset path

# Display the first few rows of the dataset
print(data.head())

# Create dummy variables for categorical features if necessary
# Example: data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

# Define the independent variables
X = data[['Feature1', 'Feature2', 'Feature3']]  # Replace with your feature names

# variance_inflation_factor regresses one column on all the others and does
# NOT add an intercept itself. Without a constant column the auxiliary
# regressions are forced through the origin and the VIFs come out inflated,
# so prepend one explicitly (has_constant='add' always inserts it at index 0).
X_with_const = add_constant(X, has_constant='add')

# Create a DataFrame to hold VIF values
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# Calculate VIF for each real feature; index 0 is the constant and is skipped
vif_data["VIF"] = [
    variance_inflation_factor(X_with_const.values, i)
    for i in range(1, X_with_const.shape[1])
]

# Display the VIF values
print(vif_data)

In [None]:
15.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(42)  # For reproducibility
X = 2 * np.random.rand(100, 1)  # 100 random values for X between 0 and 2
y = 1 + 2 * X + 3 * X**2 - 4 * X**3 + 5 * X**4 + np.random.randn(100, 1) * 2  # Polynomial relationship with noise

# Create a DataFrame
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['Feature']], data['Target'], test_size=0.2, random_state=42)

# Create polynomial features of degree 4
poly_features = PolynomialFeatures(degree=4)
X_poly_train = poly_features.fit_transform(X_train)

# Create and fit the polynomial regression model
model = LinearRegression()
model.fit(X_poly_train, y_train)

# Generate values for the regression curve. The grid is a DataFrame with the
# training column name because the transformer was fitted on a DataFrame;
# passing a bare ndarray makes scikit-learn warn about missing feature names.
X_line = pd.DataFrame({'Feature': np.linspace(0, 2, 100)})  # 100 points from 0 to 2
X_line_poly = poly_features.transform(X_line)  # Transform to polynomial features
y_line = model.predict(X_line_poly)

# Plot the data points and the regression curve
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature'], data['Target'], color='blue', label='Data Points')
plt.plot(X_line['Feature'], y_line, color='red', linewidth=2, label='Polynomial Regression Curve (Degree 4)')
plt.title('Polynomial Regression (Degree 4)')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid()
plt.show()

# Print model coefficients (one per polynomial term) and intercept
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

In [None]:
16.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Two-feature synthetic data: y = 3 + 2*X1 + 1.5*X2 + noise (std 2)
np.random.seed(42)
X1 = np.random.rand(100, 1) * 10  # values in [0, 10)
X2 = np.random.rand(100, 1) * 5   # values in [0, 5)
y = 3 + 2 * X1 + 1.5 * X2 + np.random.randn(100, 1) * 2

# Labelled table with both features and the target
data = pd.DataFrame(
    np.hstack((X1, X2, y)),
    columns=['Feature1', 'Feature2', 'Target'],
)

# Predictors / response, then an 80/20 train/test split
X = data[['Feature1', 'Feature2']]
y = data['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Two-step pipeline: standardize the features, then fit ordinary least squares
steps = [
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps)

# Train on the training split and predict the held-out rows
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Test-set goodness of fit
r_squared = r2_score(y_test, y_pred)
print(f"R-squared score: {r_squared:.4f}")

In [None]:
17.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(42)  # For reproducibility
X = 2 * np.random.rand(100, 1)  # 100 random values for X between 0 and 2
y = 1 - 2 * X + 3 * X**2 - 4 * X**3 + np.random.randn(100, 1) * 2  # Polynomial relationship with noise

# Create a DataFrame
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['Feature']], data['Target'], test_size=0.2, random_state=42)

# Create polynomial features of degree 3
poly_features = PolynomialFeatures(degree=3)
X_poly_train = poly_features.fit_transform(X_train)

# Create and fit the polynomial regression model
model = LinearRegression()
model.fit(X_poly_train, y_train)

# Generate values for the regression curve. The grid is a DataFrame with the
# training column name because the transformer was fitted on a DataFrame;
# passing a bare ndarray makes scikit-learn warn about missing feature names.
X_line = pd.DataFrame({'Feature': np.linspace(0, 2, 100)})  # 100 points from 0 to 2
X_line_poly = poly_features.transform(X_line)  # Transform to polynomial features
y_line = model.predict(X_line_poly)

# Plot the data points and the regression curve
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature'], data['Target'], color='blue', label='Data Points')
plt.plot(X_line['Feature'], y_line, color='red', linewidth=2, label='Polynomial Regression Curve (Degree 3)')
plt.title('Polynomial Regression (Degree 3)')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid()
plt.show()

# Print model coefficients (one per polynomial term) and intercept
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

In [None]:
18.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Five-feature synthetic data: y = 5 + X @ coefficients + noise (std 2)
np.random.seed(42)
n_samples = 100
X = np.random.rand(n_samples, 5) * 10  # every feature drawn from [0, 10)
coefficients = np.array([1.5, -2.0, 3.0, 0.5, -1.0])  # true slope of each feature
y = 5 + X @ coefficients + np.random.randn(n_samples) * 2

# Labelled table: Feature1..Feature5 plus the target
data = pd.DataFrame(X, columns=[f'Feature{i}' for i in range(1, 6)])
data['Target'] = y

# 80/20 train/test split over all five feature columns
X_train, X_test, y_train, y_test = train_test_split(
    data[['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']],
    data['Target'],
    test_size=0.2,
    random_state=42,
)

# Fit the multiple linear regression and predict the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-set goodness of fit
r_squared = r2_score(y_test, y_pred)

# Report the score and the fitted parameters
print(f"R-squared score: {r_squared:.4f}")
print(f"Model coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_:.4f}")

In [None]:
19.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Generate synthetic data
np.random.seed(42)  # For reproducibility
X = 2 * np.random.rand(100, 1)  # 100 random values for X between 0 and 2
y = 3 + 4 * X + np.random.randn(100, 1)  # y = 3 + 4x + noise

# Create a DataFrame
data = pd.DataFrame(np.hstack((X, y)), columns=['Feature', 'Target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['Feature']], data['Target'], test_size=0.2, random_state=42)

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Generate values for the regression line. The grid is wrapped in a DataFrame
# carrying the training column name so predict() sees the same feature names
# the model was fitted with (a bare ndarray triggers a scikit-learn warning).
X_line = pd.DataFrame({'Feature': np.linspace(0, 2, 100)})  # 100 points from 0 to 2
y_line = model.predict(X_line)

# Plot the data points and the regression line
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature'], data['Target'], color='blue', label='Data Points')
plt.plot(X_line['Feature'], y_line, color='red', linewidth=2, label='Regression Line')
plt.title('Linear Regression on Synthetic Data')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid()
plt.show()

# Print model coefficients and intercept.
# The target is a 1-D Series, so coef_ has shape (n_features,) and intercept_
# is a scalar; indexing them as 2-D / 1-D arrays would raise IndexError.
print(f"Coefficient: {model.coef_[0]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")

In [None]:
20.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Generate synthetic data
np.random.seed(42)  # For reproducibility
n_samples = 100
X1 = np.random.rand(n_samples) * 10  # Feature 1: 100 random values between 0 and 10
X2 = np.random.rand(n_samples) * 5   # Feature 2: 100 random values between 0 and 5
X3 = np.random.rand(n_samples) * 20  # Feature 3: 100 random values between 0 and 20

# True coefficients for Feature1, Feature2 and Feature3
coefficients = np.array([1.5, -2.0, 3.0])
# Generate target variable with some noise. The slopes are read from
# `coefficients` (previously declared but unused) so the documented values and
# the ones actually applied cannot drift apart: y = 5 + 1.5*X1 - 2.0*X2 + 3.0*X3 + noise
y = (
    5
    + coefficients[0] * X1
    + coefficients[1] * X2
    + coefficients[2] * X3
    + np.random.randn(n_samples) * 2
)

# Create a DataFrame
data = pd.DataFrame({
    'Feature1': X1,
    'Feature2': X2,
    'Feature3': X3,
    'Target': y
})

# Split the data into training and testing sets
X = data[['Feature1', 'Feature2', 'Feature3']]
y = data['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the multiple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate R-squared score
r_squared = r2_score(y_test, y_pred)

# Print the R-squared score and model coefficients
print(f"R-squared score: {r_squared:.4f}")
print(f"Model coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_:.4f}")

In [None]:
21.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
import joblib

# Single-feature synthetic regression problem with noisy targets
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train ordinary least squares on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Persist the fitted estimator to disk with joblib
model_filename = 'linear_regression_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

# Read the estimator back from the same file
loaded_model = joblib.load(model_filename)
print("Model loaded successfully.")

# Predict with the restored estimator and show the raw predictions
y_pred = loaded_model.predict(X_test)
print("Predictions:", y_pred)

# Optional evaluation of the restored estimator on the held-out rows
from sklearn.metrics import mean_squared_error, r2_score

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

In [None]:
22.

import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Load the tips dataset
tips = sns.load_dataset('tips')

# Display the first few rows of the dataset
print(tips.head())

# One-hot encode the categorical features.
# dtype=float is required: recent pandas returns boolean dummy columns by
# default, and a frame mixing bool with numeric columns converts to an object
# array, which statsmodels' OLS rejects with a ValueError.
tips_encoded = pd.get_dummies(
    tips,
    columns=['sex', 'smoker', 'day', 'time'],
    drop_first=True,
    dtype=float,
)

# Define the independent variables (X) and the dependent variable (y)
X = tips_encoded.drop('total_bill', axis=1)  # Independent variables
y = tips_encoded['total_bill']  # Dependent variable

# Add a constant to the independent variables for the intercept
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())

# Optional: Visualize the relationship between total_bill and other features
sns.scatterplot(data=tips, x='total_bill', y='tip', hue='sex')
plt.title('Total Bill vs Tip by Sex')
plt.xlabel('Total Bill')
plt.ylabel('Tip')
plt.show()

In [None]:
23.

# This cell previously had no imports and no data of its own: `Ridge` was never
# imported anywhere in the notebook (NameError on a fresh kernel) and the
# train/test arrays were whatever an earlier cell had left behind.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Recreate the single-feature dataset and split used by the joblib cell (21),
# i.e. the arrays this cell picked up when the notebook was run top to bottom.
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Ridge Regression model
ridge_reg = Ridge(alpha=1.0)  # You can adjust alpha for regularization strength
ridge_reg.fit(X_train, y_train)

# Make predictions
y_pred_ridge = ridge_reg.predict(X_test)

# Calculate R-squared score
r2_ridge = r2_score(y_test, y_pred_ridge)

# Print coefficients and R-squared score
print("Ridge Regression Coefficients:", ridge_reg.coef_)
print("Ridge Regression Intercept:", ridge_reg.intercept_)
print("Ridge Regression R-squared:", r2_ridge)

In [None]:
24.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fix the global NumPy seed so the synthetic data is reproducible
np.random.seed(42)

# Single-feature synthetic data: y = 2.5 * x + noise (std 2)
X = np.random.rand(100, 1) * 10
y = 2.5 * X.squeeze() + np.random.randn(100) * 2

# Optional: tabular view of the data for a quick look
data = pd.DataFrame(data={'Feature': X.squeeze(), 'Target': y})
print(data.head())

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Unfitted estimator; cross_val_score clones it for every fold
model = LinearRegression()

# 5-fold cross-validation on the training split. scikit-learn reports the
# negated MSE (higher is better), so flip the sign to get ordinary MSE values.
cv_scores = -cross_val_score(
    model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Per-fold errors and their average
print("Cross-Validation Mean Squared Errors:", cv_scores)
print("Mean CV MSE:", np.mean(cv_scores))

# Final fit on the whole training split, then predict the held-out rows
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out error
test_mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error:", test_mse)

# Fitted parameters
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

In [None]:
25.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Seed NumPy's global RNG so the synthetic sample is reproducible
np.random.seed(42)

# Synthetic data: 100 samples of one feature scaled to [0, 10);
# the target is 2.5 * feature^2 plus Gaussian noise with standard deviation 5
X = 10 * np.random.rand(100, 1)
y = (X.squeeze() ** 2) * 2.5 + 5 * np.random.randn(100)

# Hold out 20% of the samples as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Polynomial degrees to compare, and the test R-squared obtained for each
degrees = list(range(1, 6))
r2_scores = []

for degree in degrees:
    # Learn the polynomial expansion on the training split only,
    # then apply the same expansion to both splits
    poly = PolynomialFeatures(degree=degree).fit(X_train)
    X_poly_train = poly.transform(X_train)
    X_poly_test = poly.transform(X_test)

    # Ordinary least squares on the expanded features
    model = LinearRegression().fit(X_poly_train, y_train)
    y_pred = model.predict(X_poly_test)

    # Score on the held-out split and record it for the plot below
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    print(f"Degree: {degree}, R-squared: {r2:.4f}")

# Optional: test R-squared as a function of polynomial degree
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(degrees, r2_scores, marker='o')
ax.set_title('R-squared Scores for Polynomial Regression Models')
ax.set_xlabel('Polynomial Degree')
ax.set_ylabel('R-squared Score')
ax.set_xticks(degrees)
ax.grid()
plt.show()


**Theoretical**

1. R-squared, also known as the coefficient of determination, is a statistical measure that represents the proportion of the variance for a dependent variable that is explained by one or more independent variables in a regression model.

2. Linear regression is a widely used statistical method for modeling the relationship between a dependent variable and one or more independent variables. However, for the results of a linear regression analysis to be valid, certain assumptions must be met. Here are the key assumptions of linear regression:

  1. Linearity.
  2. Independence.
  3. Homoscedasticity.
  4. Normality of Residuals.
  5. No Multicollinearity.
  6. No Autocorrelation.

3. R-squared and Adjusted R-squared are both statistical measures used to evaluate the goodness of fit of a regression model, but they serve slightly different purposes and have different interpretations. Here’s a breakdown of the differences between the two:

  1. Use R-squared when you want a simple measure of how well your model explains the variability in the dependent variable.
  
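  2. Use Adjusted R-squared when you want to compare models with different numbers of predictors or when you want to avoid the pitfalls of overfitting: unlike R-squared, which never decreases when a predictor is added, Adjusted R-squared penalizes predictors that do not improve the fit.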

4. Mean Squared Error (MSE) is a widely used metric for evaluating the performance of regression models. Here are several reasons why MSE is commonly used:

  1. Quantifies Prediction Error.
  2. Emphasizes Larger Errors.
  3. Differentiability.
  4. Interpretability.
  5. Widely Accepted.
  6. Useful for Model Comparison.
  

5. An R-squared of 0.85 means the model explains 85% of the variance in the dependent variable. The remaining 15% is due to factors not included in the model or to random noise.

6. To check for normality of residuals in linear regression:

  1. Visual Inspection:

    Histogram: Plot a histogram of the residuals. If it resembles a bell curve, it suggests normality.
  
    Q-Q Plot: Create a Q-Q plot of the residuals. If the points fall approximately along a straight diagonal line, it indicates normality.

  2. Statistical Tests:

    Shapiro-Wilk Test: Perform the Shapiro-Wilk test using shapiro.test() in R or scipy.stats.shapiro() in Python. A p-value greater than your significance level (e.g., 0.05) means the test fails to reject normality, i.e., the residuals are consistent with a normal distribution.


7. Multicollinearity occurs when two or more independent variables in a regression model are highly correlated with each other. This means they provide similar information about the dependent variable.

  Impact on Regression:

  1. Unstable Coefficients: Multicollinearity makes the estimated regression coefficients unstable and difficult to interpret. Small changes in the data can lead to large changes in the coefficients.

  2. Reduced Significance: It can inflate the standard errors of the coefficients, making it harder to detect statistically significant relationships between predictors and the outcome.

  3. Difficult Interpretation: Multicollinearity makes it challenging to isolate the individual effects of each predictor on the dependent variable.

8. **Mean Absolute Error (MAE)** is a metric used to measure the average absolute difference between predicted and actual values in a regression model. It provides a measure of the overall prediction accuracy, focusing on the magnitude of errors rather than their direction.

9. In short, the benefits of using an ML pipeline are:

  Reproducibility: Ensures consistent and repeatable results.

  Maintainability: Makes the workflow easier to update and manage.

  Modularity: Allows for easy swapping of components and experimentation.

  Efficiency: Automates the process, saving time and resources.

  Scalability: Enables handling large datasets and complex models.

10. In short, RMSE is considered more interpretable than MSE for these reasons:

  Same Units: RMSE is in the same units as the target variable, making it easier to understand in the context of the problem.

  Relatable to Data: RMSE represents the typical magnitude of the errors (with larger errors weighted more heavily), giving a more intuitive sense of the model's prediction accuracy.

11. Pickling is the process of converting a Python object into a byte stream (serialization) to store it on disk or transfer it over a network. This byte stream can later be converted back into the original Python object (deserialization).

  Usefulness in ML:

      Pickling is extremely useful in machine learning for saving trained models, allowing you to reuse them later without retraining. This saves time and computational resources. You can also use pickling to store intermediate data or complex objects for later use.

12. A high R-squared value indicates that the independent variables in a regression model explain a large proportion of the variance in the dependent variable.

13. Violating linear regression assumptions can lead to:

  1. Inaccurate Estimates: The estimated coefficients and predictions may be biased or inefficient.

  2. Invalid Inferences: Hypothesis tests and confidence intervals may be unreliable, leading to incorrect conclusions about the relationships between variables.

  3. Misleading Predictions: Predictions made by the model may be inaccurate or have larger errors.

  4. Reduced Model Performance: The overall performance of the model may be compromised, affecting its ability to generalize to new data.

14. Here's how to address multicollinearity in regression, in short:

  1. Remove Correlated Variables: Identify and remove one or more of the highly correlated independent variables.

  2. Combine Variables: Create a composite variable by combining the correlated variables (e.g., using principal component analysis).

  3. Regularization: Use regularization techniques like Ridge regression or Lasso regression to shrink the coefficients and reduce the impact of multicollinearity.

  4. Collect More Data: Increasing the sample size can sometimes help reduce the effects of multicollinearity.

15. Here's how feature selection can improve model performance in regression analysis, in short:

  1. Reduces Overfitting: By removing irrelevant features, the model focuses on the most important predictors, reducing the risk of overfitting to noise in the data.

  2. Improves Accuracy: Including only relevant features can lead to more accurate predictions, as the model is not distracted by irrelevant information.
  
  3. Simplifies Model: A simpler model with fewer features is easier to interpret and understand, making it more useful for insights and decision-making.

  4. Reduces Computational Cost: Training and prediction times are reduced with fewer features, making the model more efficient.

16. Adjusted R-squared is calculated using the following formula:

    Adjusted R-squared = 1 - [(1 - R-squared) * (n - 1) / (n - k - 1)]

  Where:

  R-squared: The regular R-squared value.

  n: The number of observations (data points).

  k: The number of predictors (independent variables).

17. MSE squares the errors, giving larger errors disproportionately more weight. Outliers, being extreme values, have large errors, and when squared, these errors become even larger, significantly inflating the MSE.

  Outliers have a big impact on MSE because it amplifies the effect of large errors through squaring.

18. Homoscedasticity means that the variance of the errors (residuals) is constant across all levels of the independent variables. This is a key assumption in linear regression.

  Importance:

  1. Valid inferences: Homoscedasticity ensures that the standard errors of the regression coefficients are unbiased, allowing for valid hypothesis testing and confidence intervals.

  2. Reliable predictions: It helps in producing reliable predictions across the range of independent variables.
  
  3. Efficient estimates: Homoscedasticity leads to efficient estimates of the regression coefficients, meaning they are more precise.

19. Root Mean Squared Error (RMSE) is a metric that measures the typical magnitude of the errors (residuals) in a regression model, weighting large errors more heavily. It is the square root of the Mean Squared Error (MSE).

  RMSE = √(Σ(yi - ŷi)² / n)

  Where:

  yi: Actual value for the ith data point

  ŷi: Predicted value for the ith data point

  n: Total number of data points

20. Pickling can be risky due to potential security vulnerabilities. Unpickling untrusted data can lead to arbitrary code execution, allowing malicious actors to compromise your system.

21. Here are some alternatives to pickling for saving ML models in short:

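  1. Joblib: Offers efficient serialization for objects containing large NumPy arrays, making it suitable for scikit-learn models. Note that joblib is built on pickle, so loading untrusted files carries the same security risk.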

  2. ONNX: Creates a portable, open-source format for representing machine learning models, enabling interoperability between different frameworks.
  
  3. PMML: Provides a standardized XML-based language for describing and exchanging predictive models.
  
  4. Saving model weights directly: Deep learning frameworks often allow saving model weights as separate files (e.g., .h5 for Keras models).


22. Heteroscedasticity is when the variance of the errors (residuals) in a regression model is not constant across all levels of the independent variables. This violates a key assumption of linear regression, which assumes homoscedasticity (constant variance).

  Why is it a problem?

  Heteroscedasticity can lead to:

  1. Inefficient Estimates: The estimated regression coefficients become less precise and reliable.
  
  2. Invalid Inferences: Hypothesis tests and confidence intervals based on these estimates may be misleading.
  
  3. Biased Standard Errors: The usual OLS standard errors are biased (often underestimated), potentially leading to incorrect conclusions about the significance of predictors.

23. Interaction terms allow the model to capture the combined effect of two or more predictors on the outcome variable. This can significantly improve prediction accuracy when the relationship between predictors and the outcome is not simply additive.