<a href="https://colab.research.google.com/github/ashishdeshp4nde/DSA-/blob/main/DAITutorial2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from statsmodels.stats.diagnostic import linear_rainbow

# Read the tips CSV from its Colab location and preview the first rows.
tips_csv_path = "/content/tips (data for regression problem).csv"
df = pd.read_csv(tips_csv_path)
print(df.head())

# Data Preprocessing
# Encode the text columns as integers so the correlation matrix, the OLS fit
# and the scikit-learn estimators further down can consume them.

# 1 when sex == 'Male', 0 otherwise, so the flag's polarity matches its name.
df['is_male'] = (df['sex'] == 'Male').astype(int)
# 1 when smoker == 'Yes', 0 otherwise.
df['smoking_status'] = (df['smoker'] == 'Yes').astype(int)
# Integer codes from pandas' categorical encoding; they identify the day and
# should be read as labels rather than as a measured quantity.
df['day_label'] = df['day'].astype('category').cat.codes
# Any value other than 'Lunch' or 'Dinner' maps to NaN.
df['meal_period'] = df['time'].map({'Lunch': 0, 'Dinner': 1})
# The raw text columns are redundant once their encoded versions exist.
df = df.drop(columns=['sex', 'smoker', 'day', 'time'])

# Exploratory plots: a pairwise grid over every column, then a correlation heatmap.
sns.pairplot(df)
plt.suptitle("Feature Pair Plot", y=1.02)
plt.show()

# Draw the correlation matrix on a fresh figure, annotated to two decimals.
plt.figure(figsize=(10, 8))
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
heatmap_ax.set_title("Correlation Matrix Heatmap")
plt.show()

# Linearity Check: Rainbow Test
# Fit an OLS model of tip on the encoded predictors (with an added constant
# column) and run the rainbow test on the fitted result.
feature_columns = [
    'total_bill',
    'size',
    'is_male',
    'smoking_status',
    'day_label',
    'meal_period',
]
features = df[feature_columns]
target = df['tip']

features_with_const = sm.add_constant(features)
ols_spec = sm.OLS(target, features_with_const)
model = ols_spec.fit()

rainbow_stat, rainbow_p_val = linear_rainbow(model)
print(f"Rainbow Test Statistic: {rainbow_stat:.4f}, p-value: {rainbow_p_val:.4f}")

# Residuals of the OLS fit against its fitted values, with a blue LOWESS trend
# line and a dashed gray reference line at zero.
plt.figure(figsize=(8, 6))
resid_ax = sns.residplot(
    x=model.fittedvalues,
    y=model.resid,
    lowess=True,
    line_kws={'color': 'blue'},
)
resid_ax.set_title("Residuals vs. Fitted Values")
resid_ax.set_xlabel("Fitted Values")
resid_ax.set_ylabel("Residuals")
resid_ax.axhline(0, color='gray', linestyle='--')
plt.show()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
X_train_data, X_test_data, y_train_data, y_test_data = split_parts

# Define the candidate regressors (they are fitted later, in the evaluation loop).
# Keys are the display names that appear in the results table.
model_dict = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    # Seeded like the forest below so repeated runs give the same tree.
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=0),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=0),
    # SVR (RBF kernel) and KNN work on distances between samples, so the
    # features are standardized first; otherwise the column with the largest
    # numeric range dominates. The scaler is fitted inside the pipeline, i.e.
    # only on whatever data is passed to fit().
    "SVR": make_pipeline(StandardScaler(), SVR(kernel='rbf')),
    "KNN Regressor": make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=5)),
}

# Store model evaluation metrics
# Fit each candidate on the training split and score it on the held-out split.
# The loop variable is `estimator` rather than `model` so that the fitted
# statsmodels OLS result bound to `model` earlier in the script is not
# overwritten by the last scikit-learn estimator.
evaluation_results = []
for model_name, estimator in model_dict.items():
    estimator.fit(X_train_data, y_train_data)
    predictions = estimator.predict(X_test_data)
    r2 = r2_score(y_test_data, predictions)
    mae = mean_absolute_error(y_test_data, predictions)
    mse = mean_squared_error(y_test_data, predictions)
    # One row per model; the keys become the columns of the results table.
    evaluation_results.append({
        "Model": model_name,
        "R² Score": r2,
        "MAE": mae,
        "MSE": mse,
    })

# Tabulate the metrics, best R² first, and print the table.
results_frame = pd.DataFrame(evaluation_results)
results_frame = results_frame.sort_values(by="R² Score", ascending=False)
print(results_frame)

# After the descending sort, the first row holds the highest R² Score.
top_model = results_frame["Model"].iloc[0]
print(f"\nThe model with the highest R² Score is: {top_model}")
