In [1]:
# Step 1: Import the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the CSV file.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("insurance.csv")
print(df.head())  # Preview the first rows of the dataset

# Step 3: Define the features and the target.
X = df[["age", "sex", "bmi", "children", "smoker", "region"]]  # Model inputs
y = df["charges"]  # Target variable (insurance charges)

# Step 4: Identify the categorical and numeric columns.
categorical_cols = ["sex", "smoker", "region"]
# Listed for reference; these columns reach the model through the
# transformer's remainder='passthrough' setting below.
numeric_cols = ["age", "bmi", "children"]

# Step 5: Preprocessing -- one-hot encode the categorical columns.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# an all-zero row instead of raising ValueError at predict time. Without it,
# a rare category that lands only in the test split (or shows up in new data)
# would crash the prediction step.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",  # Columns not listed above are passed through unchanged
)

# Step 6: Build the pipeline (preprocessing + model).
# Wrapping both steps in one Pipeline means the encoder is fitted on the
# training data only and then re-applied as-is to the test data.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

# Step 7: Train/test split -- hold out 20% of the rows for evaluation.
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 8: Train the model.
pipeline.fit(X_train, y_train)

# Step 9: Predict on the held-out test data.
y_pred = pipeline.predict(X_test)

# Step 10: Evaluate model performance.
# Note: MSE is in squared units of the target, and R2 is the fraction of the
# target's variance explained by the model -- it is not a classification accuracy.
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R2 Score (Model ki accuracy jesa):", r2_score(y_test, y_pred))

# Step 11 (optional): Compare actual vs. predicted charges for one record.
# .iloc[0] selects the first test row by position; y_test keeps the original
# DataFrame index labels after the shuffle, so label-based access would not work.
print("Actual Charges:", y_test.iloc[0])
print("Predicted Charges:", round(y_pred[0], 2))


   age     sex   bmi  children smoker     region   charges
0   56  female  23.9         3     no  southwest  11997.92
1   46  female  37.7         2    yes  southwest  22693.81
2   32    male  21.8         2    yes  southeast  24054.64
3   60    male  31.2         4     no  southwest  16500.45
4   25    male  15.0         2    yes  southwest  14915.91
Mean Squared Error (MSE): 6161106.570622658
R2 Score (Model ki accuracy jesa): 0.8059066628105519
Actual Charges: 6751.75
Predicted Charges: 11087.76
