<a href="https://colab.research.google.com/github/abishekraja018/SDC-GENAI/blob/main/SYNTHETIC_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ✅ Step 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# ✅ Step 2: Generate synthetic data
np.random.seed(42)
num_samples = 300

square_feet = np.random.randint(800, 3000, size=num_samples)
bedrooms = np.random.randint(1, 6, size=num_samples)
house_age = np.random.randint(0, 50, size=num_samples)

# ✅ Define price category (target variable)
price_category = []

for sqft, beds, age in zip(square_feet, bedrooms, house_age):
    score = sqft * 0.3 + beds * 50 - age * 5
    if score < 600:
        price_category.append(0)  # Low
    elif score < 900:
        price_category.append(1)  # Medium
    else:
        price_category.append(2)  # High

# ✅ Step 3: Create DataFrame
df = pd.DataFrame({
    "SquareFeet": square_feet,
    "Bedrooms": bedrooms,
    "HouseAge": house_age,
    "PriceCategory": price_category
})

print(df.head())

# ✅ Step 4: Prepare data
X = df[["SquareFeet", "Bedrooms", "HouseAge"]]
y = df["PriceCategory"]

# ✅ Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# ✅ Step 6: Train logistic regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=300)
model.fit(X_train, y_train)

# ✅ Step 7: Evaluate model
y_pred = model.predict(X_test)
print("\n✅ Classification Report:\n")
print(classification_report(y_test, y_pred))
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# ✅ Step 8: User input prediction
print("\n🏡 Predict house price category:")
user_sqft = float(input("Enter square feet: "))
user_beds = int(input("Enter number of bedrooms: "))
user_age = int(input("Enter house age: "))

user_input = np.array([[user_sqft, user_beds, user_age]])
pred_class = model.predict(user_input)[0]

labels = {0: "Low (< $300K)", 1: "Medium ($300K–$400K)", 2: "High (> $400K)"}
print(f"\n💰 Predicted Price Category: {labels[pred_class]}")

   SquareFeet  Bedrooms  HouseAge  PriceCategory
0        1660         3         4              1
1        2094         1        46              0
2        1930         2        24              0
3        1895         5         1              1
4        2438         2         9              1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



✅ Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        30
           1       0.73      1.00      0.85        22
           2       1.00      0.12      0.22         8

    accuracy                           0.87        60
   macro avg       0.91      0.70      0.68        60
weighted avg       0.90      0.87      0.83        60

✅ Accuracy: 0.87

🏡 Predict house price category:
Enter square feet: 100
Enter number of bedrooms: 1
Enter house age: 1

💰 Predicted Price Category: Low (< $300K)


