In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic-regression classifier on the original dataset and compare
# its accuracy on a held-out slice of the original data with its accuracy on
# the synthetic dataset, then print a short generalization verdict.

# --- Load Datasets ---
# Both CSVs must contain a 'Role' column; it is used as the target below.
original_df = pd.read_csv("/content/drive/MyDrive/Intership task/CareerMap- Mapping Tech Roles With Personality & Skills.csv")
synthetic_df = pd.read_csv("/content/drive/MyDrive/Intership task/synthetic_dataset.csv")

# --- Prepare Original Data ---
X_original = original_df.drop(columns='Role')
y_original = original_df['Role']

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_original, y_original, test_size=0.2, random_state=42
)

# --- Prepare Synthetic Data ---
y_synthetic = synthetic_df['Role']

# --- Align Synthetic Features With the Training Features ---
# A single reindex does three things at once: it adds any training column the
# synthetic data lacks (filled with 0), drops synthetic-only columns, and puts
# the columns in the exact order the model is fitted on.
# NOTE(review): 0 is a placeholder for missing features, not a measured value;
# switch to NaN + imputation if 0 is a meaningful level for those columns.
X_synthetic = synthetic_df.drop(columns='Role').reindex(
    columns=X_train.columns, fill_value=0
)

# --- Train Logistic Regression Model ---
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)

# --- Evaluate on Original Test Data ---
y_pred_original = logreg_model.predict(X_test)
accuracy_original = accuracy_score(y_test, y_pred_original)
print(f"Accuracy on original test data: {accuracy_original:.2f}")

# --- Evaluate on Synthetic Data ---
y_pred_synthetic = logreg_model.predict(X_synthetic)
accuracy_synthetic = accuracy_score(y_synthetic, y_pred_synthetic)
print(f"Accuracy on synthetic data: {accuracy_synthetic:.2f}")

# --- Overfitting Analysis ---
# The first branch is the original rule (> 0.95 on original, < 0.8 on
# synthetic), which implies a gap of more than 0.15. The second branch applies
# that same gap on its own, so a large drop (e.g. 0.90 -> 0.30) is no longer
# reported as "balanced" just because the original accuracy is below 0.95.
MAX_ACCURACY_GAP = 0.15
accuracy_gap = accuracy_original - accuracy_synthetic

if accuracy_original > 0.95 and accuracy_synthetic < 0.8:
    print("\n⚠️ Model likely overfitting: High accuracy on original data but low on synthetic data.")
elif accuracy_gap > MAX_ACCURACY_GAP:
    print(f"\n⚠️ Model may not generalize: accuracy drops by {accuracy_gap:.2f} from original to synthetic data.")
else:
    print("\n✅ Model generalizes better: Performance is more balanced across datasets.")


Accuracy on original test data: 1.00
Accuracy on synthetic data: 1.00

✅ Model generalizes better: Performance is more balanced across datasets.
