In [8]:
import pandas as pd
import numpy as np


# Seed NumPy's global RNG so the synthetic table below is identical on every run.
np.random.seed(42)

n = 100

# Synthetic employee table. Each column is sampled independently of the others,
# so any relationship between salary and the remaining columns is pure chance.
df = pd.DataFrame({
    "age": np.random.randint(22, 50, n),
    "experience": np.random.randint(0, 20, n),
    "salary": np.random.randint(30000, 150000, n),
    "education_level": np.random.choice([1, 2, 3], n)
})

# Bucket years of experience into career stages. The bins are right-inclusive:
# (-1, 1] -> junior, (1, 5] -> mid, (5, inf] -> senior. The result is computed
# once and reused both as a column and as the grouping key further down.
df_stage = pd.cut(
    df["experience"],
    bins=[-1, 1, 5, float("inf")],
    labels=["junior", "mid", "senior"],
)
df["career_stage"] = df_stage

# One-hot encode the stage; this replaces "career_stage" with
# career_stage_junior / career_stage_mid / career_stage_senior.
df = pd.get_dummies(df, columns=["career_stage"])

# Grouping by a categorical key without `observed=` raises a FutureWarning on
# recent pandas. observed=False keeps the historical behaviour: every declared
# category appears in the result, even if it has no rows.
salary_by_stage = df.groupby(df_stage, observed=False)["salary"].mean()
print(salary_by_stage)



experience
junior     97206.166667
mid       105284.235294
senior     91155.619718
Name: salary, dtype: float64


  salary_by_stage = df.groupby(df_stage)["salary"].mean()


In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Reseed NumPy's global generator so this cell rebuilds the same table each run.
np.random.seed(42)
n = 200

# Build the synthetic table, attach the experience bucket, and one-hot encode it
# in one expression. drop_first=True drops the first category ("junior"), so
# only career_stage_mid and career_stage_senior remain as indicator columns.
df = pd.get_dummies(
    pd.DataFrame({
        "age": np.random.randint(22, 50, n),
        "experience": np.random.randint(0, 20, n),
        "salary": np.random.randint(30000, 150000, n),
        "education_level": np.random.choice([1, 2, 3], n),
    }).assign(
        career_stage=lambda frame: pd.cut(
            frame["experience"],
            bins=[-1, 1, 5, np.inf],
            labels=["junior", "mid", "senior"],
        )
    ),
    columns=["career_stage"],
    drop_first=True,
)

# Predictors for the salary regression: the raw numeric columns plus the two
# career-stage indicator columns.
features = ["age", "experience", "education_level", "career_stage_mid", "career_stage_senior"]

X, y = df[features], df["salary"]

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear baseline: fit on the training split, score on the held-out split.
lin = LinearRegression().fit(X_train, y_train)
lin_pred = lin.predict(X_test)
lin_r2, lin_mae = r2_score(y_test, lin_pred), mean_absolute_error(y_test, lin_pred)

# Random forest with a fixed random_state so the ensemble is repeatable.
rf = RandomForestRegressor(n_estimators=300, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_r2, rf_mae = r2_score(y_test, rf_pred), mean_absolute_error(y_test, rf_pred)

# One row per model, one column per metric.
results = pd.DataFrame(
    [
        ("LinearRegression", lin_r2, lin_mae),
        ("RandomForest", rf_r2, rf_mae),
    ],
    columns=["Model", "R2", "MAE"],
)

print(results)



              Model        R2           MAE
0  LinearRegression -0.025314  31449.759167
1      RandomForest -0.326954  34936.828154


In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Reseed so the classification cell regenerates an identical synthetic table.
np.random.seed(42)
n = 200

# Raw columns -> experience bucket -> indicator columns, as one pipeline.
# drop_first=True leaves career_stage_mid and career_stage_senior only.
df = (
    pd.DataFrame({
        "age": np.random.randint(22, 50, n),
        "experience": np.random.randint(0, 20, n),
        "salary": np.random.randint(30000, 150000, n),
        "education_level": np.random.choice([1, 2, 3], n),
    })
    .assign(
        career_stage=lambda frame: pd.cut(
            frame["experience"],
            bins=[-1, 1, 5, np.inf],
            labels=["junior", "mid", "senior"],
        )
    )
    .pipe(pd.get_dummies, columns=["career_stage"], drop_first=True)
)

# Binary target: 1 for rows whose salary is at or above the 75th percentile.
q75 = df["salary"].quantile(0.75)
df["elite"] = df["salary"].ge(q75).astype(int)

# Predictor columns for the "elite" classifier.
features = [
    "age",
    "experience",
    "education_level",
    "career_stage_mid",
    "career_stage_senior",
]
X, y = df[features], df["elite"]

# 80/20 split; stratify=y keeps the share of positive labels the same in the
# training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Share of positive labels across the full data set; useful as a reference
# point when reading the accuracy figures printed later.
pos_rate = y.mean()
print("Elite positive rate:", pos_rate)

# Logistic regression with a raised iteration cap (max_iter=2000).
log = LogisticRegression(max_iter=2000).fit(X_train, y_train)
log_pred = log.predict(X_test)

# Random forest classifier with a fixed random_state for repeatable trees.
rf = RandomForestClassifier(n_estimators=400, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

def metrics(y_true, y_pred, name):
    """Print a classification scorecard for one model.

    Writes the model name, accuracy, precision, recall, F1, the confusion
    matrix and scikit-learn's text classification report to stdout.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        name: Heading printed above the scores.

    Returns:
        None.
    """
    # zero_division=0 makes an undefined ratio (no predicted or no actual
    # positives) come back as 0 instead of the library default handling.
    zero_div = {"zero_division": 0}

    # Labels are reproduced exactly, including the trailing newline that puts
    # the confusion matrix on its own lines.
    scorecard = [
        ("Accuracy:", accuracy_score(y_true, y_pred)),
        ("Precision:", precision_score(y_true, y_pred, **zero_div)),
        ("Recall:", recall_score(y_true, y_pred, **zero_div)),
        ("F1:", f1_score(y_true, y_pred, **zero_div)),
        ("Confusion Matrix:\n", confusion_matrix(y_true, y_pred)),
    ]

    print("\n", name)
    for label, value in scorecard:
        print(label, value)
    print("Report:\n", classification_report(y_true, y_pred, **zero_div))

# Report both classifiers against the same held-out labels, in a fixed order.
for model_name, model_pred in (
    ("LogisticRegression", log_pred),
    ("RandomForestClassifier", rf_pred),
):
    metrics(y_test, model_pred, model_name)


Elite positive rate: 0.25

 LogisticRegression
Accuracy: 0.75
Precision: 0.0
Recall: 0.0
F1: 0.0
Confusion Matrix:
 [[30  0]
 [10  0]]
Report:
               precision    recall  f1-score   support

           0       0.75      1.00      0.86        30
           1       0.00      0.00      0.00        10

    accuracy                           0.75        40
   macro avg       0.38      0.50      0.43        40
weighted avg       0.56      0.75      0.64        40


 RandomForestClassifier
Accuracy: 0.675
Precision: 0.3333333333333333
Recall: 0.3
F1: 0.3157894736842105
Confusion Matrix:
 [[24  6]
 [ 7  3]]
Report:
               precision    recall  f1-score   support

           0       0.77      0.80      0.79        30
           1       0.33      0.30      0.32        10

    accuracy                           0.68        40
   macro avg       0.55      0.55      0.55        40
weighted avg       0.66      0.68      0.67        40



In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tensorflow import keras
from tensorflow.keras import layers

# Reseed so the neural-network cell regenerates an identical synthetic table.
np.random.seed(42)
n = 500

# Raw synthetic columns, then the experience bucket, then indicator columns.
# drop_first=True leaves career_stage_mid and career_stage_senior only.
df = pd.get_dummies(
    pd.DataFrame({
        "age": np.random.randint(22, 50, n),
        "experience": np.random.randint(0, 20, n),
        "salary": np.random.randint(30000, 150000, n),
        "education_level": np.random.choice([1, 2, 3], n),
    }).assign(
        career_stage=lambda frame: pd.cut(
            frame["experience"],
            bins=[-1, 1, 5, np.inf],
            labels=["junior", "mid", "senior"],
        )
    ),
    columns=["career_stage"],
    drop_first=True,
)

# Binary target: 1 when the salary is strictly above the sample median.
df["high_income"] = df["salary"].gt(df["salary"].median()).astype(int)

# Predictor columns for the "high_income" network, pulled out as plain arrays.
features = [
    "age",
    "experience",
    "education_level",
    "career_stage_mid",
    "career_stage_senior",
]
X, y = df[features].values, df["high_income"].values

# 80/20 split; stratify=y keeps the class balance equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise using statistics learned from the training rows only, then apply
# the same transform to the test rows so no test information leaks into scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Seed Python, NumPy and the Keras backend in one call. Seeding NumPy alone
# leaves the backend's RNG untouched, so weight initialisation and batch
# shuffling would otherwise differ on every execution of this cell.
keras.utils.set_random_seed(42)

# Small fully connected binary classifier: two ReLU hidden layers followed by a
# single sigmoid unit that outputs the probability of the positive class.
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
    ],
)

# validation_split carves the validation set from the tail of the training
# arrays, so the held-out test split is never seen during fitting.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    verbose=1,
)

# Ask Keras for a name -> value mapping directly. Zipping model.metrics_names
# with the returned list is unreliable: on Keras 3 metrics_names is
# ['loss', 'compile_metrics'], which truncates and mislabels the results.
# test_metrics is therefore a dict here rather than a list.
test_metrics = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
print(test_metrics)


Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 123ms/step - accuracy: 0.4826 - loss: 0.7021 - precision: 0.4849 - recall: 0.9766 - val_accuracy: 0.5000 - val_loss: 0.6968 - val_precision: 0.4923 - val_recall: 0.8205
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.5012 - loss: 0.6913 - precision: 0.4935 - recall: 0.7093 - val_accuracy: 0.4625 - val_loss: 0.6939 - val_precision: 0.4583 - val_recall: 0.5641
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.5362 - loss: 0.6888 - precision: 0.5454 - recall: 0.5479 - val_accuracy: 0.5000 - val_loss: 0.6926 - val_precision: 0.4889 - val_recall: 0.5641
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.5444 - loss: 0.6880 - precision: 0.5417 - recall: 0.5730 - val_accuracy: 0.5125 - val_loss: 0.6930 - val_precision: 0.5000 - val_recall: 0.5128
Epoch 5/30
[1m10/10[0m [

In [None]:
# If the linear model fails while the random forest and the neural network both perform well, the relationship between the features and the target is non-linear: it likely involves meaningful feature interactions and threshold-based regimes. The target then cannot be modeled as a simple linear combination of the inputs, and the higher-capacity models are capturing structure that the linear model misses.
# Strong performance from both RF and NN would also suggest that the signal-to-noise ratio is decent, i.e. the main limitation was model bias rather than data quality.
# From an ML engineering perspective, that pattern indicates a structured, interaction-heavy tabular problem where understanding feature relationships matters more than the choice of model. The next step should be to study interactions and feature structure (e.g., via feature engineering or interpretability tools) rather than blindly increasing model complexity.
# Note: this scenario is hypothetical for the cells above. Salary is sampled independently of every feature, and the recorded outputs show no model beating a trivial baseline (R2 is negative for both regressors, and the logistic model predicts only the majority class).