In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# End-to-end toy example: predict salary from experience, education and job
# title with one-hot encoding + ordinary least squares.

# 1. Create a sample dataset directly in code
data = {
    'Experience': [1, 3, 5, 7, 9, 2, 4, 6, 8, 10],
    'Education': ['Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters',
                  'Bachelors', 'Masters', 'PhD', 'Masters', 'PhD'],
    'Job Title': ['Data Analyst', 'Data Scientist', 'ML Engineer', 'Data Analyst', 'Data Scientist',
                  'Data Analyst', 'ML Engineer', 'Data Scientist', 'ML Engineer', 'ML Engineer'],
    'Salary': [40000, 60000, 85000, 50000, 70000, 42000, 72000, 88000, 89000, 95000]
}

df = pd.DataFrame(data)

# 2. Define features and target
X = df[['Experience', 'Education', 'Job Title']]
y = df['Salary']

# 3. Split the RAW data first, so that no preprocessing step ever sees the
#    hold-out rows. (Fitting the encoder on the full frame before splitting
#    leaks information about the test set into the training pipeline.)
#    With 10 rows and test_size=0.2 the hold-out set is only 2 rows, so the
#    metrics printed below are very noisy and should be read as a smoke test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Encode categorical variables.
#    The categorical columns are one-hot encoded; 'Experience' is passed
#    through unchanged. handle_unknown='ignore' makes categories that were not
#    seen during fit encode as all zeros instead of raising at transform time.
column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'), ['Education', 'Job Title'])
    ],
    remainder='passthrough'
)

# Fit on the training split only; the test split is transformed with the
# already-fitted encoder.
X_train = column_transformer.fit_transform(X_train_raw)
X_test = column_transformer.transform(X_test_raw)

# Full encoded matrix, kept under its original name for inspection; it is
# produced by the train-fitted transformer and is not used for fitting.
X_encoded = column_transformer.transform(X)

# 5. Train model
model = LinearRegression()
model.fit(X_train, y_train)

# 6. Predict
y_pred = model.predict(X_test)

# 7. Evaluate
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# 8. Predict for new employee
#    The new row must have the same column names as the training frame so the
#    fitted transformer can select and encode them.
new_employee = pd.DataFrame({
    'Experience': [4],
    'Education': ['Masters'],
    'Job Title': ['Data Scientist']
})

new_encoded = column_transformer.transform(new_employee)
predicted_salary = model.predict(new_encoded)

print("Predicted Salary for New Employee: ${:.2f}".format(predicted_salary[0]))


Mean Squared Error: 111460229.3628806
R² Score: 0.4698681124238735
Predicted Salary for New Employee: $65901.05
