In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


df = pd.read_csv("machine_learning_data.csv")
df.head()

Unnamed: 0,Name,Address,Email,Phone Number,Job,Age,Income
0,Jennifer Bishop,"6505 Hood Village Apt. 052\nNew Emma, RI 48298",esmith@example.net,432-645-9548x29305,Theatre director,80,61641
1,Brittany Malone,"54666 Jason Ways\nLake Anthony, GA 56622",hkirby@example.org,765-589-8702,Dramatherapist,68,25690
2,David Marshall,"823 Tapia Landing Suite 049\nWest Laura, AL 21268",seanbanks@example.org,491.277.3936x88200,Television camera operator,48,106626
3,Timothy Alexander,"4538 Myers Curve Suite 214\nFowlerberg, DC 30914",yclark@example.org,(799)207-7556,"Scientist, audiological",62,31689
4,Stephanie Carpenter,"69641 Alexis Centers\nDuranside, MP 13875",melissa75@example.com,487.793.7733,Dispensing optician,88,117039


## Data Cleaning


In [6]:

# Drop unnecessary columns
df = df.drop(['Name', 'Address', 'Email', 'Phone Number'], axis=1)

# Encode categorical variable 'Job' using LabelEncoder
label_encoder = LabelEncoder()
df['Job'] = label_encoder.fit_transform(df['Job'])

In [7]:

# Assuming df is your DataFrame with the data
# If not, you need to load your data into df first

# Split data into features (X) and target (y)
X = df.drop('Income', axis=1)
y = df['Income']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.2f}")

# If you want to see the coefficients of the model
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print("\nModel Coefficients:")
print(coefficients)

# You can now use this model to make predictions on new data
# For example:
# new_data = scaler.transform(new_data)  # Remember to scale new data
# predictions = model.predict(new_data)

Mean Squared Error: 1400074127.07
Root Mean Squared Error: 37417.56
R-squared Score: -0.01

Model Coefficients:
  Feature  Coefficient
0     Job  -732.298673
1     Age   733.539298
