In [14]:
# 1) Imports — standard library first, then third-party packages (alphabetical)
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

print('Libraries imported')

Libraries imported


In [15]:
# 2) Load dataset (ensure `Salary_Data.csv` is in the same folder as this notebook)
DATA_FILE = 'Salary_Data.csv'  # single source of truth for the file name used below

# Keep the try body limited to the read itself so only a missing file is translated.
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError as err:
    # Chain explicitly so the traceback shows the original error as the direct cause
    # rather than as a second failure that happened inside this handler.
    raise FileNotFoundError(f'{DATA_FILE} not found in the working directory') from err

print('Dataset loaded — rows:', len(df))

# Quick peek at the data: first rows, then summary statistics
display(df.head())
display(df.describe())

Dataset loaded — rows: 30


Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [16]:
# 3) Prepare features (X) and target (y)
# Positional convention: the last column is the target, everything before it is a feature.
# X stays 2-D (rows x feature columns); y is the 1-D array taken from the last column.
X, y = df.iloc[:, :-1].to_numpy(), df.iloc[:, -1].to_numpy()
print('X shape:', X.shape)
print('y shape:', y.shape)

X shape: (30, 1)
y shape: (30,)


In [31]:
# 4) Split into training and test sets
# One third of the rows is held out for testing; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)
print('Training samples:', len(X_train))
print('Test samples:', len(X_test))
# Display the held-out feature values as the cell output
X_test

Training samples: 20
Test samples: 10


array([[ 1.5],
       [10.3],
       [ 4.1],
       [ 3.9],
       [ 9.5],
       [ 8.7],
       [ 9.6],
       [ 4. ],
       [ 5.3],
       [ 7.9]])

In [36]:
# 5) Train a Linear Regression model
# fit() returns the fitted estimator, so construction and training can be chained.
regressor = LinearRegression().fit(X_train, y_train)
print('Model trained')
# Show the learned coefficient(s) and intercept as the cell output
regressor.coef_, regressor.intercept_

Model trained


(array([9345.94244312]), np.float64(26816.19224403119))

In [33]:
# 6) Evaluate on the test set (R^2 score and predicted-vs-actual table)
y_pred = regressor.predict(X_test)
r2 = regressor.score(X_test, y_test)
print(f'R^2 on test set: {r2:.4f}')

# Side-by-side table: input feature value, actual target, model prediction
comparison = pd.DataFrame(
    {
        'X_test': X_test.flatten(),
        'y_true': y_test,
    }
).assign(y_pred=y_pred)
display(comparison)

R^2 on test set: 0.9749


Unnamed: 0,X_test,y_true,y_pred
0,1.5,37731.0,40835.105909
1,10.3,122391.0,123079.399408
2,4.1,57081.0,65134.556261
3,3.9,63218.0,63265.367772
4,9.5,116969.0,115602.645454
5,8.7,109431.0,108125.891499
6,9.6,112635.0,116537.239698
7,4.0,55794.0,64199.962017
8,5.3,83088.0,76349.687193
9,7.9,101302.0,100649.137545


In [None]:
# 7) Save the trained model to disk as `model.pkl`
# The file is written to the current working directory in binary mode; any existing
# `model.pkl` is overwritten.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)
print('Model saved to model.pkl')

In [34]:
# 8) Optional: Use the saved model for a single prediction to verify
# Load the model back from `model.pkl` and predict on one sample.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('model.pkl', 'rb') as f:
    loaded = pickle.load(f)

# Keep the value in one place so the printed label always matches the input that is
# actually predicted (previously the label said 5 while the sample was 5.1).
years_experience = 5.1
sample = np.array([[years_experience]])  # 2-D input: shape (1, n_features)
print(f'Sample prediction for {years_experience} years experience ->', loaded.predict(sample)[0])

Sample prediction for 5 years experience -> 74480.49870395528


In [35]:
# Cross-check: predict the same `sample` with the in-memory `regressor`; the value is
# expected to match the prediction printed from the model reloaded from `model.pkl`.
regressor.predict(sample)

array([74480.49870396])