# Simple Linear Regression


## Imports and load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import mlflow
import mlflow.sklearn
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()
%matplotlib inline

In [2]:
# Version tags for this run: data_ver is interpolated into the input CSV
# file name and model_ver into the exported model file name; both are also
# logged as MLflow params.
data_ver, model_ver = 2, 2


In [None]:
# Open the MLflow run that the rest of the notebook logs into.
mlflow.start_run()

# Record which data and model versions this run used.
for param_name, param_value in (("data_ver", data_ver), ("model_ver", model_ver)):
    mlflow.log_param(param_name, param_value)

# Load the salary data set that matches the selected data version.
csv_path = "../data/SalaryData%s.csv" % data_ver
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (rows, columns) of the full data set.
df.shape

Before continuing, check whether the data set contains any missing values.

In [None]:
# True if at least one cell in the DataFrame is null/NaN.
df.isnull().values.any()

## Split data


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Explore a copy of the training data so the analysis below cannot modify train_set.
df_copy = train_set.copy()

In [None]:
# (rows, columns) of the training copy.
df_copy.shape

In [None]:
# Preview the first rows of the training copy.
df_copy.head()

## Exploratory Data Analysis


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training copy.
df_copy.describe()

In [None]:
# Pairwise correlation between the columns of the training copy.
# NOTE(review): assumes every column is numeric — confirm against the CSV.
df_copy.corr()

In [None]:
# Scatter plot of Salary against YearsExperience for the training copy.
df_copy.plot.scatter(x='YearsExperience', y='Salary')

In [None]:
# Regression plot: scatter of the training copy with a fitted regression line.
# x and y are passed by keyword because seaborn 0.12+ rejects them as
# positional arguments (0.11 already warned); keywords work on older
# versions too.
sns.regplot(x='YearsExperience',  # Horizontal axis
            y='Salary',  # Vertical axis
            data=df_copy)

## Predict


In [None]:
# Keep an untouched copy (still containing Salary) for scoring and plotting.
test_set_full = test_set.copy()

# The model input is the test data without the target column.
test_set = test_set.drop(columns=["Salary"])

In [None]:
# Preview the test features after the Salary column has been removed.
test_set.head()

In [None]:
# Training targets: the Salary column, taken before it is dropped from train_set.
train_labels = train_set["Salary"]

In [None]:
# Preview the first training targets.
train_labels.head()

In [None]:
# Keep an untouched copy of the training data (still containing Salary).
train_set_full = train_set.copy()

# The model input is the training data without the target column.
train_set = train_set.drop(columns=["Salary"])

In [None]:
# Preview the training features after the Salary column has been removed.
train_set.head()

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lin_reg = LinearRegression()

# Fit on the training features (Salary already dropped) against the salary labels.
lin_reg.fit(train_set, train_labels)

In [None]:
# Predict salaries for the held-out test features.
salary_pred = lin_reg.predict(test_set)

# Show the predicted values as the cell output.
salary_pred

## Analyze Results

In [None]:
# Show the fitted line's parameters.
print("Coefficients: ", lin_reg.coef_)
print("Intercept: ", lin_reg.intercept_)

# MLflow metrics must be plain numbers. coef_ is a 1-D array holding the
# single slope of this one-feature model, so index it explicitly:
# float() on a non-0-d array is deprecated since NumPy 1.25.
mlflow.log_metric("coef", float(lin_reg.coef_[0]))
mlflow.log_metric("intercept", float(lin_reg.intercept_))

In [None]:
# Print the predictions next to the true held-out salaries for a visual comparison.
print(salary_pred)
print(test_set_full["Salary"])

In [None]:
# R^2 of the model on the held-out set, computed by the estimator itself.
lin_reg.score(test_set, test_set_full["Salary"])

In [None]:
# R^2 of the predictions against the true held-out salaries; recorded as the
# run's "score" metric and echoed to the cell output.
actual_salary = test_set_full["Salary"]
score = r2_score(actual_salary, salary_pred)
mlflow.log_metric("score", score)
print(score)


In [None]:
# Held-out observations in blue, the model's predictions as a red line.
years_test = test_set_full["YearsExperience"]
plt.scatter(years_test, test_set_full["Salary"], color='blue')
plt.plot(years_test, salary_pred, color='red', linewidth=2)

## Export model to disk and save into mlflow

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model to disk, tagged with the model version, and attach
# it to the MLflow run under the artifact path "model".
joblib.dump(lin_reg, "../models/linear_regression_model_v%s.pkl" % model_ver)
mlflow.sklearn.log_model(lin_reg, "model")

# Close the run opened by mlflow.start_run() so that re-running the notebook
# starts a fresh run instead of failing because a run is still active.
mlflow.end_run()