In [1]:
# Dependencies: a synthetic-data generator and the estimator to train
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Build a small, repeatable regression problem: 20 samples, one feature,
# a fixed random seed, some noise and a constant bias of 100.
X, y = make_regression(
    n_samples=20,
    n_features=1,
    noise=4,
    bias=100.0,
    random_state=0,
)

# Instantiate a linear regression estimator with its default settings
model = LinearRegression()

# Train on the full generated data set; as the last expression of the cell,
# the fitted estimator is also what the notebook displays.
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [2]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the same X the model was fitted on (in-sample predictions)
predicted = model.predict(X)

# Compare the predictions against the known targets with two metrics
mse = mean_squared_error(y_true=y, y_pred=predicted)
r2 = r2_score(y_true=y, y_pred=predicted)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

# A "good" MSE will be close to zero, while a "good" R2 score will be close to 1.

Mean Squared Error (MSE): 11.933040779746149
R-squared (R2 ): 0.903603363418708


In [3]:
# Overall score for the model, evaluated on the same X and y it was fitted on.
# The displayed value equals the R-squared printed by the metrics cell above.
model.score(X, y)

0.903603363418708

In [None]:
from sklearn.model_selection import train_test_split

# No test_size is passed, so the library default applies: 75% of the rows go
# to training and 25% to testing. random_state keeps the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [3]:
# Imported here so the cell also runs on a fresh kernel.
import pandas as pd


def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Replaces `pd.to_numeric(..., errors='ignore')`, which recent pandas
    versions deprecate in favour of catching the exception explicitly.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Load the exported activity details.
df = pd.read_csv('jason_garmin_details.csv')

# DataFrame.apply returns a NEW frame; the result must be assigned back,
# otherwise the numeric conversion is silently thrown away.
df = df.apply(_to_numeric_if_possible)

# Show the resulting column dtypes
df.dtypes

Unnamed: 0          int64
 Activity Name     object
Start              object
Distance          float64
Time               object
Elevation Loss      int64
Avg Pace          float64
Elevation Gain      int64
Steps               int64
Avg Temp          float64
Net Elevation       int64
mph               float64
dtype: object

In [4]:
# Pairwise correlation between the numeric columns only.
# df also holds object (text) columns; recent pandas versions raise on those
# instead of dropping them, so the numeric columns are selected explicitly.
df.select_dtypes(include='number').corr()

Unnamed: 0.1,Unnamed: 0,Distance,Elevation Loss,Avg Pace,Elevation Gain,Steps,Avg Temp,Net Elevation,mph
Unnamed: 0,1.0,-0.401883,-0.321723,0.020851,-0.160307,-0.497325,0.26176,0.1611,0.092245
Distance,-0.401883,1.0,0.284709,-0.112556,0.244497,0.944697,-0.208662,0.000236,0.031258
Elevation Loss,-0.321723,0.284709,1.0,0.141369,0.710765,0.277423,-0.002404,-0.20509,-0.12633
Avg Pace,0.020851,-0.112556,0.141369,1.0,0.061543,-0.076509,0.274872,-0.08317,-0.886724
Elevation Gain,-0.160307,0.244497,0.710765,0.061543,1.0,0.224145,0.110781,0.542707,0.004265
Steps,-0.497325,0.944697,0.277423,-0.076509,0.224145,1.0,-0.249909,-0.019381,-0.022287
Avg Temp,0.26176,-0.208662,-0.002404,0.274872,0.110781,-0.249909,1.0,0.157009,-0.217815
Net Elevation,0.1611,0.000236,-0.20509,-0.08317,0.542707,-0.019381,0.157009,1.0,0.156777
mph,0.092245,0.031258,-0.12633,-0.886724,0.004265,-0.022287,-0.217815,0.156777,1.0


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_set, test_set = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [12]:
from sklearn.linear_model import LinearRegression

# Predictors are three columns of the training split; the target is 'mph'.
# Note: this rebinds X and y, which earlier held the synthetic example data.
X = train_set[['Distance', 'Avg Temp', 'Net Elevation']]
y = train_set['mph']

# Fit a default linear regression on the training split; the fitted estimator
# is the last expression, so the notebook displays it.
lin_reg = LinearRegression()
lin_reg.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Predict 'mph' for the held-out rows from the three feature columns
test_features = test_set[['Distance', 'Avg Temp', 'Net Elevation']]
predictions = lin_reg.predict(test_features)

# Signed error per test row: predicted minus actual
predictions - test_set['mph']

0    -0.314769
5    -0.871913
46    3.890521
31   -0.930099
13    2.098432
55   -0.294698
34    3.906284
49    3.640183
12    0.632404
40   -2.750280
33    2.177590
59    1.003792
16    1.911303
Name: mph, dtype: float64

In [14]:
# numpy is imported here so the cell also runs on a fresh kernel.
import numpy as np
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the test-set predictions, in the units of 'mph'.
# Arguments follow the documented (y_true, y_pred) order; the value is the
# same either way because squared error is symmetric.
mse = mean_squared_error(test_set['mph'], predictions)
rms = np.sqrt(mse)
rms

2.274056906173203

In [20]:
from scipy.stats import pearsonr

# Linear correlation between predicted and actual 'mph' on the test split.
# The displayed pair is (correlation coefficient, p-value).
actual_mph = test_set['mph']
pearsonr(predictions, actual_mph)

(0.141045087254867, 0.645793710047652)

In [15]:
# Fitted intercept and coefficients, to evaluate the impact of each feature.
# The coefficients are listed in the order the feature columns were passed to
# fit: 'Distance', 'Avg Temp', 'Net Elevation'.
lin_reg.intercept_,lin_reg.coef_

(29.916701404543545, array([-0.02627161, -0.31214735,  0.0023147 ]))

In [16]:
# 5-fold cross-validation over the whole data set.
# The rows are shuffled first: build a random permutation of row positions
# with numpy and pass it to pandas iloc to reorder the original frame.

# numpy is imported here so the cell also runs on a fresh kernel.
import numpy as np
from sklearn.model_selection import cross_val_score

# Seeded generator so the shuffle, and therefore the fold scores, is repeatable
shuffle_rng = np.random.RandomState(42)
shuffled_indices = shuffle_rng.permutation(len(df))
shuffled_data = df.iloc[shuffled_indices]

# NOTE(review): 'Steps' is included here but was not among the columns
# lin_reg was fitted on in the earlier cell -- confirm this is intended.
full_X = shuffled_data[['Distance', 'Steps', 'Avg Temp', 'Net Elevation']]
full_Y = shuffled_data['mph']

# The scorer reports negated MSE per fold; flip the sign and take the square
# root to get one RMSE value per fold.
scores = cross_val_score(lin_reg, full_X, full_Y, scoring='neg_mean_squared_error', cv=5)
rms_scores = np.sqrt(-scores)
rms_scores

array([ 4.18634455,  6.48619687, 15.26569972,  4.83826646,  2.0457644 ])

In [17]:
# Mean of the per-fold RMSE values: the average cross-validated error when
# predicting the 'mph' column (speed, not pace).
rms_scores.mean()

6.564454401068753