### Import Key Libraries

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import linear_model

%matplotlib inline

In [57]:
basketball_data = pd.read_csv('basketball.csv', names=['Height (ft)', 'Weight (lbs)', 'Successful Field Goals (%)', 'Successful Free Throws (%)', 'Average Points Per Game'])
basketball_data = basketball_data.tail(-1)
basketball_data.head()

Unnamed: 0,Height (ft),Weight (lbs),Successful Field Goals (%),Successful Free Throws (%),Average Points Per Game
1,6.8,225,0.442,0.672,9.2
2,6.3,180,0.435,0.797,11.7
3,6.4,190,0.456,0.761,15.8
4,6.2,180,0.416,0.651,8.6
5,6.9,205,0.449,0.9,23.2


In [58]:
basketball_data.dtypes

Height (ft)                   object
Weight (lbs)                  object
Successful Field Goals (%)    object
Successful Free Throws (%)    object
Average Points Per Game       object
dtype: object

In [88]:
basketball_data.info

<bound method DataFrame.info of     Height (ft)  Weight (lbs)  Successful Field Goals (%)  \
1           6.8           225                       0.442   
2           6.3           180                       0.435   
3           6.4           190                       0.456   
4           6.2           180                       0.416   
5           6.9           205                       0.449   
6           6.4           225                       0.431   
7           6.3           185                       0.487   
8           6.8           235                       0.469   
9           6.9           235                       0.435   
10          6.7           210                       0.480   
11          6.9           245                       0.516   
12          6.9           245                       0.493   
13          6.3           185                       0.374   
14          6.1           185                       0.424   
15          6.2           180                       0

### Data Preprocessing

In [59]:
#check for any null or NaN values
basketball_data.isnull().values.any()

False

In [60]:
#convert nonnumeric data to numeric data
for column in basketball_data:
    basketball_data[column] = pd.to_numeric(basketball_data[column])

In [61]:
basketball_data.dtypes

Height (ft)                   float64
Weight (lbs)                    int64
Successful Field Goals (%)    float64
Successful Free Throws (%)    float64
Average Points Per Game       float64
dtype: object

### Linear Regression

In [62]:
linreg = linear_model.LinearRegression()

In [63]:
#defining independent and dependent variables
x = basketball_data.drop(['Average Points Per Game'],axis=1).values
y = basketball_data['Average Points Per Game'].values

In [64]:
linreg.fit(x,y)

In [72]:
x.reshape(-1,1)
y.reshape(-1,1)

array([[ 9.2],
       [11.7],
       [15.8],
       [ 8.6],
       [23.2],
       [27.4],
       [ 9.3],
       [16. ],
       [ 4.7],
       [12.5],
       [20.1],
       [ 9.1],
       [ 8.1],
       [ 8.6],
       [20.3],
       [25. ],
       [19.2],
       [ 3.3],
       [11.2],
       [10.5],
       [10.1],
       [ 7.2],
       [13.6],
       [ 9. ],
       [24.6],
       [12.6],
       [ 5.6],
       [ 8.7],
       [ 7.7],
       [24.1],
       [11.7],
       [ 7.7],
       [ 9.6],
       [ 7.2],
       [12.3],
       [ 8.9],
       [13.6],
       [11.2],
       [ 2.8],
       [ 3.2],
       [ 9.4],
       [11.9],
       [15.4],
       [ 7.4],
       [18.9],
       [ 7.9],
       [12.2],
       [11. ],
       [ 2.8],
       [11.8],
       [17.1],
       [11.6],
       [ 5.8],
       [ 8.3]])

In [73]:
#slopes for each independent var
linreg.coef_

array([-3.69049908e+00,  9.45845788e-03,  4.79401992e+01,  1.13710193e+01])

In [74]:
#y intercept
linreg.intercept_

4.14870670628803

In [75]:
#predict ppg of player with 
linreg.predict([[6.4 , 190, .456, .761]])

array([12.84069605])

In [108]:
#split data into training and testing sets
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.3,random_state=0)

In [109]:
linreg.fit(x_train,y_train)

In [110]:
#predict results!
y_pred = linreg.predict(x_test)

In [111]:
y_test

array([ 8.3,  7.2,  2.8,  5.6,  9.1, 15.8,  9.6, 15.4,  7.9, 11.7, 23.2,
       20.1,  7.7, 13.6,  7.7,  2.8, 11.2])

In [112]:
#evaluate model
r2_score(y_test,y_pred)

-0.19230013005292657

In [113]:
#check accuracy of model [0 - 1] (higher score => better model
linreg.score(x_test,y_test)

-0.19230013005292657