In [32]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [33]:
# Load the pre-built extracts. Paths are relative to the working directory and
# the file names carry no extension.

# "team" extracts
active_team = pd.read_csv("active team top 100")
home_team = pd.read_csv("home team top 100")
away_team = pd.read_csv("away team top 100")

# "merge" extracts
raptor_player = pd.read_csv("raptor player merge top 100")
player_team = pd.read_csv("player team merge top 100")

# "summary" extracts; of the frames loaded here, only `summary` and
# `corr_summary` are used by the cells visible in this part of the notebook
summary_home = pd.read_csv("home summary top 100")
summary_away = pd.read_csv("away summary top 100")
summary = pd.read_csv("summary top 100")
corr_summary = pd.read_csv("correlated summary top 100")

games_40 = pd.read_csv("40 games plus")


In [34]:
# Approximate row counts for a 70/30 split of corr_summary (fractional, not rounded).
len(corr_summary) * .7, len(corr_summary) * .3

(3791.8999999999996, 1625.1)

In [35]:
# Same 70/30 size preview for the summary frame (fractional, not rounded).
len(summary) * .7, len(summary) * .3

(3791.8999999999996, 1625.1)

In [36]:
# List the column labels of the summary frame.
summary.columns

Index(['Unnamed: 0', 'Game ID', 'Home ID', 'Short Home', 'Home Points',
       'Difference', 'Away ID', 'Short Away', 'Away Points', 'year', 'Total',
       'home_pct', 'away_pct', 'Home Team', 'Home Player ID', 'Home Player',
       'Home Points.1', 'Home Assists', 'Home Rebounds', 'Home Minutes',
       'Home Raptor', 'Home WAR', 'Away Team', 'Away Player ID', 'Away Player',
       'Away Points.1', 'Away Assists', 'Away Rebounds', 'Away Minutes',
       'Away Raptor', 'Away WAR'],
      dtype='object')

In [37]:
# Hold out 30% of corr_summary as the test split; the fixed seed keeps the
# partition reproducible. 'Difference' is the target, every other column is a feature.
# NOTE(review): 'Home Points', 'Away Points' and 'Total' remain in the features;
# if 'Difference' is derived from them this is target leakage -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    corr_summary.drop(columns='Difference'),
    corr_summary['Difference'],
    test_size=0.3,
    random_state=42,
)

In [38]:
# Inspect the dtype of every feature column in the test split.
X_test.dtypes

Unnamed: 0         int64
Home Points        int64
Away Points        int64
year               int64
Total              int64
home_pct         float64
away_pct         float64
Home Points.1    float64
Home Assists     float64
Home Rebounds    float64
Home Minutes     float64
Home Raptor      float64
Home WAR         float64
Away Points.1    float64
Away Assists     float64
Away Rebounds    float64
Away Minutes     float64
Away Raptor      float64
Away WAR         float64
dtype: object

In [39]:
# Keep a copy of the 'Unnamed: 0' and 'year' columns, then remove them from
# the feature matrices.
index_old = ['Unnamed: 0', 'year']
index_train = X_train[index_old]
index_test = X_test[index_old]
# Drop by the list of labels (not by passing the saved DataFrame, which only
# worked because iterating a frame yields its column names) and rebind instead
# of using inplace=True, so the frames returned by train_test_split are not
# mutated in place.
# This cell reads the columns before dropping them, so run it once per split.
X_train = X_train.drop(columns=index_old)
X_test = X_test.drop(columns=index_old)

In [40]:
# Re-inspect the test-split dtypes after 'Unnamed: 0' and 'year' were dropped.
X_test.dtypes

Home Points        int64
Away Points        int64
Total              int64
home_pct         float64
away_pct         float64
Home Points.1    float64
Home Assists     float64
Home Rebounds    float64
Home Minutes     float64
Home Raptor      float64
Home WAR         float64
Away Points.1    float64
Away Assists     float64
Away Rebounds    float64
Away Minutes     float64
Away Raptor      float64
Away WAR         float64
dtype: object

In [41]:
# Mean of the training target; later cells use it as a constant baseline prediction.
train_mean = y_train.mean()
train_mean

0.9612239514639936

In [42]:
# Baseline model using the 'mean' strategy; display the constant it learned
# from the training target.
dumb_reg = DummyRegressor(strategy='mean').fit(X_train, y_train)
dumb_reg.constant_

array([[0.96122395]])

In [43]:
#Calculate the R^2 (coefficient of determination)
def r_squared(y, ypred):
    """R-squared score.

    Calculate the R-squared, or coefficient of determination, of the input,
    i.e. ``1 - SS_res / SS_tot``.

    Both inputs are converted to NumPy float arrays before any arithmetic, so
    observed and predicted values are paired by position. Without this, two
    pandas Series with different indexes would be aligned by label and yield
    a NaN-driven, silently wrong score; plain lists are accepted as well.

    Arguments:
    y -- the observed values (array-like)
    ypred -- the predicted values (array-like, same length as y, or a scalar)

    Returns:
    The R-squared as a float. No special-casing is done for a constant or
    empty y: the division then follows NumPy semantics (nan/inf).
    """
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    ybar = np.sum(y) / len(y) #mean of the observed values
    sum_sq_tot = np.sum((y - ybar)**2) #total sum of squares
    sum_sq_res = np.sum((y - ypred)**2) #residual sum of squares
    R2 = 1.0 - sum_sq_res / sum_sq_tot
    return R2

In [44]:
# Hand-built baseline: one copy of the training mean per training row.
y_tr_pred_ = np.full(len(y_train), train_mean)
y_tr_pred_[:5]

array([0.96122395, 0.96122395, 0.96122395, 0.96122395, 0.96122395])

In [45]:
# Predictions of the fitted DummyRegressor on the training features; show the first five.
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred[:5]

array([0.96122395, 0.96122395, 0.96122395, 0.96122395, 0.96122395])

In [46]:
# Score the constant training-mean prediction against the held-out target.
y_te_pred = np.full(len(y_test), train_mean)
r_squared(y_test, y_te_pred)

-0.0005618570740244522

In [47]:
#Code task 7#
#Calculate the MAE (mean absolute error)
def mae(y, ypred):
    """Mean absolute error.

    Calculate the mean absolute error of the arguments, i.e. the mean of
    ``|y - ypred|``.

    Both inputs are converted to NumPy float arrays before subtracting, so
    observed and predicted values are paired by position. Without this, two
    pandas Series with different indexes would be aligned by label and the
    result would be NaN; plain lists are accepted as well.

    Arguments:
    y -- the observed values (array-like)
    ypred -- the predicted values (array-like, same length as y, or a scalar)

    Returns:
    The mean absolute error as a float.
    """
    abs_error = np.abs(np.asarray(y, dtype=float) - np.asarray(ypred, dtype=float))
    return np.mean(abs_error)

In [48]:
# MAE of the DummyRegressor predictions on the training split.
mae(y_train, y_tr_pred)

12.222871075415673

In [49]:
# MAE of the constant training-mean prediction on the test split.
mae(y_test, y_te_pred)

11.766757092524736

In [50]:
# Per-column medians of the training features; used below to fill missing values.
X_defaults_median = X_train.median()
X_defaults_median

Home Points       113.000000
Away Points       112.000000
Total             226.000000
home_pct            0.500000
away_pct            0.509091
Home Points.1      12.800000
Home Assists        2.900000
Home Rebounds       4.700000
Home Minutes     1898.000000
Home Raptor         2.458975
Home WAR            4.416973
Away Points.1      12.800000
Away Assists        2.900000
Away Rebounds       4.700000
Away Minutes     1908.000000
Away Raptor         2.457199
Away WAR            4.416973
dtype: float64

In [52]:
# Fill missing values in both splits with the medians computed from the
# training features.
X_tr = X_train.fillna(X_defaults_median)
X_te = X_test.fillna(X_defaults_median)

# Fit the StandardScaler on the imputed training data only, then use its
# `transform()` method to apply that same scaling to both splits, naming the
# results `X_tr_scaled` and `X_te_scaled`, respectively.
scaler = StandardScaler().fit(X_tr)
X_tr_scaled, X_te_scaled = scaler.transform(X_tr), scaler.transform(X_te)

In [53]:
# Fit a linear regression on the scaled training features.
# NOTE(review): the features still include 'Home Points', 'Away Points' and 'Total';
# if the target 'Difference' is computed from them the fit is trivially perfect -- confirm.
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [54]:
# Linear-model predictions for both splits.
# Note: this rebinds y_tr_pred / y_te_pred, which earlier cells used for the
# mean-baseline predictions, so the baseline arrays are no longer available under these names.
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)