# References
https://machinelearningmastery.com/elastic-net-regression-in-python/

# Elastic net Regression

In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [102]:
rookiedata = pd.read_csv('rookie_data.csv')
rookiedata.head()

Unnamed: 0,index,Name,Hall of Fame Class,Year Drafted,GP,MIN,PTS,FGM,FGA,FG%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFF
0,0,Jeff Taylor,,1982.0,44.0,17.6,3.6,1.5,3.6,40.0,...,1.0,65.2,0.6,1.2,1.8,2.5,0.9,0.3,1.4,5.2
1,1,Charles Smith,,1988.0,71.0,30.4,16.3,6.1,12.4,49.5,...,5.5,72.5,2.4,4.1,6.5,1.5,1.0,1.3,2.1,16.7
2,2,Mark Davis,,1988.0,33.0,7.8,3.8,1.5,3.1,48.0,...,1.0,82.4,0.5,0.6,1.1,0.4,0.4,0.1,0.4,3.8
3,3,Charles Smith,,1989.0,60.0,8.7,2.9,1.0,2.2,44.4,...,1.3,69.7,0.2,0.9,1.2,1.7,0.6,0.1,0.6,4.1
4,4,Michael Smith,,1989.0,65.0,9.5,5.0,2.1,4.4,47.6,...,1.0,82.8,0.6,0.9,1.5,1.2,0.1,0.0,0.8,4.6


In [103]:
rookiedata = rookiedata.drop('index', axis=1)
rookiedata = rookiedata.drop('Hall of Fame Class',axis=1)
rookiedata = rookiedata.drop('Name',axis=1)

In [104]:
# Replace '-' with 0 in the 3P% column
rookiedata['3P%'] = rookiedata['3P%'].replace('-', 0)

# Convert the 3P% column to a float data type
rookiedata['3P%'] = rookiedata['3P%'].astype(float)

# Impute missing values in the dataset
imputer = SimpleImputer(strategy='mean')
rookiedata_imputed = pd.DataFrame(imputer.fit_transform(rookiedata), columns=rookiedata.columns)

In [105]:
target = 'EFF'
categorical_features = []
numeric_features = []
features = rookiedata.columns.values.tolist()
for col in features:
    if rookiedata[col].dtype != 'object': 
        if col != target:
            numeric_features.append(col)
    else:
        categorical_features.append(col)
        
for col in numeric_features:
    mean = rookiedata[col].mean()
    rookiedata[col] = rookiedata[col].fillna(mean)
    
for col in categorical_features:
    rookiedata[col] = rookiedata[col].fillna('None')

In [106]:
rookiedata['EFF'] = np.log1p(rookiedata['EFF'])

In [107]:
from scipy.stats import skew
skewed_feats = rookiedata[numeric_features].apply(lambda x: skew(x)) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
rookiedata[skewed_feats] = np.log1p(rookiedata[skewed_feats])

In [108]:
from sklearn.preprocessing import LabelEncoder
# Encoding categorical features
for col in categorical_features:
    le = LabelEncoder()
    le.fit(list(rookiedata[col].astype(str).values))
    rookiedata[col] = le.transform(list(rookiedata[col].astype(str).values))

In [109]:
y = rookiedata['EFF']
X = rookiedata.drop('EFF', axis=1)

In [114]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
max_iter = [1000, 10000]
l1_ratio = np.arange(0.0, 1.0, 0.1)
tol = [0.5]

elasticnet_gscv = GridSearchCV(estimator=ElasticNet(), 
                                param_grid={'alpha': alpha,
                                            'max_iter': max_iter,
                                            'l1_ratio': l1_ratio,
                                            'tol':tol},   
                                scoring='r2',
                                cv=5,
                              error_score='raise')

In [115]:
elasticnet_gscv.fit(X, y)
elasticnet_gscv.best_params_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').