In [87]:
# The idea behind this is that players in the madden video games have 0-99 overall ratings
# and a bunch of other 0-99 ratings for specific attributes like speed, awareness, throw power, etc.
# my idea behind this project is to use regression to get the overall rating from these more specific ratings

In [88]:
import pandas
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
from sklearn.preprocessing import StandardScaler

In [89]:
def validate(validation,params):
    top_weights = params[1:].to_numpy()
    top_features = params[1:].index.to_numpy()
    dict = {}
    for a, b in zip(top_features, top_weights):
        dict[a] = b
    #print(dict)
    array=[]
    for i in dict:
        array.append(i)
    #print(validation)
    features=validation[array].to_numpy()
    labels=validation[['overall_rating']].to_numpy()
    weights=params.to_numpy()
    #print(features)
    #print(weights[1:])
    errors=np.matmul(features,weights[1:])+weights[0]-labels.transpose()
    return np.average(np.absolute(errors))

In [90]:
def findPositions(data):
    c = data.to_numpy()
    q = c[:, 2]
    return (np.unique(q))

In [91]:
def doRegressionAndPCA(data,Position):    
    pos=data[data['Position']==Position] # filtering data for given position
    all_ratings = ['awareness_rating','throwPower_rating','kickReturn_rating','leadBlock_rating','strength_rating','bCVision_rating','catchInTraffic_rating','playAction_rating','pursuit_rating','mediumRouteRunning_rating','catching_rating','acceleration_rating','spinMove_rating','finesseMoves_rating','spectacularCatch_rating','runBlock_rating','tackle_rating','injury_rating','zoneCoverage_rating','deepRouteRunning_rating','trucking_rating','throwAccuracyShort_rating','jukeMove_rating','playRecognition_rating','shortRouteRunning_rating','breakSack_rating','speed_rating','runBlockPower_rating','jumping_rating','toughness_rating','throwOnTheRun_rating','manCoverage_rating','stiffArm_rating','powerMoves_rating','release_rating','hitPower_rating','throwAccuracyMid_rating','kickAccuracy_rating','passBlockPower_rating','impactBlocking_rating','stamina_rating','carrying_rating','breakTackle_rating','kickPower_rating','throwUnderPressure_rating','passBlock_rating','changeOfDirection_rating','press_rating','throwAccuracyDeep_rating','blockShedding_rating','runBlockFinesse_rating','agility_rating','passBlockFinesse_rating']
    features=all_ratings
    all_ratings.append('overall_rating')
    #print(features)
    pos_ratings=pos[all_ratings]
    training=pos_ratings.sample(frac=.9,random_state=1) # take 80% as training data
    test_rows=np.setdiff1d(pos_ratings.index,training.index)
    array=np.zeros(data.index.size)
    for n in range(data.index.size):
        array[n]=(n in test_rows)
        array=np.array(array,dtype=bool)

    test=data.loc[array]
    
    test=test[all_ratings]
    #print(validation)
    # makes the rest of the data the validation set
    
    # Separating out the features
    x = training.loc[:, features].values
    test_x=test.loc[:, features].values
    scaler=StandardScaler()
    scaler.fit(x)
    x = scaler.transform(x)
    test_x=scaler.transform(test_x)
    #print(x)
    from sklearn.decomposition import PCA
    pca = PCA(.95)
    pca.fit(x)
    principalComponents = pca.transform(x)
    principalComponents_test=pca.transform(test_x)
    #columns=np.zeros([pca.n_components_,1],dtype='<U22')
    columns=[]
    for i in range(pca.n_components_):
        columns.append('principal_component_'+str(i+1))
    #print(columns) 
    #print(principalComponents)
    principalDf = pandas.DataFrame(data = principalComponents
             , columns = columns)
    testDf = pandas.DataFrame(data = principalComponents_test
             , columns = columns)
    labelsdf = pandas.DataFrame(data = training[['overall_rating']].to_numpy()
             , columns = ['overall_rating'])
    labelsdf_test=pandas.DataFrame(data = test[['overall_rating']].to_numpy()
             , columns = ['overall_rating'])
    
    finalDf = pandas.concat([principalDf, labelsdf], axis = 1)
    finalDf_test=pandas.concat([testDf, labelsdf_test], axis = 1)
    #print(columns+'+')
    reg_string='overall_rating ~ '
    for i in columns:
        reg_string+=i+'+'
    reg_string=reg_string[:-1]
    y, X =dmatrices(reg_string,data=finalDf, return_type='dataframe')
    mod = sm.OLS(y, X)
    res = mod.fit()
    #print(validate(finalDf_test,res.params))
    #pca.explained_variance_ratio_
    print('Average absolute error for ' +Position + ' on test set: '+str(validate(finalDf_test,res.params)))
    return res.params

In [92]:
data=pandas.read_csv('Madden_23_Player_Ratings.csv') # Read data
allPositions = findPositions(data)

#to print everything do this
for pos in allPositions:
    print(doRegressionAndPCA(data, pos))
    print('')

#or just one, do this
#print(doRegression(data, 'QB'))

# change the argument here to do regression on whichever position

Average absolute error for C on test set: 0.5647576114293367
Intercept                 66.500000
principal_component_1     -1.510106
principal_component_2      2.577566
principal_component_3      0.496582
principal_component_4     -0.255283
principal_component_5      0.124463
principal_component_6      0.227060
principal_component_7     -0.253247
principal_component_8      0.220391
principal_component_9      0.337655
principal_component_10     0.236283
principal_component_11    -0.172107
principal_component_12    -0.073500
principal_component_13     0.420155
principal_component_14    -0.240272
principal_component_15    -0.371699
principal_component_16     0.264986
principal_component_17    -0.103529
principal_component_18    -0.439615
principal_component_19    -0.199664
principal_component_20    -0.422822
principal_component_21     0.665129
principal_component_22     0.474405
principal_component_23    -0.547353
principal_component_24    -0.030651
dtype: float64

Average absolute error 

Average absolute error for QB on test set: 0.6636504038320077
Intercept                 65.881720
principal_component_1      1.836454
principal_component_2     -2.401814
principal_component_3     -1.328103
principal_component_4      0.921819
principal_component_5     -0.965485
principal_component_6     -0.137243
principal_component_7     -0.561222
principal_component_8     -0.687613
principal_component_9      0.242886
principal_component_10    -0.278699
principal_component_11    -0.314556
principal_component_12    -0.181581
principal_component_13    -0.303714
principal_component_14     0.871599
principal_component_15     0.131098
principal_component_16     0.726977
principal_component_17     0.419282
principal_component_18    -0.524511
principal_component_19    -0.008312
principal_component_20     0.219509
principal_component_21    -0.076216
principal_component_22     0.367553
principal_component_23     0.079259
principal_component_24    -0.664617
dtype: float64

Average absolute error