In [175]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [176]:
df = pd.read_csv('nba2k-full.csv')
df.head()

Unnamed: 0,full_name,rating,jersey,team,position,b_day,height,weight,salary,country,draft_year,draft_round,draft_peak,college,version
0,LeBron James,97,#23,Los Angeles Lakers,F,12/30/84,6-9 / 2.06,250 lbs. / 113.4 kg.,$37436858,USA,2003,1,1,,NBA2k20
1,Kawhi Leonard,97,#2,Los Angeles Clippers,F,06/29/91,6-7 / 2.01,225 lbs. / 102.1 kg.,$32742000,USA,2011,1,15,San Diego State,NBA2k20
2,Giannis Antetokounmpo,96,#34,Milwaukee Bucks,F-G,12/06/94,6-11 / 2.11,242 lbs. / 109.8 kg.,$25842697,Greece,2013,1,15,,NBA2k20
3,Kevin Durant,96,#7,Brooklyn Nets,F,09/29/88,6-10 / 2.08,230 lbs. / 104.3 kg.,$37199000,USA,2007,1,2,Texas,NBA2k20
4,James Harden,96,#13,Houston Rockets,G,08/26/89,6-5 / 1.96,220 lbs. / 99.8 kg.,$38199000,USA,2009,1,3,Arizona State,NBA2k20


In [177]:
df.shape

(464, 15)

In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464 entries, 0 to 463
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   full_name    464 non-null    object
 1   rating       464 non-null    int64 
 2   jersey       464 non-null    object
 3   team         441 non-null    object
 4   position     464 non-null    object
 5   b_day        464 non-null    object
 6   height       464 non-null    object
 7   weight       464 non-null    object
 8   salary       464 non-null    object
 9   country      464 non-null    object
 10  draft_year   464 non-null    int64 
 11  draft_round  464 non-null    object
 12  draft_peak   464 non-null    object
 13  college      388 non-null    object
 14  version      464 non-null    object
dtypes: int64(2), object(13)
memory usage: 54.5+ KB


In [179]:
df = df.dropna()

In [180]:
df.isnull().sum().sum()

np.int64(0)

In [181]:
df = df.drop(['full_name', 'b_day', 'height', 'weight', 'college'], axis = 1)
df.head()

Unnamed: 0,rating,jersey,team,position,salary,country,draft_year,draft_round,draft_peak,version
1,97,#2,Los Angeles Clippers,F,$32742000,USA,2011,1,15,NBA2k20
3,96,#7,Brooklyn Nets,F,$37199000,USA,2007,1,2,NBA2k20
4,96,#13,Houston Rockets,G,$38199000,USA,2009,1,3,NBA2k20
5,95,#30,Golden State Warriors,G,$40231758,USA,2009,1,7,NBA2k20
6,94,#3,Los Angeles Lakers,F-C,$27093019,USA,2012,1,1,NBA2k20


In [182]:
def removehash(value):
    value = value[1:]
    return int(value)

df['jersey'] = df['jersey'].apply(removehash)

In [183]:
df.head()

Unnamed: 0,rating,jersey,team,position,salary,country,draft_year,draft_round,draft_peak,version
1,97,2,Los Angeles Clippers,F,$32742000,USA,2011,1,15,NBA2k20
3,96,7,Brooklyn Nets,F,$37199000,USA,2007,1,2,NBA2k20
4,96,13,Houston Rockets,G,$38199000,USA,2009,1,3,NBA2k20
5,95,30,Golden State Warriors,G,$40231758,USA,2009,1,7,NBA2k20
6,94,3,Los Angeles Lakers,F-C,$27093019,USA,2012,1,1,NBA2k20


In [184]:
df['salary'] = df['salary'].apply(removehash)
df.head()

Unnamed: 0,rating,jersey,team,position,salary,country,draft_year,draft_round,draft_peak,version
1,97,2,Los Angeles Clippers,F,32742000,USA,2011,1,15,NBA2k20
3,96,7,Brooklyn Nets,F,37199000,USA,2007,1,2,NBA2k20
4,96,13,Houston Rockets,G,38199000,USA,2009,1,3,NBA2k20
5,95,30,Golden State Warriors,G,40231758,USA,2009,1,7,NBA2k20
6,94,3,Los Angeles Lakers,F-C,27093019,USA,2012,1,1,NBA2k20


In [185]:
df['team'].value_counts()

team
Brooklyn Nets             17
Los Angeles Clippers      15
Los Angeles Lakers        15
Philadelphia 76ers        14
New Orleans Pelicans      14
Indiana Pacers            14
Milwaukee Bucks           14
Orlando Magic             14
New York Knicks           13
Chicago Bulls             13
Minnesota Timberwolves    13
Toronto Raptors           13
Houston Rockets           13
San Antonio Spurs         12
Memphis Grizzlies         12
Atlanta Hawks             12
Sacramento Kings          12
Denver Nuggets            12
Detroit Pistons           12
Miami Heat                12
Phoenix Suns              12
Golden State Warriors     11
Dallas Mavericks          11
Washington Wizards        11
Boston Celtics            11
Portland Trail Blazers    11
Oklahoma City Thunder     10
Cleveland Cavaliers       10
Utah Jazz                  9
Charlotte Hornets          7
Name: count, dtype: int64

In [186]:
df['country'].value_counts()

country
USA                   315
Canada                 15
Australia               8
Cameroon                5
Nigeria                 3
The Bahamas             2
Ukraine                 2
Finland                 1
Dominican Republic      1
Montenegro              1
Lithuania               1
New Zealand             1
Philippines             1
Austria                 1
Puerto Rico             1
Senegal                 1
Mali                    1
Israel                  1
Japan                   1
United Kingdom          1
Italy                   1
Germany                 1
Angola                  1
Haiti                   1
Egypt                   1
Greece                  1
Name: count, dtype: int64

In [187]:
def removecountryoutlier(value):
    if value not in ['USA', 'Canada', 'Australia']:
        return 'Others'
    else:
        return value

df['country'] = df['country'].apply(removecountryoutlier)
df['country'].value_counts()

country
USA          315
Others        31
Canada        15
Australia      8
Name: count, dtype: int64

In [188]:
df['position'].value_counts()

position
G      166
F      123
C       31
F-C     26
G-F     11
F-G      7
C-F      5
Name: count, dtype: int64

In [189]:
df.head()

Unnamed: 0,rating,jersey,team,position,salary,country,draft_year,draft_round,draft_peak,version
1,97,2,Los Angeles Clippers,F,32742000,USA,2011,1,15,NBA2k20
3,96,7,Brooklyn Nets,F,37199000,USA,2007,1,2,NBA2k20
4,96,13,Houston Rockets,G,38199000,USA,2009,1,3,NBA2k20
5,95,30,Golden State Warriors,G,40231758,USA,2009,1,7,NBA2k20
6,94,3,Los Angeles Lakers,F-C,27093019,USA,2012,1,1,NBA2k20


In [190]:
df['draft_round'].unique

<bound method Series.unique of 1      1
3      1
4      1
5      1
6      1
      ..
457    1
458    1
459    1
460    1
462    1
Name: draft_round, Length: 369, dtype: object>

In [191]:
df['draft_peak'].unique

<bound method Series.unique of 1      15
3       2
4       3
5       7
6       1
       ..
457    27
458    13
459     4
460     3
462    24
Name: draft_peak, Length: 369, dtype: object>

In [192]:
def removeundrafted(value):
    if value=='Undrafted':
        return 
    else:
        return value
    
df['draft_round'] = df['draft_round'].apply(removeundrafted)
df['draft_peak'] = df['draft_peak'].apply(removeundrafted)

df = df.dropna()

In [193]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['position'] = le.fit_transform(df['position'])
df['country'] = le.fit_transform(df['country'])
df['team'] = le.fit_transform(df['team'])

df.head()

Unnamed: 0,rating,jersey,team,position,salary,country,draft_year,draft_round,draft_peak,version
1,97,2,12,2,32742000,3,2011,1,15,NBA2k20
3,96,7,2,2,37199000,3,2007,1,2,NBA2k20
4,96,13,10,5,38199000,3,2009,1,3,NBA2k20
5,95,30,9,5,40231758,3,2009,1,7,NBA2k20
6,94,3,13,3,27093019,3,2012,1,1,NBA2k20


In [194]:
df.drop('version', axis=1, inplace=True)

In [195]:
x = df.drop(['salary'], axis = 1)
y = df['salary']

In [196]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

In [197]:
df.head()

Unnamed: 0,rating,jersey,team,position,salary,country,draft_year,draft_round,draft_peak
1,97,2,12,2,32742000,3,2011,1,15
3,96,7,2,2,37199000,3,2007,1,2
4,96,13,10,5,38199000,3,2009,1,3
5,95,30,9,5,40231758,3,2009,1,7
6,94,3,13,3,27093019,3,2012,1,1


In [198]:
lr = LinearRegression().fit(x_train, y_train)

In [199]:
import pickle
pickle.dump(lr, open('./model.sav', 'wb'))

In [200]:
df.head()

Unnamed: 0,rating,jersey,team,position,salary,country,draft_year,draft_round,draft_peak
1,97,2,12,2,32742000,3,2011,1,15
3,96,7,2,2,37199000,3,2007,1,2
4,96,13,10,5,38199000,3,2009,1,3
5,95,30,9,5,40231758,3,2009,1,7
6,94,3,13,3,27093019,3,2012,1,1
