The goal of this notebook is to merge all of the source dataframes into a single, correctly aligned dataset.

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing Dataframes

## 2019 stats

In [12]:
# Load the four raw inputs for this section, one DataFrame per CSV file.
season, adv, salaries, scrape538 = map(
    pd.read_csv,
    [
        'data/season1920.csv',
        'data/ad_1920.csv',
        'data/salaries.csv',
        'data/scrape538.csv',
    ],
)

In [13]:
# Strip the columns that should not reach the merged table.
# NOTE(review): 'Pos', 'Age', 'Tm', 'G' and 'MP' are presumably dropped from
# `adv` because `season` already carries them -- confirm.
season = season.drop(columns=['Unnamed: 0', 'Unnamed: 29'])
adv = adv.drop(columns=['Unnamed: 0', 'Pos', 'Age', 'Tm', 'G', 'MP'])
scrape538 = scrape538.drop(columns=['Unnamed: 0', 'Position(s)'])

In [14]:
# Keep a single row per player in each table (the first occurrence wins),
# then inner-join the two on 'Player' and replace remaining gaps with 0.
season, adv = (
    frame.drop_duplicates(subset='Player', keep='first')
    for frame in (season, adv)
)
df = season.merge(adv, on='Player').fillna(0)

In [15]:
# Outer joins keep players that are missing from any one source, so the
# mismatches show up as NaN rows instead of being silently dropped.
with_salaries = pd.merge(df, salaries, how='outer', on='Player')
merged_df = pd.merge(with_salaries, scrape538, how='outer', on='Player')

In [16]:
# (rows, columns) of the outer-merged frame.
merged_df.shape

(586, 67)

In [17]:
# Every row that has at least one missing value after the outer merges.
problem = merged_df.loc[merged_df.isna().any(axis='columns')]

In [18]:
# (rows, columns) of the rows that contain at least one NaN.
problem.shape

(215, 67)

Export the rows with missing values to a CSV so they can be cleaned manually.

In [19]:
# Write the incomplete rows to disk; the DataFrame index is written as the first column.
problem.to_csv('data/problem_draft.csv')

Delete the rows that were exported to the problem CSV from the dataframe, then append the manually cleaned version of that CSV back onto it.

In [20]:
# Keep only the fully populated rows; the incomplete ones were exported above.
merged_df = merged_df.dropna()

In [46]:
# Read the manually cleaned rows, drop the two extra columns, zero-fill gaps,
# then stack them on top of the rows of merged_df that were already complete.
manual19 = (
    pd.read_csv('data/2019rosterfinalfixed.csv')
    .drop(columns=['Team', 'Minutes'])
    .fillna(0)
)
fixed19 = pd.concat([manual19, merged_df], ignore_index=True, sort=False)
fixed19.to_csv('data/fixed19.csv')

In [47]:
# (rows, columns) after recombining the cleaned and the already-complete rows.
fixed19.shape

(493, 67)

## 2018 BR Stats

In [29]:
# Load both 2018-19 tables, using the first CSV column as the index.
season18, adv18 = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ('data/season1819.csv', 'data/ad_1819.csv')
)

In [30]:
# Same clean-up for both tables: drop unwanted columns, zero-fill missing
# values, and keep only the first row for each player.
season18 = (
    season18
    .drop(columns=['Unnamed: 29', 'url_list'])
    .fillna(0)
    .drop_duplicates(subset='Player', keep='first')
)
adv18 = (
    adv18
    .drop(columns=['Pos', 'Age', 'Tm', 'G', 'MP'])
    .fillna(0)
    .drop_duplicates(subset='Player', keep='first')
)

In [31]:
# Print both shapes, one per line, to compare the row counts before merging.
print(season18.shape, adv18.shape, sep='\n')

(531, 30)
(531, 23)


In [32]:
# Outer-join the two 2018-19 tables on 'Player' and show the resulting shape.
stats18 = pd.merge(season18, adv18, how='outer', on='Player')
stats18.shape

(531, 52)

## 2018 NBA Miner stats

In [25]:
# Load the four tables for this section, one DataFrame per CSV file.
misc, dist, shot_type, zone = map(
    pd.read_csv,
    [
        'data/nbam_misc1.csv',
        'data/shot_dist18.csv',
        'data/shot_type18.csv',
        'data/shot_zones18.csv',
    ],
)

In [26]:
# Fold the remaining three tables into `misc`, one outer join on 'Player' at a time.
miner = misc
for shot_table in (dist, shot_type, zone):
    miner = miner.merge(shot_table, how='outer', on='Player')

In [27]:
# Display the column labels of the combined miner frame.
miner.columns

Index(['Player', 'Games18', 'FB', 'PAINTPTS', 'TO_PTS', '2ND_PTS', 'Q1PTS',
       'Q2PTS', 'Q3PTS', 'Q4PTS', 'PPP', '>8ft%', '8-16ft%', '16-24ft%',
       '24+%', 'BCS%', '>8FTU', '8-16FTU', '16-24FTU', '24+U', 'BCU',
       'AVGDISTA', 'AVGDISTM', 'AVGDISTF', 'AFGM', 'JS', 'LU', 'DUN', 'TIPIN',
       'JS%', 'Layup%18', 'DUN%', '2PT_m', '3PT_m', 'FT_m', 'AB_3U', 'MRU',
       'paint_nonra', 'ra_usage', 'RC_3', 'LC_3', 'backcourt_usage', 'AB3%',
       'MR%', 'paint_nonra%', 'RA%', 'RC3%', 'LC3%', 'BC%'],
      dtype='object')

## combine 2018 stats

In [33]:
# Outer-join the combined 2018 stats with the miner stats and show the shape.
cstats18 = pd.merge(stats18, miner, how='outer', on='Player')
cstats18.shape

(595, 100)

In [34]:
# Set aside every row with at least one NaN, then keep only the complete rows.
cstats18problem = cstats18.loc[cstats18.isna().any(axis='columns')]
cstats18 = cstats18.dropna()

In [35]:
# Export the incomplete 2018 rows so they can be cleaned manually;
# the DataFrame index is written as the first column.
cstats18problem.to_csv('data/stat18_probdraft.csv')

In [36]:
# Import the cleaned data.
# NOTE(review): presumably the manually cleaned version of
# 'data/stat18_probdraft.csv' -- confirm.
stat18_clean = pd.read_csv('data/stat18_prob.csv')

In [37]:
# Recombine the re-imported rows with the rows of cstats18 that were already
# complete. ignore_index=True matches the other concat cells in this notebook
# and gives the result a fresh 0..n-1 index; without it the two inputs'
# index labels can collide and the duplicates end up in the exported CSV.
stat18_clean = pd.concat([stat18_clean, cstats18], ignore_index=True, sort=False)
stat18_clean.shape

(530, 100)

In [38]:
# Tag every column with '18', then add an unsuffixed copy of the player name
# as a trailing 'Player' column to use as the merge key.
# NOTE(review): columns that already end in '18' (e.g. 'Games18', 'Layup%18')
# become 'Games1818' / 'Layup%1818' -- confirm that this is acceptable.
stat18_clean = (
    stat18_clean
    .add_suffix('18')
    .assign(Player=lambda frame: frame['Player18'])
)

In [39]:
# Display the column labels after the '18' suffix was applied.
stat18_clean.columns

Index(['Player18', 'Pos18', 'Age18', 'Tm18', 'G18', 'GS18', 'MP18', 'FG18',
       'FGA18', 'FG%18',
       ...
       'AB3%18', 'MR%18', 'paint_nonra%18', 'RA%18', 'RC3%18', 'LC3%18',
       'BC%18', ' 18', ' .118', 'Player'],
      dtype='object', length=101)

In [40]:
# Export the suffixed 2018 frame to CSV; the DataFrame index is written as
# the first column.
stat18_clean.to_csv('data/stat18_clean.csv')

## Putting it together - 2018 + 2019 stats

In [48]:
# Outer-join the 2019 frame with the suffixed 2018 frame, then with `miner`.
# NOTE(review): `miner` was already merged into cstats18, which feeds
# stat18_clean, so this second merge appears to add the same stats again
# without the '18' suffix -- confirm this is intended.
both_seasons = pd.merge(fixed19, stat18_clean, how='outer', on='Player')
stat2year = pd.merge(both_seasons, miner, how='outer', on='Player')

In [49]:
# (rows, columns) of the combined two-season frame.
stat2year.shape

(674, 215)

In [50]:
# Export every row with at least one NaN, then keep only the complete rows.
problem2year = stat2year.loc[stat2year.isna().any(axis='columns')]
problem2year.to_csv('data/problem2year.csv')
stat2year = stat2year.dropna()

In [54]:
# Re-import the two-season rows and zero-fill any remaining gaps.
# NOTE(review): 'data/combfinal2year.csv' is presumably the manually cleaned
# version of 'data/problem2year.csv' -- confirm.
fixed19 = pd.read_csv('data/combfinal2year.csv')
fixed19 = fixed19.fillna(0)
# Append the rows of stat2year that were already complete. The original cell
# appended merged_df (the 2019-only frame), which looks like a copy-paste from
# the 2019 cell: it would re-add 2019 rows that have none of the 2018 columns.
fixed19 = pd.concat([fixed19, stat2year], ignore_index = True, sort= False)
# NOTE(review): the variable name and output path are kept unchanged for
# compatibility, but this overwrites the 2019-only 'data/fixed19.csv' written
# earlier in the notebook -- consider a dedicated two-season file name.
fixed19.to_csv('data/fixed19.csv')