# Python Script for NBA Players data
### Alexander Williamson and Brenton Wilder
### Fall Semester 2020

In [128]:
# Load the dataset directly from repository
import pandas as pd

url = 'https://github.com/bwilder95/NBAplayers/blob/main/Players.csv?raw=true'
df = pd.read_csv(url,sep=',')
print(df.head())

   Unnamed: 0           Player  height  weight  \
0           0  Curly Armstrong   180.0    77.0   
1           1     Cliff Barker   188.0    83.0   
2           2    Leo Barnhorst   193.0    86.0   
3           3       Ed Bartels   196.0    88.0   
4           4      Ralph Beard   178.0    79.0   

                           collage    born   birth_city birth_state  
0               Indiana University  1918.0          NaN         NaN  
1           University of Kentucky  1921.0     Yorktown     Indiana  
2         University of Notre Dame  1924.0          NaN         NaN  
3  North Carolina State University  1925.0          NaN         NaN  
4           University of Kentucky  1927.0  Hardinsburg    Kentucky  


In [129]:
# Confirm shape of dataset is correct (3922 rows and 8 columns)
"""
Column 1 (index 0) is unamed
Column 2 (index 1) is Player
Column 3 (index 2) is collage
Column 4 (index 3) is height
Column 5 (index 4) is weight
Column 6 (index 5) is born
Column 7 (index 6) is birth_city
Column 8 (index 7) is birth_state
"""
df.shape

(3922, 8)

## Task 0: Explore names of NBA players

In [130]:
# Find Kawhi Leonard in our dataframe using a boolean mask
df[df['Player'].str.contains('Kawhi', case=False, na=False)]

Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
3491,3491,Kawhi Leonard,201.0,104.0,San Diego State University,1991.0,Los Angeles,California


In [131]:
# Modify pandas column header to fix collage to college. Also, replace blank column with ID.
df.rename(columns={'Unnamed: 0': 'ID', 'collage': 'College'}, inplace=True)

In [132]:
# Find number of NBA players from San Diego State University
df[df['College'].str.contains('San Diego', case=False, na=False)]


Unnamed: 0,ID,Player,height,weight,College,born,birth_city,birth_state
1064,1064,Stan Washington,193.0,86.0,University of San Diego,1952.0,Washington,District of Columbia
1314,1314,Joel Kramer,201.0,92.0,San Diego State University,1955.0,San Diego,California
1376,1376,Steve Malovic,208.0,104.0,San Diego State University,1956.0,Cleveland,Ohio
1657,1657,Michael Cage,206.0,101.0,San Diego State University,1962.0,West Memphis,Arkansas
3077,3077,Randy Holcomb,206.0,102.0,San Diego State University,1979.0,Chicago,Illinois
3491,3491,Kawhi Leonard,201.0,104.0,San Diego State University,1991.0,Los Angeles,California
3517,3517,Malcolm Thomas,206.0,102.0,San Diego State University,1988.0,Columbia,Missouri
3635,3635,Jamaal Franklin,196.0,86.0,San Diego State University,1991.0,Moreno Valley,California


In [133]:
print("Number of NBA players from SDSU = " + str(len(df[df['College'].str.contains('San Diego', case=False, na=False)])))

Number of NBA players from SDSU = 8


### What is the most common first name for all NBA players? What is the most common last name?

In [134]:
# To do this, we need to first split the Player column into two seperate columns.

df_names = df.copy()
df_names[['First','Last']] = df["Player"].str.split(" ", 1, expand=True)
print(df_names.head())

   ID           Player  height  weight                          College  \
0   0  Curly Armstrong   180.0    77.0               Indiana University   
1   1     Cliff Barker   188.0    83.0           University of Kentucky   
2   2    Leo Barnhorst   193.0    86.0         University of Notre Dame   
3   3       Ed Bartels   196.0    88.0  North Carolina State University   
4   4      Ralph Beard   178.0    79.0           University of Kentucky   

     born   birth_city birth_state  First       Last  
0  1918.0          NaN         NaN  Curly  Armstrong  
1  1921.0     Yorktown     Indiana  Cliff     Barker  
2  1924.0          NaN         NaN    Leo  Barnhorst  
3  1925.0          NaN         NaN     Ed    Bartels  
4  1927.0  Hardinsburg    Kentucky  Ralph      Beard  


### Next let's use mode function to find most frequent name!

In [135]:
print("Most frequent first name =  " + df_names.First.mode())
print("Most frequent last name =  " + df_names.Last.mode())

0    Most frequent first name =  John
dtype: object
0    Most frequent last name =  Williams
dtype: object


### For fun, let's see how many "John" and how many "Williams" we have in our dataset

In [136]:
df[df['Player'].str.match('John ', case=False, na=False)]

Unnamed: 0,ID,Player,height,weight,College,born,birth_city,birth_state
27,27,John Chaney,190.0,83.0,Louisiana State University,1920.0,,
75,75,John Hargis,188.0,81.0,University of Texas at Austin,1920.0,Nacogdoches,Texas
112,112,John Logan,188.0,79.0,Indiana University,1921.0,Richmond,Indiana
116,116,John Mahnken,203.0,99.0,Georgetown University,1922.0,New Jersey,New Jersey
117,117,John Mandic,193.0,92.0,Oregon State University,1919.0,Los Angeles,California
...,...,...,...,...,...,...,...,...
2891,2891,John Salmons,201.0,95.0,University of Miami,1979.0,Philadelphia,Pennsylvania
2993,2993,John Edwards,213.0,124.0,Kent State University,1981.0,Warren,Ohio
3449,3449,John Wall,193.0,88.0,University of Kentucky,1990.0,Raleigh,North Carolina
3554,3554,John Henson,211.0,103.0,University of North Carolina,1990.0,Greensboro,North Carolina


In [137]:
df[df['Player'].str.contains(' Williams', case=False, na=False)]

Unnamed: 0,ID,Player,height,weight,College,born,birth_city,birth_state
417,417,Bob Williams,198.0,104.0,Florida Agricultural and Mechanical University,1931.0,,
744,744,Art Williams,185.0,81.0,"California State Polytechnic University, Pomona",1939.0,Bonham,Texas
786,786,Cliff Williams,190.0,81.0,Bowling Green State University,1945.0,,
787,787,Ron Williams,190.0,85.0,West Virginia University,1944.0,Weirton,West Virginia
788,788,Sam Williams,190.0,81.0,University of Iowa,1945.0,,
...,...,...,...,...,...,...,...,...
3527,3527,Derrick Williams,190.0,90.0,University of Illinois at Urbana-Champaign,1984.0,Parkersburg,West Virginia
3528,3528,Elliot Williams,196.0,86.0,University of Memphis,1989.0,Memphis,Tennessee
3529,3529,Jordan Williams,188.0,83.0,New Mexico State University,1951.0,New Haven,Connecticut
3828,3828,Alan Williams,198.0,90.0,Drake University,1948.0,,


## Task 1: Create a word cloud in the shape of the NBA logo (https://www.datacamp.com/community/tutorials/wordcloud-python)

In [138]:
# Use kobe_bryant.png by loading raw image from github. This will be the silhouette of word cloud.




## Task 2: Use geocoding within geopandas to plot locations to a world map (https://geopandas.org/geocoding.html)

In [139]:
# using geopandas and geolocations

## Task 3: Analyze player height (cm) and player weight (kg). Are players taller than they were in the past?

In [140]:
# Task 3A: Create violin plots of the two columns.



In [141]:
# Task 3B: Is there any relationship between the year player was born and player height? (are NBA players getting taller)
# Group players by year born and create boxplots, with years on x-axis and height on y-axis.
