# Rankings .csv cleaning

### Imports

In [341]:
import pandas as pd
import numpy as np

rankings = pd.read_csv("./datasets/rankings_1973-2017_csv.csv")

rankings.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,week_title,week_year,week_month,week_day,rank_text,rank_number,move_positions,move_direction,player_age,ranking_points,tourneys_played,player_url,player_slug,player_id
0,2017.11.20,2017,11,20,1,1,,,31.0,10645,18,/en/players/rafael-nadal/n409/overview,rafael-nadal,n409
1,2017.11.20,2017,11,20,2,2,,,36.0,9605,17,/en/players/roger-federer/f324/overview,roger-federer,f324
2,2017.11.20,2017,11,20,3,3,3.0,up,26.0,5150,23,/en/players/grigor-dimitrov/d875/overview,grigor-dimitrov,d875
3,2017.11.20,2017,11,20,4,4,1.0,down,20.0,4610,25,/en/players/alexander-zverev/z355/overview,alexander-zverev,z355
4,2017.11.20,2017,11,20,5,5,1.0,down,24.0,4015,27,/en/players/dominic-thiem/tb69/overview,dominic-thiem,tb69


### Cleaning

In [388]:
# filter by year 2017
rankings = rankings.loc[rankings["week_year"] == 2017]


# filter by first and last months of year where ranking was recorded
rankings = rankings.loc[(rankings["week_month"] == 11) | (rankings["week_month"] == 1)]


# check last day of month that rankings were recorded
# jan
end_jan = rankings.loc[rankings["week_month"] == 1, "week_day"].unique()[0]

# nov
end_nov = rankings.loc[rankings["week_month"] == 11, "week_day"].unique()[0]


# get rankings from beginning of year and end of year
november_filter = (rankings["week_month"] == 11) & (rankings["week_day"] == end_nov)
january_filter = (rankings["week_month"] == 1) & (rankings["week_day"] == end_jan)


# create dataframes for each
nov_rankings = rankings.loc[november_filter]
jan_rankings = rankings.loc[january_filter]


# get ranking difference between january and november
def ranking_diff(name):
    '''
    returns rank change for a player from January to November.
    Positive number: player has moved up n spots in ranking
    Negative number: player has moved down n spots in ranking
    '''
    nov_ranking = nov_rankings.loc[nov_rankings["player_slug"] == name, "rank_number"]
    jan_ranking = jan_rankings.loc[jan_rankings["player_slug"] == name, "rank_number"]
    rank_change = nov_ranking.values - jan_ranking.values
    if rank_change < 0:
        return abs(rank_change)
    elif rank_change > 0:
        return rank_change * -1
    else:
        return 0
    

# get ranking change for all players who ended year with a ranking
nov_rankings["rank_change"] = nov_rankings["player_slug"].map(ranking_diff)


# copy to new dataframe
p_rankings = nov_rankings.copy()


# drop irrelevant columns
drop_columns = ["week_year", "week_month", "week_day", "move_positions", "move_direction", "ranking_points",
                "tourneys_played", "player_url", "week_title", "rank_text"]

p_rankings.drop(columns = drop_columns, inplace = True)


# rename columns
p_rankings.rename(columns = {
    "player_slug": "player",
    "rank_number": "rank",
    "player_age": "age"
}, inplace = True)


# change rankings to integers
p_rankings["rank_change"] = p_rankings["rank_change"].astype(int)


p_rankings.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,rank,age,player,player_id,rank_change
0,1,31.0,rafael-nadal,n409,5
1,2,36.0,roger-federer,f324,8
2,3,26.0,grigor-dimitrov,d875,10
3,4,20.0,alexander-zverev,z355,18
4,5,24.0,dominic-thiem,tb69,3


In [391]:
p_rankings.to_csv("./datasets/rankings_2017.csv")