In [150]:
import pandas as pd

# load the file
pymoli = "Resources/purchase_data.csv"

# Read Purchasing File and store into Pandas data frame
pymoli_df = pd.read_csv(pymoli).dropna()
pymoli_df.head().style.hide_index()

Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,Ithergue48,24,Male,92,Final Critic,4.88
3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,Iskosia90,23,Male,131,Fury,1.44


In [151]:
# Create player demographics dataframe for later use
demographics = pymoli_df[["Gender", "SN", "Age"]].drop_duplicates()

# Create player_count variable for later use
player_count = pymoli_df["SN"].nunique()

total_players_df = pd.DataFrame({"Total Players": [player_count]}).style.hide_index()
total_players_df

Total Players
576


In [152]:
#unique values core each category
#note there are 780 purchase IDs but only 576 unique screen names(i.e. unique people that bought items)
pymoli_df.nunique()


Purchase ID    780
SN             576
Age             39
Gender           3
Item ID        179
Item Name      179
Price          145
dtype: int64

In [153]:
#Purchasing Analysis (Total)
unique_items = pymoli_df["Item ID"].nunique()
average_price = pymoli_df["Price"].mean()
number_purchases = pymoli_df["Purchase ID"].nunique()
total_revenue = pymoli_df["Price"].sum()


Values = pd.DataFrame({"Number of Unique Items": [unique_items], 
                       "Average Price": [average_price], 
                       "Number of Purchases": [number_purchases], 
                       "Total Revenue": [total_revenue]})

Values["Average Price"] = Values["Average Price"].map('${:,.2f}'.format)
Values["Total Revenue"] = Values["Total Revenue"].map('${:,.2f}'.format)

Values

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,"$2,379.77"


In [154]:
#Gender data frame, gives you purchase count by gender
#this data frame has no duplicates 
gender = demographics["Gender"].value_counts()
gender

Male                     484
Female                    81
Other / Non-Disclosed     11
Name: Gender, dtype: int64

In [155]:
#Gender Demographics
percent_male_players = (484 / 576)*100
count_male_players = 484
percent_female_players = (81 / 576)*100
count_female_players = 81
percent_other_players = (11 / 576)*100
count_other_players = 11

#create gender values data frame
gender_values = pd.DataFrame(columns = ["Total Players", "Percentage of Players"], 
                             index = ["Male", "Female", "Other / Non-Disclosed"])

gender_values.loc["Male"] = [count_male_players, percent_male_players]
gender_values.loc["Female"] = [count_female_players, percent_female_players]
gender_values.loc["Other / Non-Disclosed"] = [count_other_players, percent_other_players]

gender_values["Percentage of Players"] = gender_values["Percentage of Players"].map('{:,.2f}%'.format)

gender_values

Unnamed: 0,Total Players,Percentage of Players
Male,484,84.03%
Female,81,14.06%
Other / Non-Disclosed,11,1.91%


In [156]:
#Gender purchase count data frame, gives you purchase count by gender including duplicates
gender_count = pymoli_df["Gender"].value_counts()
gender_count

Male                     652
Female                   113
Other / Non-Disclosed     15
Name: Gender, dtype: int64

In [157]:
#groupby and perform calculations
gender_purchase_total = pymoli_df.groupby(["Gender"]).sum()["Price"].rename("Total Purchase Value")

gender_avg_price = pymoli_df.groupby(["Gender"]).mean()["Price"].rename("Average Purchase Price")

In [158]:
#calculations for summary table
purchase_count_male = 652
purchase_count_female = 113
purchase_count_other = 15
avg_price_male = gender_avg_price["Male"]
male_total_value = gender_purchase_total["Male"]
male_avg_total = male_total_value / count_male_players
avg_price_female = gender_avg_price["Female"]
female_total_value = gender_purchase_total["Female"]
female_avg_total = female_total_value / count_female_players
avg_price_other = gender_avg_price["Other / Non-Disclosed"]
other_total_value = gender_purchase_total["Other / Non-Disclosed"]
other_avg_total = other_total_value / count_other_players


In [163]:
# Purchasing Analysis (Gender)

purchase_summary = pd.DataFrame(columns = ["Purchase Count", "Avg Purchase Price", 
                                           "Total Purchase Value", "Avg Total Purchase per Person"], 
                             index = ["Male", "Female", "Other / Non-Disclosed"])

purchase_summary.loc["Male"] = [purchase_count_male, avg_price_male, male_total_value, male_avg_total]
purchase_summary.loc["Female"] = [purchase_count_female, avg_price_female, female_total_value, female_avg_total]
purchase_summary.loc["Other / Non-Disclosed"] = [purchase_count_other, avg_price_other, 
                                                 other_total_value, other_avg_total]

#format and map
purchase_summary["Avg Purchase Price"] = purchase_summary["Avg Purchase Price"].map('${:,.2f}'.format)
purchase_summary["Total Purchase Value"] = purchase_summary["Total Purchase Value"].map('${:,.2f}'.format)
purchase_summary["Avg Total Purchase per Person"] = purchase_summary["Avg Total Purchase per Person"].map('${:,.2f}'.format)

purchase_summary

Unnamed: 0,Purchase Count,Avg Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Male,652,$3.02,"$1,967.64",$4.07
Female,113,$3.20,$361.94,$4.47
Other / Non-Disclosed,15,$3.35,$50.19,$4.56


In [177]:
#Age Demographics

# Create bins for ages
bins = [0, 10, 15, 20, 25, 30, 35, 40, 125]

# Create labels for these bins
group_labels = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34",
                "35-39", "40+"]
    
    
# Slice the data and place it into bins
pd.cut(pymoli_df["Age"], bins, labels=group_labels).head()

0    15-19
1    35-39
2    20-24
3    20-24
4    20-24
Name: Age, dtype: category
Categories (8, object): [<10 < 10-14 < 15-19 < 20-24 < 25-29 < 30-34 < 35-39 < 40+]

In [178]:
pymoli_df["Age Group"] = pd.cut(pymoli_df["Age"], bins, labels=group_labels)
pymoli_df.head().style.hide_index()

Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,Age Group
0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,15-19
1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56,35-39
2,Ithergue48,24,Male,92,Final Critic,4.88,20-24
3,Chamassasya86,24,Male,100,Blindscythe,3.27,20-24
4,Iskosia90,23,Male,131,Fury,1.44,20-24


In [201]:
# Create a GroupBy object based upon "group" and count number in each age group
age_total_count = pymoli_df.groupby(["Age Group"]).count()["Age"].rename("Total Count")
age_total_count

Age Group
<10       32
10-14     54
15-19    200
20-24    325
25-29     77
30-34     52
35-39     33
40+        7
Name: Total Count, dtype: int64

In [202]:
#DEBUG, NUMBERS ARE NOT CORRECT, SHOULD BE 17 PEOPLE <10


In [200]:
#Calculate Age Demographics totals and percentages:
under_ten_total = age_total_count["<10"]
age10to14_total = age_total_count["10-14"]
age15to19_total = age_total_count["15-19"]
age20to24_total = age_total_count["20-24"]
age25to29_total = age_total_count["25-29"]
age30to34_total = age_total_count["30-34"]
age35to39_total = age_total_count["35-39"]
age40_older_total = age_total_count["40+"]

percent_under_ten = (32 / 576)*100
percent_10to14 = (54 / 576)*100
percent_15to19 = (200 / 576)*100
percent_20to24 = (325 / 576)*100
percent_25to29 = (77 / 576)*100
percent_30to34 = (52 / 576)*100
percent_35to39 = (33 / 576)*100
percent_40_older = (7 / 576)*100

#create age demographic data frame
age_demographics = pd.DataFrame(columns = ["Total Count", "Percentage of Players"], 
                             index = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34",
                "35-39", "40+"])

age_demographics.loc["<10"] = [under_ten_total, percent_under_ten]
age_demographics.loc["10-14"] = [age10to14_total, percent_10to14]
age_demographics.loc["15-19"] = [age15to19_total, percent_15to19]
age_demographics.loc["20-24"] = [age20to24_total, percent_20to24]
age_demographics.loc["25-29"] = [age25to29_total, age25to29_total]
age_demographics.loc["30-34"] = [age30to34_total, percent_30to34]
age_demographics.loc["35-39"] = [age35to39_total, percent_35to39]
age_demographics.loc["40+"] = [age40_older_total, percent_40_older]

#format and map
age_demographics["Percentage of Players"] = age_demographics["Percentage of Players"].map('{:,.2f}%'.format)

age_demographics


Unnamed: 0,Total Count,Percentage of Players
<10,32,5.56%
10-14,54,9.38%
15-19,200,34.72%
20-24,325,56.42%
25-29,77,77.00%
30-34,52,9.03%
35-39,33,5.73%
40+,7,1.22%
