In [182]:
#Import dependencies
import pandas as pd

#Read in the csv file
datapath = "Resources/purchase_data.csv"
rawdata = pd.read_csv(datapath)



### purchase_data.CSV Data Sample

In [183]:
#Display data sample
rawdata.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


# Player Count
This cell uses the .agg() function to return the number of unique values from a single column of the data set

In [184]:
playercount = (
    rawdata.agg({'SN':'nunique'}) # Select columns from the data set, and apply aggregate functions
    .to_frame().rename({'SN': 'Total Players'}).T # Improve the format of the output
)
display(playercount)

Unnamed: 0,Total Players
0,576


# Purchasing Analysis (Total)

In [209]:
# Formatting looks great, (Purchasing Analysis (Total)), consider redoing this block using a list of values to return on price, and renaming, then dropping multiindex level 0
purchasing_analysis = rawdata.copy()
purchasing_analysis["Total Revenue"] = purchasing_analysis["Price"]
purchasing_analysis = (
    #Select columns from the data set, and apply aggregate functions
    purchasing_analysis.agg({"Purchase ID":'count',"Price":'mean',"Item ID":'nunique',"Total Revenue":'sum'})
    #improve the format of the output
    .rename({"Purchase ID":"Number of Purchases","Price":"Average Price","Item ID":"Number of Unique Items"})
    .to_frame().T
)

(purchasing_analysis.head().
 style.format({"Number of Purchases":int,"Average Price":"${:.2f}","Number of Unique Items":int,"Total Revenue":'${:.2f}'})
)

Unnamed: 0,Number of Purchases,Average Price,Number of Unique Items,Total Revenue
0,780,$3.05,179,$2379.77


# Gender Demographics

In [211]:
genderdemos = rawdata.copy().drop_duplicates(subset = "SN",ignore_index =True)
genderdemos=genderdemos["Gender"].value_counts().to_frame()
genderdemos["Percentage of Players"]= genderdemos["Gender"]/sum(genderdemos["Gender"])
genderdemos.head().style.format({"Percentage of Players": "{:.2%}"})

Unnamed: 0,Gender,Percentage of Players
Male,484,84.03%
Female,81,14.06%
Other / Non-Disclosed,11,1.91%


 # Purchasing Analysis (Gender)

In [176]:
genderpurchasing = rawdata.groupby("Gender").agg({"Purchase ID":"count","Price":["mean","sum"]})
#genderpurchasing["Average Price"] = rawdata.groupby("Gender")["Price"].mean().to_frame()
#genderpurchasing["Total Sales"] = rawdata.groupby("Gender")["Price"].sum().to_frame()
#gendercustomermean = rawdata.groupby("SN").agg({"Gender":'first','Price': 'sum'})
#genderpurchasing["Average Per Person"] = gendercustomermean.groupby("Gender").mean()

display(genderpurchasing)
genderpurchasing.columns

Unnamed: 0_level_0,Purchase ID,Price,Price
Unnamed: 0_level_1,count,mean,sum
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,113,3.203009,361.94
Male,652,3.017853,1967.64
Other / Non-Disclosed,15,3.346,50.19


MultiIndex([('Purchase ID', 'count'),
            (      'Price',  'mean'),
            (      'Price',   'sum')],
           )

In [None]:
# Age Demographics (Need to introduce variable reseting at the start of the cell to prevent errors) Need to format, clear temporary dataframes

In [None]:
agedata = rawdata.drop_duplicates(subset = "SN",ignore_index =True)
agebins = pd.IntervalIndex.from_tuples([(0,9),(10,14),(15,19),(20,24),(25,29),(30,34),(35,39),(40,200)],closed = "both")
binned_ages_df = pd.cut(agedata["Age"],agebins,ordered = True).to_frame()
agedemographics =binned_ages_df.value_counts().to_frame()
agedemographics["Percent of Players"]=agedemographics.iloc[:,0]/sum(agedemographics.iloc[:,0])
display(agedemographics)

In [None]:
# Purchasing Analysis (Age) Needs formatting and documentation, consider declaring variables to make the code easier to read, and spanning multiple rows with long lines
age_group_stats = rawdata.copy()
sliceframe = pd.cut(rawdata["Age"],agebins,ordered = True)
age_group_stats["Age"] = sliceframe
age_group_stats = age_group_stats.groupby("Age").agg({"Purchase ID":'count',"Price":'mean',})
age_group_stats["Total Purchase Value"] = age_group_stats["Purchase ID"] * age_group_stats["Price"]
age_group_stats["Average Per Person"] = rawdata.groupby("SN").agg({"Age":'first','Price': 'sum'}).groupby("Age").mean()
age_group_stats.head(10)


# Top Spenders,need to format the output

In [181]:
topspenders = rawdata.groupby("SN").agg({"Purchase ID":'count',"Price":'mean'})
topspenders["Total Purchase Value"] = topspenders["Purchase ID"]*topspenders["Price"] 
topspenders_sorted = topspenders.sort_values('Total Purchase Value',ascending = False)
topspenders_sorted.head()

Unnamed: 0_level_0,Purchase ID,Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lisosia93,5,3.792,18.96
Idastidru52,4,3.8625,15.45
Chamjask73,3,4.61,13.83
Iral74,4,3.405,13.62
Iskadarya95,3,4.366667,13.1


# Most Popular Items need to format the output

In [195]:
popularitems = rawdata.groupby(["Item ID","Item Name"]).agg({"Purchase ID":'count',"Price":'mean'})
popularitems["Total Purchase Value"] = popularitems["Purchase ID"] * popularitems["Price"]
popularitems_sorted = popularitems.sort_values("Purchase ID",ascending = False)
popularitems_sorted.head()

In [207]:
# Most Profitable Items need to format the output and clean the code
profitable_items = (
    popularitems.sort_values("Total Purchase Value",ascending = False)
    .rename({"Purchase ID":"Purchase Count"})
)
profitable_items.head().style.format({"Price":"${:.2f}","Total Purchase Value":"${:.2f}"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase ID,Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92,Final Critic,13,$4.61,$59.99
178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
82,Nirvana,9,$4.90,$44.10
145,Fiery Glass Crusader,9,$4.58,$41.22
103,Singed Scalpel,8,$4.35,$34.80
