In [22]:
import pandas as pd

# load the file
pymoli = "Resources/purchase_data.csv"

# Read Purchasing File and store into Pandas data frame
pymoli_df = pd.read_csv(pymoli).dropna()
pymoli_df.head().style.hide_index()

Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,Ithergue48,24,Male,92,Final Critic,4.88
3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,Iskosia90,23,Male,131,Fury,1.44


In [23]:
# Create player demographics dataframe for later use
demographics = pymoli_df[["Gender", "SN", "Age"]].drop_duplicates()

# Create player_count variable for later use
player_count = pymoli_df["SN"].nunique()

total_players_df = pd.DataFrame({"Total Players": [player_count]}).style.hide_index()
total_players_df

Total Players
576


In [56]:
#unique values core each category
#note there are 780 purchase IDs but only 576 unique screen names(i.e. unique people that bought items)
pymoli_df.nunique()


Purchase ID    780
SN             576
Age             39
Gender           3
Item ID        179
Item Name      179
Price          145
dtype: int64

In [53]:
#Purchasing Analysis (Total)
unique_items = pymoli_df["Item ID"].nunique()
average_price = pymoli_df["Price"].mean()
number_purchases = pymoli_df["Purchase ID"].nunique()
total_revenue = pymoli_df["Price"].sum()


Values = pd.DataFrame({"Number of Unique Items": [unique_items], 
                       "Average Price": [average_price], 
                       "Number of Purchases": [number_purchases], 
                       "Total Revenue": [total_revenue]})

Values["Average Price"] = Values["Average Price"].map('${:,.2f}'.format)
Values["Total Revenue"] = Values["Total Revenue"].map('${:,.2f}'.format)

Values

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,"$2,379.77"


In [50]:
#Gender data frame, gives you purchase count by gender 
gender = demographics["Gender"].value_counts()
gender

Male                     484
Female                    81
Other / Non-Disclosed     11
Name: Gender, dtype: int64

In [58]:
#Gender Demographics
percent_male_players = (484 / 576)*100
count_male_players = 484
percent_female_players = (81 / 576)*100
count_female_players = 81
percent_other_players = (11 / 576)*100
count_other_players = 11

#create gender values data frame
gender_values = pd.DataFrame(columns = ["Total Players", "Percentage of Players"], 
                             index = ["Male", "Female", "Other / Non-Disclosed"])

gender_values.loc["Male"] = [count_male_players, percent_male_players]
gender_values.loc["Female"] = [count_female_players, percent_female_players]
gender_values.loc["Other / Non-Disclosed"] = [count_other_players, percent_other_players]

gender_values["Percentage of Players"] = gender_values["Percentage of Players"].map('{:,.2f}%'.format)

gender_values

Unnamed: 0,Total Players,Percentage of Players
Male,484,84.03%
Female,81,14.06%
Other / Non-Disclosed,11,1.91%


In [38]:
#Gender data frame, gives you purchase count by gender 
gender_count = pymoli_df["Gender"].value_counts()
gender_count

Male                     652
Female                   113
Other / Non-Disclosed     15
Name: Gender, dtype: int64

In [65]:
#create data frame summaries for each gender
male_summary = pymoli_df.loc[pymoli_df["Gender"] == "Male", :]
female_summary = pymoli_df.loc[pymoli_df["Gender"] == "Male", :]
other_summary = pymoli_df.loc[pymoli_df["Gender"] == "Male", :]


In [82]:
# Purchasing Analysis (Gender)
purchase_count_male = 652
purchase_count_female = 113
purchase_count_other = 15
avg_price_male = male_summary[["Price"]].mean()
male_total_value = male_summary[["Price"]].sum()
male_avg_total = male_total_value / purchase_count_male
avg_price_female = female_summary[["Price"]].mean()
female_total_value = female_summary[["Price"]].sum()
female_avg_total = female_total_value / purchase_count_female
avg_price_other = other_summary[["Price"]].mean()
other_total_value = other_summary[["Price"]].sum()
other_avg_total = other_total_value / purchase_count_other



purchase_summary = pd.DataFrame(columns = ["Purchase Count", "Avg Purchase Price", 
                                           "Total Purchaase Value", "Avg Total Purchase per Person"], 
                             index = ["Gender", "Male", "Female", "Other / Non-Disclosed"])

#GET RID OF NAN IN GENDER ROW
purchase_summary.loc["Male"] = [purchase_count_male, avg_price_male, male_total_value, male_avg_total]
purchase_summary.loc["Female"] = [purchase_count_female, avg_price_female, female_total_value, female_avg_total]
purchase_summary.loc["Other / Non-Disclosed"] = [purchase_count_other, avg_price_other, 
                                                 other_total_value, other_avg_total]


purchase_summary



Unnamed: 0,Purchase Count,Avg Purchase Price,Total Purchaase Value,Avg Total Purchase per Person
Gender,,,,
Male,652.0,Price 3.017853 dtype: float64,Price 1967.64 dtype: float64,Price 3.017853 dtype: float64
Female,113.0,Price 3.017853 dtype: float64,Price 1967.64 dtype: float64,Price 17.412743 dtype: float64
Other / Non-Disclosed,15.0,Price 3.017853 dtype: float64,Price 1967.64 dtype: float64,Price 131.176 dtype: float64


In [78]:
purchase_summary.dtypes

Purchase Count                   object
Avg Purchase Price               object
Total Purchaase Value            object
Avg Total Purchase per Person    object
dtype: object

In [None]:
#Age Demographics

#USE BINS!!!!!



# Create bins in which to place values based upon TED Talk views
#bins = [0, 199999, 399999, 599999, 799999, 999999,
        #1999999, 2999999, 3999999, 4999999, 50000000]

# Create labels for these bins
#group_labels = ["0 to 199k", "200k to 399k", "400k to 599k", "600k to 799k", "800k to 999k", "1mil to 2mil",
               # "2mil to 3mil", "3mil to 4mil", "4mil to 5mil", "5mil to 50mil"]
    
    
# Slice the data and place it into bins
#pd.cut(ted_df["views"], bins, labels=group_labels).head()