## Part 1: Explore the Data

Import the data and use Pandas to learn more about the dataset.

In [65]:
import pandas as pd

df = pd.read_csv('Resources/client_dataset.csv')

df.head()

Unnamed: 0,first,last,job,phone,email,client_id,order_id,order_date,order_week,order_year,item_id,category,subcategory,unit_price,unit_cost,unit_weight,qty,line_number
0,Donald,Harding,Immunologist,793-904-7725x39308,harding.donald.7185@sullivan.com,58515,8953482,2023-04-28,17,2023,EUD29711-63-6U,decor,wall art,1096.8,762.71,7.5,105,1
1,Tiffany,Myers,Music therapist,201.442.4543x942,myers.t.6537@ferguson-johnson.net,37609,8069089,2023-05-19,20,2023,XDA18116-89-4A,consumables,pens,24.95,15.09,1.49,21,0
2,Shannon,Watson,Immunologist,687.737.9424x8503,swatson8146@payne.net,57113,1902144,2023-01-29,4,2023,ABE59463-05-7E,software,project management,13.52,7.86,1.68,39,6
3,Nathan,Baker,Accounting technician,827-788-8123x012,bakernathan@benson.com,46554,9031802,2023-04-25,17,2023,ZMM00836-65-0C,consumables,pens,36.42,24.85,1.23,29,3
4,Christina,Schwartz,Chiropractor,265-829-3643,christinaschwartz9252@mcconnell.com,92089,1322274,2023-05-28,21,2023,BZX55559-12-3X,consumables,misc,195.1,108.17,46.43,20,1


In [66]:
df.shape


(54639, 18)

In [67]:
# View the column names in the data

df.columns
df.dtypes


first           object
last            object
job             object
phone           object
email           object
client_id        int64
order_id         int64
order_date      object
order_week       int64
order_year       int64
item_id         object
category        object
subcategory     object
unit_price     float64
unit_cost      float64
unit_weight    float64
qty              int64
line_number      int64
dtype: object

In [68]:
# Use the describe function to gather some basic statistics

df.describe()


Unnamed: 0,client_id,order_id,order_week,order_year,unit_price,unit_cost,unit_weight,qty,line_number
count,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0
mean,54837.869416,5470190.0,11.359139,2022.993064,136.267207,99.446073,5.004116,570.2646,2.979667
std,25487.438231,2599807.0,7.023499,0.082997,183.873135,133.164267,5.326599,18795.52,2.43632
min,10033.0,1000886.0,1.0,2022.0,0.01,0.01,0.0,0.0,0.0
25%,33593.0,3196372.0,6.0,2023.0,20.8,14.84,1.45,32.0,1.0
50%,53305.0,5496966.0,11.0,2023.0,68.31,49.89,3.24,68.0,3.0
75%,78498.0,7733869.0,17.0,2023.0,173.16,125.57,6.89,170.0,5.0
max,99984.0,9998480.0,52.0,2023.0,1396.23,846.27,46.43,3958244.0,9.0


In [69]:
# Check if 'age' column exists in the DataFrame
if 'age' in df.columns:
    df['age'].describe()
else:
    print("The 'age' column does not exist in the DataFrame.")


The 'age' column does not exist in the DataFrame.


In [70]:
# What three item categories had the most entries?

if 'category' in df.columns:
    top_categories = df['category'].value_counts().head(3)
    top_categories
else:
    print("The 'category' column does not exist in the DataFrame.")
print(top_categories)


category
consumables    23538
furniture      11915
software        8400
Name: count, dtype: int64


In [71]:
# For the category with the most entries,
# which subcategory had the most entries?

if 'category' in df.columns and 'subcategory' in df.columns:
    top_category = df['category'].value_counts().idxmax()
    subcategory_counts = df[df['category'] == top_category]['subcategory'].value_counts()
    top_subcategory = subcategory_counts.idxmax()
    top_subcategory
else:
    print("The 'category' or 'subcategory' column does not exist in the DataFrame.")

print(top_category)
print(top_subcategory) 
    

consumables
bathroom supplies


In [72]:
# Which five clients had the most entries in the data?

top_clients = df.groupby(['first', 'last']).size().reset_index(name='count')
top_clients = top_clients.sort_values(by='count', ascending=False).head(5)
top_clients


Unnamed: 0,first,last,count
434,Jessica,Reyes,220
54,Angela,Everett,211
141,Bryan,Myers,209
22,Alexandra,Young,207
537,Kendra,Garrett,207


In [73]:
print(top_clients.columns)

Index(['first', 'last', 'count'], dtype='object')


In [94]:
# Store the client ids of those top 5 clients in a list.

top_clients_ids = df.groupby(['first', 'last'])['client_id'].unique().apply(lambda x: x[0]).head(5).tolist()
top_clients_ids



[19820, 92721, 44673, 21134, 85298]

In [95]:
def check_client_ids(df, top_clients_ids):
    return df['client_id'].isin(top_clients_ids)

# Use the function
print(check_client_ids(df, top_clients_ids))
for client_id in top_clients_ids:
    if client_id in df['client_id'].values:
        print(f"Value {client_id} is present in client_id.")
    else:
        print(f"Value {client_id} is not present in client_id.")


0        False
1        False
2        False
3        False
4        False
         ...  
54634    False
54635    False
54636    False
54637    False
54638    False
Name: client_id, Length: 54639, dtype: bool
Value 19820 is present in client_id.
Value 92721 is present in client_id.
Value 44673 is present in client_id.
Value 21134 is present in client_id.
Value 85298 is present in client_id.


In [75]:
# How many total units (the qty column) did the
# client with the most entries order order?

if 'client_id' in df.columns and 'qty' in df.columns:
    client_most_entries = df['client_id'].value_counts().idxmax()
    total_units = df[df['client_id'] == client_most_entries]['qty'].sum()
    total_units
else:
    print("The 'client_id' or 'qty' column does not exist in the DataFrame.")
print(total_units)

64313


## Part 2: Transform the Data
Do we know that this client spent the more money than client 66037? If not, how would we find out? Transform the data using the steps below to prepare it for analysis.

In [97]:
# Create a column that calculates the 
# subtotal for each line using the unit_price
# and the qty

df['subtotal'] = df['unit_price'] * df['qty']
print(df[[ 'unit_price', 'qty' , 'subtotal']])


       unit_price   qty   subtotal
0         1096.80   105  115164.00
1           24.95    21     523.95
2           13.52    39     527.28
3           36.42    29    1056.18
4          195.10    20    3902.00
...           ...   ...        ...
54634       83.13    33    2743.29
54635      206.59    47    9709.73
54636       65.66   475   31188.50
54637        1.48   112     165.76
54638        3.01  1031    3103.31

[54639 rows x 3 columns]


In [98]:
# Create a column for shipping price.
# Assume a shipping price of $7 per pound
# for orders over 50 pounds and $10 per
# pound for items 50 pounds or under.


# Calculate the shipping price based on weight
df['shipping_price'] = df['unit_weight'].apply(lambda x: 7 if x < 50 else 10)

# Print order id, weight, and shipping price
print(df[['order_id', 'unit_weight', 'shipping_price']])




       order_id  unit_weight  shipping_price
0       8953482         7.50               7
1       8069089         1.49               7
2       1902144         1.68               7
3       9031802         1.23               7
4       1322274        46.43               7
...         ...          ...             ...
54634   9021716         2.25               7
54635   6290153        11.70               7
54636   8692622         4.16               7
54637   7592730        18.04               7
54638   7489403         2.07               7

[54639 rows x 3 columns]


In [99]:
# Create a column for the total price
# using the subtotal and the shipping price
# along with a sales tax of 9.25%

df['total_price'] = df['subtotal'] + df['shipping_price']
df['total_price'] = df['total_price'] + (df['total_price'] * 0.0925)

# Print order id, subtotal, shipping price, and total price
print(df[['order_id', 'subtotal', 'shipping_price', 'total_price']])


       order_id   subtotal  shipping_price    total_price
0       8953482  115164.00               7  125824.317500
1       8069089     523.95               7     580.062875
2       1902144     527.28               7     583.700900
3       9031802    1056.18               7    1161.524150
4       1322274    3902.00               7    4270.582500
...         ...        ...             ...            ...
54634   9021716    2743.29               7    3004.691825
54635   6290153    9709.73               7   10615.527525
54636   8692622   31188.50               7   34081.083750
54637   7592730     165.76               7     188.740300
54638   7489403    3103.31               7    3398.013675

[54639 rows x 4 columns]


In [100]:
# Create a column for the cost
# of each line using unit cost, qty, and
# shipping price (assume the shipping cost
# is exactly what is charged to the client).

df['cost'] = df['unit_cost'] * df['qty'] + df['shipping_price']
print(df[['order_id', 'unit_cost', 'qty', 'shipping_price', 'cost']])

       order_id  unit_cost   qty  shipping_price      cost
0       8953482     762.71   105               7  80091.55
1       8069089      15.09    21               7    323.89
2       1902144       7.86    39               7    313.54
3       9031802      24.85    29               7    727.65
4       1322274     108.17    20               7   2170.40
...         ...        ...   ...             ...       ...
54634   9021716      51.60    33               7   1709.80
54635   6290153     175.46    47               7   8253.62
54636   8692622      57.31   475               7  27229.25
54637   7592730       1.22   112               7    143.64
54638   7489403       1.61  1031               7   1666.91

[54639 rows x 5 columns]


In [101]:
# Create a column for the profit of
# each line using line cost and line price

df['profit'] = df['total_price'] - df['cost']
print(df[['order_id', 'total_price', 'cost', 'profit']])

       order_id    total_price      cost        profit
0       8953482  125824.317500  80091.55  45732.767500
1       8069089     580.062875    323.89    256.172875
2       1902144     583.700900    313.54    270.160900
3       9031802    1161.524150    727.65    433.874150
4       1322274    4270.582500   2170.40   2100.182500
...         ...            ...       ...           ...
54634   9021716    3004.691825   1709.80   1294.891825
54635   6290153   10615.527525   8253.62   2361.907525
54636   8692622   34081.083750  27229.25   6851.833750
54637   7592730     188.740300    143.64     45.100300
54638   7489403    3398.013675   1666.91   1731.103675

[54639 rows x 4 columns]


## Part 3: Confirm your work
You have email receipts showing that the total prices for 3 orders. Confirm that your calculations match the receipts. Remember, each order has multiple lines.

Order ID 2742071 had a total price of \$152,811.89

Order ID 2173913 had a total price of \$162,388.71

Order ID 6128929 had a total price of \$923,441.25


In [102]:
# Check your work using the totals above
order_ids = [2742071, 2173913, 6128929]
receipts = df[df['order_id'].isin(order_ids)][['order_id', 'total_price']]
receipts

# Compare email receipts with total price
email_receipts = {
    2742071: 152811.89,
    2173913: 162388.71,
    6128929: 923441.25
}

for order_id, total_price in email_receipts.items():
    if order_id in receipts['order_id'].values:
        receipt_total_price = receipts.loc[receipts['order_id'] == order_id, 'total_price'].values[0]
        if total_price == receipt_total_price:
            print(f"Order ID {order_id} total price matches the email receipt.")
        else:
            print(f"Order ID {order_id} total price does not match the email receipt.")
    else:
        print(f"Order ID {order_id} not found in the receipts.")

# Print the total price for the orders
for order_id in order_ids:
    if order_id in receipts['order_id'].values:
        total_price = receipts.loc[receipts['order_id'] == order_id, 'total_price'].values[0]
        print(f"Total price for Order ID {order_id}: {total_price}")
    else:
        print(f"Order ID {order_id} not found in the receipts.")


Order ID 2742071 total price does not match the email receipt.
Order ID 2173913 total price does not match the email receipt.
Order ID 6128929 total price does not match the email receipt.
Total price for Order ID 2742071: 477.892275
Total price for Order ID 2173913: 8440.54575
Total price for Order ID 6128929: 9720.791875


## Part 4: Summarize and Analyze
Use the new columns with confirmed values to find the following information.

In [104]:

def check_client_ids(df, top_clients_ids):
    """
    Check if the client IDs in the DataFrame match the given list of top client IDs.

    Parameters:
    - df (DataFrame): The DataFrame containing the client IDs.
    - top_clients_ids (list): The list of top client IDs to check against.

    Returns:
    - Series: A boolean Series indicating whether each client ID is in the top client IDs list.
    """
    return df['client_id'].isin(top_clients_ids)

# Use the function
print(check_client_ids(df, top_clients_ids))

# Iterate over the top client IDs
for client_id in top_clients_ids:
    if client_id in df['client_id'].values:
        print(f"Value {client_id} is present in client_id.")
    else:
        print(f"Value {client_id} is not present in client_id.")

0        False
1        False
2        False
3        False
4        False
         ...  
54634    False
54635    False
54636    False
54637    False
54638    False
Name: client_id, Length: 54639, dtype: bool
Value 19820 is present in client_id.
Value 92721 is present in client_id.
Value 44673 is present in client_id.
Value 21134 is present in client_id.
Value 85298 is present in client_id.


In [106]:
# How much did each of the top 5 clients by quantity
# spend? Check your work from Part 1 for client ids.

top_clients_spending = df[df['client_id'].isin(top_clients_ids)].groupby('client_id')['total_price'].sum()
top_clients_spending



client_id
19820    9.098275e+05
21134    3.594165e+06
44673    2.698066e+07
85298    3.296025e+05
92721    1.142979e+06
Name: total_price, dtype: float64

In [107]:
# Create a summary DataFrame showing the totals for the
# for the top 5 clients with the following information:
# total units purchased, total shipping price,
# total revenue, and total profit. Sort by total profit.

# Calculate the totals for each client
top_clients_summary = df[df['client_id'].isin(top_clients_ids)].groupby('client_id').agg({
    'qty': 'sum',
    'shipping_price': 'sum',
    'total_price': 'sum',
    'profit': 'sum'
})

# Sort the summary DataFrame by total profit
top_clients_summary = top_clients_summary.sort_values('profit', ascending=False)

# Display the summary DataFrame
top_clients_summary


Unnamed: 0_level_0,qty,shipping_price,total_price,profit
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
44673,118251,889,26980660.0,13259190.0
21134,54365,602,3594165.0,1564078.0
92721,11297,259,1142979.0,412156.8
19820,8946,189,909827.5,267946.8
85298,4948,252,329602.5,102383.5


In [109]:
# Format the data and rename the columns
# to names suitable for presentation.
# Currency should be in millions of dollars.

# Format the data
df_formatted = df.copy()

# Convert currency to millions of dollars
df_formatted['total_price'] = df_formatted['total_price'] / 1000000
df_formatted['cost'] = df_formatted['cost'] / 1000000
df_formatted['profit'] = df_formatted['profit'] / 1000000

# Rename the columns
df_formatted = df_formatted.rename(columns={'total_price': 'Total Price (Millions)',
                                            'cost': 'Cost (Millions)',
                                            'profit': 'Profit (Millions)'})

df_formatted



Unnamed: 0,first,last,job,phone,email,client_id,order_id,order_date,order_week,order_year,...,unit_price,unit_cost,unit_weight,qty,line_number,subtotal,shipping_price,Total Price (Millions),Cost (Millions),Profit (Millions)
0,Donald,Harding,Immunologist,793-904-7725x39308,harding.donald.7185@sullivan.com,58515,8953482,2023-04-28,17,2023,...,1096.80,762.71,7.50,105,1,115164.00,7,0.125824,0.080092,0.045733
1,Tiffany,Myers,Music therapist,201.442.4543x942,myers.t.6537@ferguson-johnson.net,37609,8069089,2023-05-19,20,2023,...,24.95,15.09,1.49,21,0,523.95,7,0.000580,0.000324,0.000256
2,Shannon,Watson,Immunologist,687.737.9424x8503,swatson8146@payne.net,57113,1902144,2023-01-29,4,2023,...,13.52,7.86,1.68,39,6,527.28,7,0.000584,0.000314,0.000270
3,Nathan,Baker,Accounting technician,827-788-8123x012,bakernathan@benson.com,46554,9031802,2023-04-25,17,2023,...,36.42,24.85,1.23,29,3,1056.18,7,0.001162,0.000728,0.000434
4,Christina,Schwartz,Chiropractor,265-829-3643,christinaschwartz9252@mcconnell.com,92089,1322274,2023-05-28,21,2023,...,195.10,108.17,46.43,20,1,3902.00,7,0.004271,0.002170,0.002100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54634,Lauren,Reese,Radio producer,977.877.5272x11382,late_reese_4081@montoya-chavez.org,29043,9021716,2023-04-26,17,2023,...,83.13,51.60,2.25,33,8,2743.29,7,0.003005,0.001710,0.001295
54635,Derrick,Moore,Musician,358.661.5483,derrick.moore.2602@pope.info,41908,6290153,2023-02-08,6,2023,...,206.59,175.46,11.70,47,0,9709.73,7,0.010616,0.008254,0.002362
54636,Monica,Gutierrez,Graphic designer,294.805.9100x339,gutierrezm3195@morris.org,35176,8692622,2023-03-05,9,2023,...,65.66,57.31,4.16,475,6,31188.50,7,0.034081,0.027229,0.006852
54637,Wanda,Solomon,Toxicologist,(311)767-4924,solomonwanda5962@ross.org,24485,7592730,2023-01-18,3,2023,...,1.48,1.22,18.04,112,0,165.76,7,0.000189,0.000144,0.000045


In [110]:
# Sort the updated data by "Total Profit" form highest to lowest
df_formatted.sort_values('Profit (Millions)', ascending=False, inplace=True)
df_formatted


Unnamed: 0,first,last,job,phone,email,client_id,order_id,order_date,order_week,order_year,...,unit_price,unit_cost,unit_weight,qty,line_number,subtotal,shipping_price,Total Price (Millions),Cost (Millions),Profit (Millions)
21801,Julie,Anderson,Exhibition designer,5933099462,anderson_better_4704@bell.biz,78965,3833568,2023-04-29,17,2023,...,147.66,100.11,0.09,3958244,1,5.844743e+08,7,638.538190,396.259814,2.422784e+02
15112,Dennis,Hawkins,"Social research officer, government",001-214-888-5838x7410,hawkins.dennis.5195@lee.com,85931,7263857,2023-03-05,9,2023,...,323.42,183.84,2.21,637318,3,2.061214e+08,7,225.187624,117.164548,1.080231e+02
23929,Michele,Mcdonald,Emergency planning/management officer,435-931-4939x95137,mcdonaldm317@hess-jackson.com,38473,4594154,2023-03-02,9,2023,...,674.94,568.06,2.25,219645,5,1.482472e+08,7,161.960070,124.771546,3.718852e+01
25972,Amy,Stewart,"Psychologist, educational",979.793.4884x57205,stewartamy@francis.com,86721,6057452,2023-03-21,12,2023,...,100.40,73.05,9.22,977121,3,9.810295e+07,7,107.177479,71.378696,3.579878e+01
4063,Kendra,Garrett,Pension scheme manager,808.257.2029,garrettkendra8479@jefferson.com,24741,5167984,2023-05-08,19,2023,...,461.30,247.94,1.86,122166,7,5.635518e+07,7,61.568037,30.289845,3.127819e+01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,Thomas,Campbell,Health and safety inspector,001-271-367-9375x56843,campbellthomas1892@peterson.biz,26601,6757056,2023-02-23,8,2023,...,30.42,17.36,7.90,0,5,0.000000e+00,7,0.000008,0.000007,6.475000e-07
27658,Zachary,Johnson,Ophthalmologist,(898)506-5200,zacharyjohnson@jacobson.net,54518,3697709,2023-03-29,13,2023,...,49.64,27.06,11.14,0,4,0.000000e+00,7,0.000008,0.000007,6.475000e-07
4112,Sheri,Norman,"Administrator, charities/voluntary organisations",(577)810-1248x6961,norman_sheri@duke.biz,51367,8101215,2023-05-25,21,2023,...,23.07,16.66,5.13,0,0,0.000000e+00,7,0.000008,0.000007,6.475000e-07
33731,Sarah,Richardson,"Exhibitions officer, museum/gallery",2056834802,richardsonsmile4695@merritt.com,79361,3370363,2023-04-19,16,2023,...,1096.80,762.71,7.50,0,4,0.000000e+00,7,0.000008,0.000007,6.475000e-07
