# LEVEL BASED PERSONA, SIMPLE SEGMENTATION AND RULE BASED CLASSIFICATION

## Project Purpose
* Our goal is to group customers into level-based personas and then segment those personas.
* When a new customer arrives, we classify that customer into one of the personas we created.
* After classification, we determine which of these segments the new customer belongs to.

In [None]:
### BEFORE SEGMENTATION ###

# PRICE SOURCE  SEX COUNTRY AGE
#  39   android male bra    17
#  39   android male bra    17
#  49   android male bra    17
#  29   android male tur    17
#  49   android male tur    17

### AFTER SEGMENTATION ###

# CUSTOMER_LEVEL_BASED       PRICE
#   USA_ANDROID_MALE_0_18     3917
#   BRA_ANDROID_MALE_19_25    2606
#   USA_IOS_MALE_0_18         2496
#   USA_ANDROID_FEMALE_19_25  2190
#   DEU_IOS_FEMALE_0_18       2169

In [1]:
# Essential imports
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Load the raw persona records from CSV and preview the first rows.
# NOTE(review): the path below is absolute and machine-specific, so the notebook
# only re-runs where this exact file exists.
persona_csv_path = "/Users/aslihankalyonkat/Desktop/DSMLBC/datasets/persona.csv"
df = pd.read_csv(persona_csv_path)
df.head()

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [3]:
# Dimensions of the raw data as a (rows, columns) tuple
df.shape

(5000, 5)

In [4]:
# Number of rows for each distinct SOURCE value (most frequent first)
df["SOURCE"].value_counts()

android    2974
ios        2026
Name: SOURCE, dtype: int64

In [5]:
# Number of rows for each distinct PRICE value (most frequent first)
df["PRICE"].value_counts()

29    1305
39    1260
49    1031
19     992
59     212
9      200
Name: PRICE, dtype: int64

In [6]:
# Sum of PRICE per COUNTRY, displayed as a single-column table named TOTAL_PRICE
(
    df.groupby("COUNTRY")["PRICE"]
    .sum()
    .to_frame(name="TOTAL_PRICE")
)

Unnamed: 0_level_0,TOTAL_PRICE
COUNTRY,Unnamed: 1_level_1
bra,51354
can,7730
deu,15485
fra,10177
tur,15689
usa,70225


In [7]:
# Average PRICE for each (COUNTRY, SOURCE) pair.
# Note: this aggregates with the mean, not the sum.
(
    df.groupby(["COUNTRY", "SOURCE"])["PRICE"]
    .mean()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,PRICE
COUNTRY,SOURCE,Unnamed: 2_level_1
bra,android,34.387029
bra,ios,34.222222
can,android,33.330709
can,ios,33.951456
deu,android,33.869888
deu,ios,34.268817
fra,android,34.3125
fra,ios,32.776224
tur,android,36.229437
tur,ios,33.272727


In [8]:
# Sum of PRICE for every (COUNTRY, SOURCE, SEX, AGE) combination present in the data
(
    df.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])["PRICE"]
    .sum()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRICE
COUNTRY,SOURCE,SEX,AGE,Unnamed: 4_level_1
bra,android,female,15,1355
bra,android,female,16,1294
bra,android,female,17,642
bra,android,female,18,1387
bra,android,female,19,1021
...,...,...,...,...
usa,ios,male,42,242
usa,ios,male,50,156
usa,ios,male,53,68
usa,ios,male,55,29


In [9]:
# Same four-key PRICE totals as a frame, ordered from highest to lowest total
group_keys = ['COUNTRY', 'SOURCE', 'SEX', 'AGE']
price_totals = df.groupby(group_keys)['PRICE'].sum()
agg_df = price_totals.to_frame().sort_values(by='PRICE', ascending=False)
agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRICE
COUNTRY,SOURCE,SEX,AGE,Unnamed: 4_level_1
usa,android,male,15,3917
bra,android,male,19,2606
usa,ios,male,15,2496
usa,android,female,20,2190
deu,ios,female,16,2169


In [10]:
# Move the group keys (COUNTRY, SOURCE, SEX, AGE) out of the index into regular columns
agg_df = agg_df.reset_index()
agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE
0,usa,android,male,15,3917
1,bra,android,male,19,2606
2,usa,ios,male,15,2496
3,usa,android,female,20,2190
4,deu,ios,female,16,2169


In [11]:
# Bucket AGE into labelled bands.
# pd.cut uses right-closed intervals by default, so the edges below mean
# (0, 18], (18, 25], (25, 35], (35, 50], (50, 80]; ages outside (0, 80] become NaN.
age_bin_edges = [0, 18, 25, 35, 50, 80]
age_bin_labels = ['0_18', '19_25', '26_35', '36_50', '51_80']
agg_df["AGE_CAT"] = pd.cut(agg_df["AGE"], bins=age_bin_edges, labels=age_bin_labels)
agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,AGE_CAT
0,usa,android,male,15,3917,0_18
1,bra,android,male,19,2606,19_25
2,usa,ios,male,15,2496,0_18
3,usa,android,female,20,2190,19_25
4,deu,ios,female,16,2169,0_18


In [12]:
# Build the persona label by joining COUNTRY, SOURCE, SEX and AGE_CAT with "_"
# and upper-casing the result, e.g. "USA_ANDROID_MALE_0_18".
#
# The columns are listed explicitly rather than "everything except AGE and PRICE":
# with the exclusion form, re-running this cell would feed CUSTOMER_LEVEL_BASED
# back into its own definition (doubling every label), and any column added later
# would silently leak into the persona name.
persona_cols = ["COUNTRY", "SOURCE", "SEX", "AGE_CAT"]
agg_df['CUSTOMER_LEVEL_BASED'] = (
    agg_df[persona_cols]
    .agg("_".join, axis=1)   # row-wise join; raises if a part is missing (NaN)
    .str.upper()
)

agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,AGE_CAT,CUSTOMER_LEVEL_BASED
0,usa,android,male,15,3917,0_18,USA_ANDROID_MALE_0_18
1,bra,android,male,19,2606,19_25,BRA_ANDROID_MALE_19_25
2,usa,ios,male,15,2496,0_18,USA_IOS_MALE_0_18
3,usa,android,female,20,2190,19_25,USA_ANDROID_FEMALE_19_25
4,deu,ios,female,16,2169,0_18,DEU_IOS_FEMALE_0_18


In [13]:
# Final table: one row per persona label.
# PRICE going in is the per-(COUNTRY, SOURCE, SEX, AGE) total, so the value
# produced here is the mean of those per-age totals within each persona.
persona_mean_price = agg_df.groupby('CUSTOMER_LEVEL_BASED')['PRICE'].mean()
agg_df = persona_mean_price.reset_index()
agg_df.head()

Unnamed: 0,CUSTOMER_LEVEL_BASED,PRICE
0,BRA_ANDROID_FEMALE_0_18,1169.5
1,BRA_ANDROID_FEMALE_19_25,1063.428571
2,BRA_ANDROID_FEMALE_26_35,327.75
3,BRA_ANDROID_FEMALE_36_50,298.166667
4,BRA_ANDROID_FEMALE_51_80,185.0


In [14]:
# Split personas into four equal-sized groups by PRICE quartile.
# Labels are assigned in ascending PRICE order, so 'D' is the lowest quartile
# and 'A' the highest.
agg_df['SEGMENT'] = pd.qcut(agg_df['PRICE'], 4, labels=['D', 'C', 'B', 'A'])

# Per-segment summary of PRICE. SEGMENT is categorical, so `observed` is pinned
# explicitly: this keeps the historical behaviour (all categories listed) and
# avoids the FutureWarning newer pandas raises when the argument is omitted.
agg_df.groupby('SEGMENT', observed=False).agg({'PRICE': ['mean', 'max', 'sum']})

Unnamed: 0_level_0,PRICE,PRICE,PRICE
Unnamed: 0_level_1,mean,max,sum
SEGMENT,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
D,129.740741,185.0,3503.0
C,243.640667,298.166667,6091.016667
B,363.902793,472.0,9461.472619
A,917.976801,1992.5,23867.396825


In [15]:
# Look up the segment for a 33-year-old Turkish woman on Android
# (age 33 falls in the 26_35 band)
persona_key = 'TUR_ANDROID_FEMALE_26_35'
is_match = agg_df['CUSTOMER_LEVEL_BASED'] == persona_key
agg_df[is_match]

Unnamed: 0,CUSTOMER_LEVEL_BASED,PRICE,SEGMENT
69,TUR_ANDROID_FEMALE_26_35,413.75,B


In [16]:
# Look up the segment for a 35-year-old French woman on iOS
# (age 35 is the inclusive upper edge of the 26_35 band)
persona_key = 'FRA_IOS_FEMALE_26_35'
is_match = agg_df['CUSTOMER_LEVEL_BASED'] == persona_key
agg_df[is_match]

Unnamed: 0,CUSTOMER_LEVEL_BASED,PRICE,SEGMENT
60,FRA_IOS_FEMALE_26_35,286.0,C
