# C2C Fashion Store Data Study

In [5]:
import pandas as pd
# Published Google Sheets CSV export that holds the user-level dataset.
DATA_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRddBCGuZIZZBlpOx2WR_sD6IN9HF-Ah04p-BTSrOoAENWfP8zPQxm5TrKSr7ljABnZVhnPA2N_H3HI/pub?gid=304981720&single=true&output=csv"

# Load the whole sheet into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_URL)

In [6]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,identifierHash,type,country,language,socialNbFollowers,socialNbFollows,socialProductsLiked,productsListed,productsSold,productsPassRate,...,civilityTitle,hasAnyApp,hasAndroidApp,hasIosApp,hasProfilePicture,daysSinceLastLogin,seniority,seniorityAsMonths,seniorityAsYears,countryCode
0,-1.097895e+18,user,Royaume-Uni,en,147,10,77,26,174,74.0,...,mr,True,False,True,True,11,3196,106.53,8.88,gb
1,2.347567e+18,user,Monaco,en,167,8,2,19,170,99.0,...,mrs,True,False,True,True,12,3204,106.8,8.9,mc
2,6.870941e+18,user,France,fr,137,13,60,33,163,94.0,...,mrs,True,False,True,False,11,3203,106.77,8.9,fr
3,-4.640273e+18,user,Etats-Unis,en,131,10,14,122,152,92.0,...,mrs,True,False,True,False,12,3198,106.6,8.88,us
4,-5.175831e+18,user,Etats-Unis,en,167,8,0,25,125,100.0,...,mrs,False,False,False,True,22,2854,95.13,7.93,us


In [7]:
# Display basic information about the DataFrame (columns, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print ("\n")

# Convert the count/ratio columns to numeric types. errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
numeric_columns = ['socialNbFollowers', 'socialNbFollows', 'productsListed', 'productsSold', 'productsPassRate',
                   'socialProductsLiked', 'productsWished', 'productsBought', 'daysSinceLastLogin', 'seniorityAsMonths']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
print ("\n")
# Display summary statistics for the numeric columns
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98913 entries, 0 to 98912
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   identifierHash       98913 non-null  float64
 1   type                 98913 non-null  object 
 2   country              98913 non-null  object 
 3   language             98913 non-null  object 
 4   socialNbFollowers    98913 non-null  int64  
 5   socialNbFollows      98913 non-null  int64  
 6   socialProductsLiked  98913 non-null  int64  
 7   productsListed       98913 non-null  int64  
 8   productsSold         98913 non-null  int64  
 9   productsPassRate     98913 non-null  float64
 10  productsWished       98913 non-null  int64  
 11  productsBought       98913 non-null  int64  
 12  gender               98913 non-null  object 
 13  civilityGenderId     98913 non-null  int64  
 14  civilityTitle        98913 non-null  object 
 15  hasAnyApp            98913 non-null 

In [4]:
# Check for missing values (per-column null counts)
print(df.isnull().sum())

# Fill missing values with a per-column strategy:
#   * count-style columns (followers, follows, listed, sold, liked, wished,
#     bought) -> 0
#   * productsPassRate -> column mean
#   * daysSinceLastLogin, seniorityAsMonths -> column median
# NOTE(review): filling counts with 0 assumes a missing value means "no
# activity" - confirm this holds for the data source.
df = df.fillna({
    'socialNbFollowers': 0,
    'socialNbFollows': 0,
    'productsListed': 0,
    'productsSold': 0,
    'productsPassRate': df['productsPassRate'].mean(),
    'socialProductsLiked': 0,
    'productsWished': 0,
    'productsBought': 0,
    'daysSinceLastLogin': df['daysSinceLastLogin'].median(),
    'seniorityAsMonths': df['seniorityAsMonths'].median()
})

# Remove fully duplicated rows (the original index labels are kept)
df = df.drop_duplicates()

# Verify changes: total remaining nulls, then the schema report.
print(df.isnull().sum().sum())
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
df.info()


identifierHash         0
type                   0
country                0
language               0
socialNbFollowers      0
socialNbFollows        0
socialProductsLiked    0
productsListed         0
productsSold           0
productsPassRate       0
productsWished         0
productsBought         0
gender                 0
civilityGenderId       0
civilityTitle          0
hasAnyApp              0
hasAndroidApp          0
hasIosApp              0
hasProfilePicture      0
daysSinceLastLogin     0
seniority              0
seniorityAsMonths      0
seniorityAsYears       0
countryCode            0
dtype: int64
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 98913 entries, 0 to 98912
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   identifierHash       98913 non-null  float64
 1   type                 98913 non-null  object 
 2   country              98913 non-null  object 
 3   language             

In [8]:
# Summary statistics
print(df.describe())

print("--------------------")

# Distribution of users by country
country_counts = df['countryCode'].value_counts()
print(country_counts)

# Distribution of users by gender
gender_counts = df['gender'].value_counts()
print(gender_counts)

# Identify low-volume ("single touch") countries: every country code that has
# fewer than MIN_USERS_PER_COUNTRY user rows in the dataset.
MIN_USERS_PER_COUNTRY = 100
single_touch_countries = country_counts[country_counts < MIN_USERS_PER_COUNTRY].index

# Relabel those countries with one shared bucket. isin/mask does the
# replacement in a single vectorised pass instead of a Python call per row.
is_single_touch = df['countryCode'].isin(single_touch_countries)
df['countryCode'] = df['countryCode'].mask(is_single_touch, 'Singletouchpoints')

# Count the number of users per country, with all low-volume countries
# grouped under 'Singletouchpoints'
country_counts = df['countryCode'].value_counts()
print(country_counts)

# Count the number of users that ended up in the 'Singletouchpoints' bucket
singletouchpoints_count = df[df['countryCode'] == 'Singletouchpoints'].shape[0]
print(singletouchpoints_count)


       identifierHash  socialNbFollowers  socialNbFollows  \
count    9.891300e+04       98913.000000     98913.000000   
mean    -6.692039e+15           3.432269         8.425677   
std      5.330807e+18           3.882383        52.839572   
min     -9.223101e+18           3.000000         0.000000   
25%     -4.622895e+18           3.000000         8.000000   
50%     -1.337989e+15           3.000000         8.000000   
75%      4.616388e+18           3.000000         8.000000   
max      9.223331e+18         744.000000     13764.000000   

       socialProductsLiked  productsListed  productsSold  productsPassRate  \
count         98913.000000    98913.000000  98913.000000      98913.000000   
mean              4.420743        0.093304      0.121592          0.812303   
std             181.030569        2.050144      2.126895          8.500205   
min               0.000000        0.000000      0.000000          0.000000   
25%               0.000000        0.000000      0.000000    

In [9]:
# Split users by login recency:
#   active  -> last login at most 30 days ago
#   churned -> last login more than 180 days ago
# Users between the two cut-offs fall into neither group.
days_since_login = df['daysSinceLastLogin']
active_users = df.loc[days_since_login <= 30]
churned_users = df.loc[days_since_login > 180]

# Group sizes
num_active_users = active_users.shape[0]
num_churned_users = churned_users.shape[0]

print(f"Number of Active Users: {num_active_users}")
print(f"Number of Churned Users: {num_churned_users}")

# Descriptive statistics of each group, printed one after the other
active_stats = active_users.describe()
churned_stats = churned_users.describe()

print(active_stats, churned_stats, sep="\n")

Number of Active Users: 3999
Number of Churned Users: 89330
       identifierHash  socialNbFollowers  socialNbFollows  \
count    3.999000e+03        3999.000000      3999.000000   
mean     1.610274e+17           6.811453        14.193798   
std      5.351444e+18          16.116042       225.595138   
min     -9.221801e+18           3.000000         0.000000   
25%     -4.388727e+18           3.000000         8.000000   
50%      1.636080e+17           3.000000         8.000000   
75%      4.749572e+18           6.000000         9.000000   
max      9.221188e+18         744.000000     13764.000000   

       socialProductsLiked  productsListed  productsSold  productsPassRate  \
count          3999.000000     3999.000000   3999.000000       3999.000000   
mean             69.424856        1.807452      2.152538         12.116029   
std             883.554069        9.605861      9.765983         30.777281   
min               0.000000        0.000000      0.000000          0.000000   


In [10]:
# Hypothesis under test: churned users leave after selling or buying only one
# or two items. Check how much trading activity the churned group shows at all.
sold_by_churned = churned_users['productsSold']
bought_by_churned = churned_users['productsBought']

# Total items sold by the churned group
churned_selling_activity = sold_by_churned.sum()
print(f"Total products sold by churned customers: {churned_selling_activity}")

# Total items bought by the churned group
churned_buying_activity = bought_by_churned.sum()
print(f"Total products bought by churned customers: {churned_buying_activity}")

# Churned users with at least one sale or one purchase
has_traded = sold_by_churned.gt(0) | bought_by_churned.gt(0)
churned_users_with_activity = churned_users[has_traded]
num_churned_users_with_activity = len(churned_users_with_activity)
print(f"Number of churned users who have sold or bought at least one product: {num_churned_users_with_activity}")

# Share of the churned group that traded at least once, as a percentage
percentage_with_activity = num_churned_users_with_activity / len(churned_users) * 100
print(f"Percentage of churned users who have sold or bought at least one product: {percentage_with_activity:.2f}%")

Total products sold by churned customers: 1609
Total products bought by churned customers: 4391
Number of churned users who have sold or bought at least one product: 3436
Percentage of churned users who have sold or bought at least one product: 3.85%


In [None]:
# The vast majority of churned users (about 96.15%) never bought or sold a product. This points to low engagement among users who eventually churn, so our hypothesis that they leave after trading one or two items was wrong.

In [11]:
#User segmentation Using Clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'NoneType' object has no attribute 'split'". Nothing in this
# cell calls .split(), so the error presumably originates inside a library in
# the execution environment (an outdated threadpoolctl is a commonly reported
# cause with KMeans) - confirm and upgrade the environment if so.

# Select the behavioural features used for clustering
features = df[['socialNbFollowers', 'socialNbFollows', 'productsListed', 'productsSold', 'productsWished',
               'productsBought', 'daysSinceLastLogin', 'seniorityAsMonths']]

# Standardize the data so every feature contributes on a comparable scale
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply KMeans clustering. n_init is set explicitly so the number of restarts
# does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(scaled_features)

# Per-cluster means. numeric_only=True is required because df also holds text
# columns (type, country, language, ...); without it pandas >= 2.0 raises a
# TypeError when it tries to average them.
cluster_stats = df.groupby('cluster').mean(numeric_only=True)
print(cluster_stats)

# Visualize clusters on the listed-vs-sold plane
plt.figure(figsize=(10, 6))
sns.scatterplot(x='productsListed', y='productsSold', hue='cluster', data=df, palette='viridis')
plt.title('User Segmentation')
plt.show()


AttributeError: 'NoneType' object has no attribute 'split'

In [23]:
pip install streamlit


Collecting streamlitNote: you may need to restart the kernel to use updated packages.


ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

spyder 4.1.5 requires pyqt5<5.13; python_version >= "3", which is not installed.
spyder 4.1.5 requires pyqtwebengine<5.13; python_version >= "3", which is not installed.



  Downloading streamlit-1.35.0-py2.py3-none-any.whl (8.6 MB)
Collecting tenacity<9,>=8.1.0
  Downloading tenacity-8.4.1-py3-none-any.whl (27 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
Collecting blinker<2,>=1.0.0
  Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Collecting rich<14,>=10.14.0
  Downloading rich-13.7.1-py3-none-any.whl (240 kB)
Collecting pyarrow>=7.0
  Downloading pyarrow-16.1.0-cp38-cp38-win_amd64.whl (25.9 MB)
Collecting pandas<3,>=1.3.0
  Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl (10.8 MB)
Collecting pydeck<1,>=0.8.0b4
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
Collecting altair<6,>=4.0
  Downloading altair-5.3.0-py3-none-any.whl (857 kB)
Collecting cachetools<6,>=4.0
  Downloading cachetools-5.3.3-py3-none-any.whl (9.3 kB)
Collecting typing-extensions<5,>=4.3.0
  Downloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Collecting protobuf<5,>=3.20
  Downloading protobuf-4.25.3-cp

In [2]:
pip install streamlit


Note: you may need to restart the kernel to use updated packages.


In [3]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [12]:
# NOTE(review): the recorded output of this cell contains a "streamlit run ..."
# command hint, so these st.* calls are presumably meant to live in a script
# launched with `streamlit run` rather than in a notebook kernel - confirm.

# ---- Prepare everything the dashboard displays -----------------------------
summary_stats = df.describe()
country_counts = df['countryCode'].value_counts()
gender_counts = df['gender'].value_counts()

# Same recency cut-offs as the earlier analysis: <= 30 days active, > 180 churned
last_login_days = df['daysSinceLastLogin']
active_users = df.loc[last_login_days <= 30]
churned_users = df.loc[last_login_days > 180]

# Mean of socialProductsLiked over users with at least one product sold.
# NOTE(review): whether this matches the "likes before a product is sold"
# label depends on what socialProductsLiked records - confirm.
sold_mask = df['productsSold'] > 0
average_likes_before_sold = df.loc[sold_mask, 'socialProductsLiked'].mean()

# ---- Render the page, section by section -----------------------------------
st.title("E-commerce User Insights")

st.header("Summary Statistics")
st.write(summary_stats)

st.header("Distribution of Users by Country")
st.bar_chart(country_counts)

st.header("Distribution of Users by Gender")
st.bar_chart(gender_counts)

st.header("Active and Churned Users Analysis")
st.write("Active users: ", len(active_users))
st.write("Churned users: ", len(churned_users))

st.header("Common Themes in Active Users")
st.write(active_users.describe())

st.header("Common Themes in Churned Users")
st.write(churned_users.describe())

st.header("Average Likes Before a Product is Sold")
st.write("Average likes before a product is sold: ", average_likes_before_sold)


# E-commerce User Insights

## Summary Statistics

Unnamed: 0,identifierHash,socialNbFollowers,socialNbFollows,socialProductsLiked,productsListed,productsSold,productsPassRate,productsWished,productsBought,civilityGenderId,daysSinceLastLogin,seniority,seniorityAsMonths,seniorityAsYears
count,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0,98913.0
mean,-6692039000000000.0,3.432269,8.425677,4.420743,0.093304,0.121592,0.812303,1.562595,0.171929,1.773993,581.291236,3063.77187,102.125583,8.510424
std,5.330807e+18,3.882383,52.839572,181.030569,2.050144,2.126895,8.500205,25.192793,2.332266,0.428679,208.855888,168.298621,5.609735,0.467863
min,-9.223101e+18,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,2852.0,95.07,7.92
25%,-4.622895e+18,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,572.0,2857.0,95.23,7.94
50%,-1337989000000000.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,694.0,3196.0,106.53,8.88
75%,4.616388e+18,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,702.0,3201.0,106.7,8.89
max,9.223331e+18,744.0,13764.0,51671.0,244.0,174.0,100.0,2635.0,405.0,3.0,709.0,3205.0,106.83,8.9


## Distribution of Users by Country

2024-06-20 00:32:00.338 
  command:

    streamlit run C:\Users\aumda\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


## Distribution of Users by Gender

## Active and Churned Users Analysis

Active users: 

3999

Churned users: 

89330

## Common Themes in Active Users

Unnamed: 0,identifierHash,socialNbFollowers,socialNbFollows,socialProductsLiked,productsListed,productsSold,productsPassRate,productsWished,productsBought,civilityGenderId,daysSinceLastLogin,seniority,seniorityAsMonths,seniorityAsYears
count,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0
mean,1.610274e+17,6.811453,14.193798,69.424856,1.807452,2.152538,12.116029,22.677669,2.355589,1.823206,16.967992,3062.949237,102.098177,8.508072
std,5.351444e+18,16.116042,225.595138,883.554069,9.605861,9.765983,30.777281,109.719604,10.729802,0.429061,5.78703,168.580727,5.619161,0.468666
min,-9.221801e+18,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,2852.0,95.07,7.92
25%,-4.388727e+18,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,2857.0,95.23,7.94
50%,1.63608e+17,3.0,8.0,6.0,0.0,0.0,0.0,1.0,0.0,2.0,15.0,3196.0,106.53,8.88
75%,4.749572e+18,6.0,9.0,29.0,0.0,0.0,0.0,6.0,1.0,2.0,21.0,3201.0,106.7,8.89
max,9.221188e+18,744.0,13764.0,51671.0,244.0,174.0,100.0,2635.0,405.0,3.0,30.0,3205.0,106.83,8.9


## Common Themes in Churned Users

Unnamed: 0,identifierHash,socialNbFollowers,socialNbFollows,socialProductsLiked,productsListed,productsSold,productsPassRate,productsWished,productsBought,civilityGenderId,daysSinceLastLogin,seniority,seniorityAsMonths,seniorityAsYears
count,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0,89330.0
mean,-1.507444e+16,3.228938,8.045752,0.938945,0.011194,0.018012,0.166641,0.374902,0.049155,1.768208,637.134557,3064.05792,102.135117,8.511223
std,5.332793e+18,1.193795,0.83048,15.338327,0.418681,0.403141,3.862292,4.68938,0.508579,0.429004,125.845855,168.228241,5.60739,0.467668
min,-9.223101e+18,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,181.0,2852.0,95.07,7.92
25%,-4.637017e+18,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,648.0,2857.0,95.23,7.94
50%,-1.347877e+16,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,696.0,3196.0,106.53,8.88
75%,4.615968e+18,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,702.0,3201.0,106.7,8.89
max,9.223331e+18,54.0,117.0,2823.0,102.0,63.0,100.0,580.0,73.0,3.0,709.0,3205.0,106.83,8.9


## Average Likes Before a Product is Sold

Average likes before a product is sold: 

70.34970530451866