# Import libraries and read raw file

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
# Location of the raw users dataset inside the Kaggle input directory
RAW_DATA_PATH = '/kaggle/input/ecommerce-users-of-a-french-c2c-fashion-store/6M-0K-99K.users.dataset.public.csv'

# Read the raw CSV into a pandas DataFrame
df = pd.read_csv(RAW_DATA_PATH)


## Generate a Pandas Profiling Report
### It is a must-have for any initial analysis!

In [None]:
# Generate a profile report
# Builds the pandas-profiling report object for the raw DataFrame with the
# library's `explorative` option switched on. The object is only assigned
# here; this cell neither displays it nor writes it to disk.
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)

# Save the report as an HTML file
# (export is currently disabled; uncomment the line below to write 'report.html')
#profile.to_file('report.html')

The Pandas Profiling report revealed that there are 200 distinct countries but only 199 distinct country codes. Let's dive into this discrepancy and find a way to resolve it.

# Fix some issues (inaccuracies in the country data, typos)
## Prepare the data for export to Tableau and dig a bit deeper into potential insights

In [None]:
# Narrow the frame to the two columns that describe a user's country
user_char = df.loc[:, ["country", "countryCode"]]

# Count the distinct values in each of those two columns
unique_counts = user_char.nunique()

# For every country code, count how many distinct country names use it,
# then keep only the codes that are shared by more than one name
names_per_code = df.groupby("countryCode")["country"].nunique().to_frame("n")
shared_code = names_per_code[names_per_code["n"] > 1]

# List the country names attached to those shared codes
uses_shared_code = df["countryCode"].isin(shared_code.index)
unique_countries = df.loc[uses_shared_code, "country"].unique()
unique_countries
# we can drop country column now

In [None]:
# Let's drop some non-value-added columns.
# Note: `df` is reassigned, so re-running this cell without reloading the data
# raises a KeyError because the columns are already gone.
columns_to_drop = [
    'country',
    "seniorityAsMonths",
    "seniorityAsYears",
    "identifierHash",
    "type",
    "civilityTitle",
]
df = df.drop(columns=columns_to_drop)


In [None]:
# Improve readability of the column names: each original column name is
# mapped to a shorter display name. Entries are grouped by theme; the grouping
# has no effect on the rename itself.
new_names = dict(
    # user profile
    language='Language',
    gender='Gender',
    civilityGenderId='Civility',
    countryCode='CountryCode',
    hasProfilePicture='HasProfilePicture',
    # social activity
    socialNbFollowers='Followers',
    socialNbFollows='Following',
    socialProductsLiked='Likes',
    # product activity
    productsListed='Listings',
    productsSold='Sales',
    productsPassRate='PassRate',
    productsWished='Wishlist',
    productsBought='Purchases',
    # app usage
    hasAnyApp='HasApp',
    hasAndroidApp='HasAndroid',
    hasIosApp='HasIOS',
    # account activity
    daysSinceLastLogin='LastLogin',
    seniority='Seniority',
)

df = df.rename(columns=new_names)


In [None]:
# Remove the typo: the value 737028 in LastLogin is treated as erroneous and
# is replaced by the largest of the remaining LastLogin values.
LAST_LOGIN_TYPO = 737028
max_val = df.loc[df["LastLogin"] != LAST_LOGIN_TYPO, "LastLogin"].max()
df["LastLogin"] = df["LastLogin"].replace(to_replace=LAST_LOGIN_TYPO, value=max_val)

In [None]:
# Save the processed file.
# Note: with pandas' default `index=True`, the row index is also written as an
# unnamed first column of the CSV.
OUTPUT_CSV = 'challenge.csv'
df.to_csv(OUTPUT_CSV)

In [None]:
# Who is this outlier? Show the row(s) whose Following count is exactly 13764.
df.loc[df["Following"] == 13764]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Extract the 'dayssincelastlogin' column
days_since_last_login = df['LastLogin']

# Create a histogram
plt.hist(days_since_last_login, bins=50, edgecolor='black')

# Add labels and title
plt.xlabel('Days Since Last Login')
plt.ylabel('Number of Users')
plt.title('Histogram of Days Since Last Login')

# Display the histogram
plt.show()


* Insight: The iOS platform's share of items sold differs noticeably from its share of items bought. Of all items sold on the platform, 64.2% (7,727 out of 12,027) were sold through the iOS app, whereas only 45.3% (7,668 out of 17,006) of all items bought were purchased through it. This suggests that iOS users are relatively more active as sellers than as buyers.


* Caveat: The purchase, sales, and listing figures in the dataset may cover different timeframes. For instance, some purchases might have been made before the dataset's start date, while the sales and listings data include only transactions within the dataset's timeframe.


In [None]:
# Let's see how much these countries generate: keep only the users whose
# CountryCode is one of the six codes listed below.
selected_country_codes = ['fr', 'it', 'gb', 'us', 'es', 'de']
in_selected_countries = df['CountryCode'].isin(selected_country_codes)
df_best = df.loc[in_selected_countries]


In [None]:
# Share of the overall totals that comes from the selected countries (df_best)
sales_ratio = df_best['Sales'].sum() / df['Sales'].sum()
purchasing_ratio = df_best['Purchases'].sum() / df['Purchases'].sum()

print('Sales ratio: ', sales_ratio)
print('Purchasing ratio: ', purchasing_ratio)

In [None]:
# Order the users from most to fewest Likes
sorted_df = df.sort_values(by='Likes', ascending=False)



In [None]:
# Let's have a closer look at the distribution: summary statistics for the
# three social columns, with percentiles in 0.5% steps from 0.5% to 99.5%.
social_columns = ['Likes', 'Followers', 'Following']
half_percent_steps = [step / 200 for step in range(1, 200)]
likes_quantiles = df[social_columns].describe(percentiles=half_percent_steps)

In [None]:
# Show the last 20 rows of the summary table
likes_quantiles.iloc[-20:]

In [None]:
# Social columns for the first 20 rows of the Likes-sorted frame
sorted_df.loc[:, ['Likes', 'Followers', 'Following']].iloc[:20]

In [None]:
%%HTML 
<!-- Embedded Tableau Public dashboard (challengeC2C/FinalDashboard). The noscript element holds a static-image fallback; the inline script sizes the object element from the container's offsetWidth and then loads Tableau's viz_v1.js just before it. -->
<div class='tableauPlaceholder' id='viz1681142144443' style='position: relative'><noscript><a href='#'><img alt='Final Dashboard ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;ch&#47;challengeC2C&#47;FinalDashboard&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='challengeC2C&#47;FinalDashboard' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;ch&#47;challengeC2C&#47;FinalDashboard&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en-US' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1681142144443');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.width='1016px';vizElement.style.height='991px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.width='1016px';vizElement.style.height='991px';} else { vizElement.style.width='100%';vizElement.style.height='2627px';}                     var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>