In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration for pandas, applied in one pass.
display_options = {
    'display.max_columns': 100,
    'display.max_rows': 100,
    'display.width': 150,
    'display.max_colwidth': None,
    'future.no_silent_downcasting': True,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
sns.set()

# Load the raw sessions file (path is relative to the notebook's working directory).
df = pd.read_csv('data.csv')

# Normalise headers: lower-case, spaces replaced by underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

print(df.shape)
df.head()

(1012, 15)


Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code
0,529697267522,United States,iPhone,социальные сети,2019-05-01 00:06:40,2019-05-01 00:07:06,26.0,2019-05-01,5,3,0,2019-05-01 00:06:40,9999.0,Mobile payments,0.0
1,601292388085,United States,PC,organic,2019-05-01 06:56:16,2019-05-01 07:09:18,782.0,2019-05-01,5,3,7,,,,
2,852898876338,United States,Mac,социальные сети,2019-05-01 04:30:45,2019-05-01 04:34:56,251.0,2019-05-01,5,3,4,,,,
3,998513020664,United States,iPhone,социальные сети,2019-05-01 18:53:42,2019-05-01 18:57:35,233.0,2019-05-01,5,3,18,,,,
4,240702200943,United States,Mac,социальные сети,2019-05-02 14:04:32,2019-05-02 14:09:51,319.0,2019-05-02,5,4,14,,,,


# Set Proper Data Types

In [2]:
# Target dtype for every column, declared by dtype group and then flattened
# into the {column: dtype} mapping that DataFrame.astype expects.
columns_by_dtype = {
    'int64': ['user_id', 'month', 'day', 'hour_of_day'],
    'float64': ['sessiondurationsec', 'revenue', 'promo_code'],
    'object': ['region', 'device', 'channel', 'payment_type'],
    'datetime64[ns]': ['session_start', 'session_end', 'session_date', 'order_dt'],
}

dtype_mapping = {
    column_name: dtype_name
    for dtype_name, column_names in columns_by_dtype.items()
    for column_name in column_names
}

df = df.astype(dtype_mapping)

# Drop Obvious Duplicates

In [18]:
# Show rows that exactly repeat an earlier row (the first occurrence is not flagged).
df.loc[df.duplicated(keep='first')]

Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code
68,768710666,United States,iPhone,реклама у блогеров,2019-05-17 18:24:36,2019-05-17 18:28:05,209.0,2019-05-17,5,5,18,2019-05-17 18:24:36,9999.0,Mobile payments,0.862715
196,134214602428,United States,Android,социальные сети,2019-06-11 02:35:48,2019-06-11 03:45:47,4199.0,2019-06-11,6,2,3,2019-06-11 02:35:48,4999.0,Mobile payments,1.0
600,16254498222,,,,2019-08-25 13:21:44,2019-08-25 13:43:23,1299.0,2019-08-25,8,7,13,2019-08-25 13:21:55,4999.0,Cash,1.0


In [None]:
# Count fully identical rows, report the number, then remove them in place.
# The original index labels are kept (no reset), so gaps appear where rows were dropped.
duplicates = df.duplicated().sum()
print(f'Number of complete duplicates: {duplicates}')
df.drop_duplicates(inplace=True)

# Custom Functions

In [None]:
def inspect_data(df):
    """Print a structural and statistical overview of a DataFrame.

    Output, in order: the ``DataFrame.info()`` report, ``describe()`` summaries
    for numeric, object and datetime columns, and the frame's shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # info() writes its own report and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    df.info()

    sections = (
        ("\nSummary Statistics For Numeric Column:\n", [np.number]),
        ("\nSummary Statistics For Object Column:\n", ['object']),
        ("\nSummary Statistics For Datetime Column:\n", ['datetime']),
    )
    for heading, dtypes in sections:
        # describe(include=...) raises ValueError when no column matches,
        # so skip the summary for dtype groups that are absent from the frame.
        if df.select_dtypes(include=dtypes).shape[1] == 0:
            print(heading, "(no columns of this type)")
            continue
        print(heading, df.describe(include=dtypes))

    print("\nShape:", df.shape)

def plot_missing_values(df):
    """Draw a heatmap of the DataFrame's missing-value mask and show it."""
    missing_mask = df.isna()
    heatmap_axes = sns.heatmap(missing_mask, cmap="viridis", cbar=False)
    heatmap_axes.set_title("Missing Values Heatmap")
    plt.show()

def plot_category(df, column, save=False):
    """Draw an annotated bar chart of value counts for one column.

    Missing values are replaced by the label 'unknown' before counting, so
    they show up as their own bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column to count; also used in the title, the x-axis label
        and the output file name.
    save : bool, default False
        When True, write the figure to ``./Graphs/<column>.png`` (300 dpi)
        before showing it. The directory is created if it does not exist.
    """
    data = df[column].fillna('unknown')
    counts = data.value_counts(dropna=False)

    plt.figure(figsize=(12,6))
    sns.barplot(x=counts.index, y=counts.values, hue=counts.index,  palette="viridis", legend=False)

    # Write each bar's count just above the bar.
    for index, value in enumerate(counts.values):
        plt.text(index, value + 0.5, str(value), ha='center', va='bottom', fontsize=10, color='black')

    plt.title(f"Counts of {column}", fontsize=24)
    plt.xlabel(column.capitalize(), fontsize=14, labelpad=10)
    plt.ylabel('Count', fontsize=14)

    if save:
        # Local import: only needed on the save path.
        from pathlib import Path

        # savefig does not create missing directories, so make sure it exists.
        output_dir = Path('./Graphs')
        output_dir.mkdir(parents=True, exist_ok=True)
        plt.savefig(fname=output_dir / f'{column}.png', format='png', bbox_inches='tight', dpi=300)

    plt.show()

# Inspect Dataframe

In [None]:
# Print info(), per-dtype describe() summaries and the shape of the frame as it stands at this point.
inspect_data(df)

# Fix Typos

In [3]:
# Map misspelled or inconsistently-cased region labels to their canonical form.
# NOTE(review): some keys render identically to their replacement, so they
# presumably contain look-alike non-Latin characters — confirm, and copy these
# literals rather than retyping them.
corrections = {
    'Frаnce': 'France',
    'Frаncе': 'France',
    'Franсe': 'France',
    'Unjted States': 'United States',
    'germany': 'Germany',
    'UК': 'UK'
}

df['region'] = df['region'].replace(corrections)

# Device labels: unify capitalisation.
corrections = {
    'android': 'Android'
}

df['device'] = df['device'].replace(corrections)

# Channel labels: fix a misspelled channel name.
corrections = {
    'контексная реклама': 'контекстная реклама'
}

df['channel'] = df['channel'].replace(corrections)

# Check Unique Values

In [None]:
# Print the frequency table (NaN included) of every column, one section per column.
for column, column_values in df.items():
    header = f'----- {column.upper()} -----'
    print(header)
    print(column_values.value_counts(dropna=False))
    print()

# PROMO CODE

In [4]:
# Distinct promo_code values, in order of first appearance (NaN included).
pd.unique(df['promo_code'])

array([0.        ,        nan, 1.        , 0.86271506])

In [None]:
# The unique values above include entries other than 0 and 1 (NaN and 0.86271506).
# Count how many rows carry each value, keeping NaN as its own bucket.

df['promo_code'].value_counts(dropna=False)

In [5]:
# Show the rows whose promo_code is neither 0, 1 nor missing.

expected_promo_values = [1, 0, np.nan]
has_expected_promo = df['promo_code'].isin(expected_promo_values)
df[~has_expected_promo]

Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code
67,768710666,United States,iPhone,реклама у блогеров,2019-05-17 18:24:36,2019-05-17 18:28:05,209.0,2019-05-17,5,5,18,2019-05-17 18:24:36,9999.0,Mobile payments,0.862715
68,768710666,United States,iPhone,реклама у блогеров,2019-05-17 18:24:36,2019-05-17 18:28:05,209.0,2019-05-17,5,5,18,2019-05-17 18:24:36,9999.0,Mobile payments,0.862715
495,12480325582,United States,Mac,социальные сети,2019-08-10 06:26:44,2019-08-10 06:49:56,1392.0,2019-08-10,8,6,6,2019-08-10 06:26:44,4999.0,E-wallet,0.862715


In [None]:
# Round promo_code to the nearest whole number (so 0.862715 becomes 1),
# then show the resulting value counts, NaN included.

rounded_promo = df['promo_code'].round(0)
df['promo_code'] = rounded_promo
df['promo_code'].value_counts(dropna=False)

# SESSION DURATION

In [None]:
# Round session duration to whole seconds; the column stays float (NaN is preserved by round()).
df['sessiondurationsec'] = df['sessiondurationsec'].round()

# Handling Missing Values

In [None]:
# Heatmap of missing cells before any imputation.
plot_missing_values(df)

In [17]:
# Missing values in region, device and channel are handled first.
# Start by looking at the sessions of users that appear exactly twice.

user_counts = df['user_id'].value_counts()
appears_twice = user_counts.eq(2)
user_ids_twice = user_counts[appears_twice].index

sessions_of_twice_users = df[df['user_id'].isin(user_ids_twice)]
sessions_of_twice_users.sort_values(by=['user_id', 'session_start']).head(6)

Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code
67,768710666,United States,iPhone,реклама у блогеров,2019-05-17 18:24:36,2019-05-17 18:28:05,209.0,2019-05-17,5,5,18,2019-05-17 18:24:36,9999.0,Mobile payments,0.862715
68,768710666,United States,iPhone,реклама у блогеров,2019-05-17 18:24:36,2019-05-17 18:28:05,209.0,2019-05-17,5,5,18,2019-05-17 18:24:36,9999.0,Mobile payments,0.862715
599,16254498222,,,,2019-08-25 13:21:44,2019-08-25 13:43:23,1299.0,2019-08-25,8,7,13,2019-08-25 13:21:55,4999.0,Cash,1.0
600,16254498222,,,,2019-08-25 13:21:44,2019-08-25 13:43:23,1299.0,2019-08-25,8,7,13,2019-08-25 13:21:55,4999.0,Cash,1.0
408,61219447121,UK,PC,контекстная реклама,2019-07-23 03:37:05,2019-07-23 03:43:28,383.0,2019-07-23,7,2,3,2019-07-23 03:37:11,4999.0,Cash,0.0
564,61219447121,UK,PC,контекстная реклама,2019-08-18 08:16:27,2019-08-18 08:28:59,752.0,2019-08-18,8,7,8,2019-08-18 08:16:33,4999.0,Cash,0.0


In [11]:
# Working assumption: a user keeps the same region, device and channel across
# visits, so a missing value can be taken from an earlier row of the same user.
# The fill is forward-only and follows the current row order; rows with no
# earlier non-missing value for that user stay NaN.

r_d_c = ['region', 'device', 'channel']
df[r_d_c] = df.groupby('user_id')[r_d_c].ffill()

In [12]:
# Are there any NaN values left in region, device or channel?
# Use any(axis=1) so a row is listed when at least one of the three columns is
# still missing; all(axis=1) would hide rows where only some of them are NaN.

df[df[r_d_c].isna().any(axis=1)]

Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code
599,16254498222,,,,2019-08-25 13:21:44,2019-08-25 13:43:23,1299.0,2019-08-25,8,7,13,2019-08-25 13:21:55,4999.0,Cash,1.0
600,16254498222,,,,2019-08-25 13:21:44,2019-08-25 13:43:23,1299.0,2019-08-25,8,7,13,2019-08-25 13:21:55,4999.0,Cash,1.0


In [None]:
# Some rows are still missing region/device/channel after the per-user fill.
# Impute each of the three columns with its most frequent value
# (mode()[0] is the first mode when several values tie).

column_modes = {col: df[col].mode()[0] for col in r_d_c}
df.fillna(value=column_modes, inplace=True)

In [None]:
# Re-draw the missing-value heatmap to check the result of the imputation above.

plot_missing_values(df)

# Handle Outliers in Revenue Column

In [None]:
# Bar chart of revenue value counts (NaN shown as 'unknown') before outlier handling; not saved to disk.
plot_category(df, 'revenue', save=False)

In [None]:
# Clamp revenue into [4999, 9999]: larger values become 9999, smaller ones 4999.
# Missing revenue stays NaN.
df['revenue'] = df['revenue'].clip(lower=4999, upper=9999)

plot_category(df, 'revenue', save=False)

# Create New Feature With Discount Applied

`добавить столбец с итоговой суммой покупки с учетом применения
промокода на скидку 10%`

In [None]:
# Final price: 10% off the revenue when promo_code equals 1, the unchanged revenue otherwise
# (rows without revenue stay NaN).
promo_applied = df['promo_code'] == 1
discounted_revenue = df['revenue'] * 0.9
df['final_price'] = df['revenue'].mask(promo_applied, discounted_revenue)

# Create New Feature: Time of the Day

`добавить столбец с указанием времени суток визита (утро 06:00-09:59,
день 10:00-16:59, вечер 17:00-21:59, ночь 22:00-05:59)`

In [None]:
def get_time_of_day(hour):
    """Map an hour of the day to a named part of the day.

    Parameters
    ----------
    hour : int or float
        Hour of the day (0-23). May be missing (None / NaN).

    Returns
    -------
    str or float
        'morning' for 6-9, 'day' for 10-16, 'evening' for 17-21 and 'night'
        for every other non-missing value (22-23 and 0-5). NaN when ``hour``
        is missing, so an unknown hour is not silently labelled 'night'.
    """
    # NaN fails every comparison below and would fall through to 'night'.
    if pd.isna(hour):
        return np.nan

    if 6 <= hour < 10:
        return 'morning'
    elif 10 <= hour < 17:
        return 'day'
    elif 17 <= hour < 22:
        return 'evening'
    else:
        return 'night'

# Label each session by the hour of its start time. The existing hour_of_day
# column is not used: in the sample shown after loading it does not always
# equal the start hour (e.g. session_start 06:56:16 has hour_of_day 7).
df['time_of_day'] = df['session_start'].dt.hour.map(get_time_of_day)

# Create New Feature: Did User Purchase

`добавить столбец “payer” с информацией о том, является ли
пользователь платящим или нет.`

In [None]:
# 1 when the row has a revenue value, 0 when revenue is missing.
# NOTE(review): the task text above asks for a column named "payer" describing
# the user; this flag is named 'sold' and is computed per row — confirm that
# this is the intended name and granularity.
df['sold'] = df['revenue'].notna().astype(int)


# Inspect Data Again

In [None]:
# Print the same overview again, now for the cleaned frame with the added columns.
inspect_data(df)

# Save Changes

In [None]:
# Write the processed frame to CSV without the index column.
# CSV does not store dtypes, so a reader of this file has to parse the datetime columns again.
df.to_csv('data_part_2.csv', index=False)