In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.width', 150)
pd.set_option('future.no_silent_downcasting', True)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
# Apply seaborn's default theme to every matplotlib figure drawn afterwards.
# set_theme() is the preferred interface; sns.set() is only an alias for it.
sns.set_theme()

In [2]:
# Read the raw export and normalise the headers: lower-case, spaces -> underscores.
df = pd.read_csv('data.csv')
normalised_headers = df.columns.str.lower().str.replace(' ', '_')
df.columns = normalised_headers

# Quick sanity check: frame dimensions, then the resulting column names.
print(df.shape, df.columns.tolist(), sep='\n')

(1012, 15)
['user_id', 'region', 'device', 'channel', 'session_start', 'session_end', 'sessiondurationsec', 'session_date', 'month', 'day', 'hour_of_day', 'order_dt', 'revenue', 'payment_type', 'promo_code']


In [3]:
# Show each column's dtype and non-null count before any cleaning is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012 entries, 0 to 1011
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             1012 non-null   int64  
 1   region              999 non-null    object 
 2   device              999 non-null    object 
 3   channel             999 non-null    object 
 4   session_start       1012 non-null   object 
 5   session_end         1012 non-null   object 
 6   sessiondurationsec  1012 non-null   float64
 7   session_date        1012 non-null   object 
 8   month               1012 non-null   int64  
 9   day                 1012 non-null   int64  
 10  hour_of_day         1012 non-null   int64  
 11  order_dt            285 non-null    object 
 12  revenue             285 non-null    float64
 13  payment_type        285 non-null    object 
 14  promo_code          285 non-null    float64
dtypes: float64(3), int64(4), object(8)
memory usage: 118.7+

In [4]:
# Columns that arrive as text but hold timestamps/dates.
dates = ['session_start', 'session_end', 'session_date', 'order_dt']
df[dates] = df[dates].astype(dict.fromkeys(dates, 'datetime64[ns]'))

# promo_code stays float after rounding (it is NaN for sessions without an order).
df['promo_code'] = df['promo_code'].round()
# Session duration is stored as a float; keep it as whole seconds.
df['sessiondurationsec'] = df['sessiondurationsec'].round().astype(int)

In [5]:
# Per-column lookup of bad spellings -> canonical value, applied with
# DataFrame.replace (outer key = column name, inner dict = {old: new}).
# NOTE(review): several 'region' keys render the same as their replacement;
# presumably they contain look-alike non-Latin characters. Copy/paste these
# keys rather than retyping them, or the mapping turns into a no-op.
corrections = {
    'region': {
        'Frаnce': 'France',
        'Frаncе': 'France',
        'Franсe': 'France',
        'Unjted States': 'United States',
        'germany': 'Germany',
        'UК': 'UK'
    },
    # Case normalisation only.
    'device': {'android': 'Android'},
    # Typo fix in the channel name (a missing letter in the first word).
    'channel': {'контексная реклама': 'контекстная реклама'}
}
# Mutates df in place; only the columns named above are touched.
df.replace(corrections, inplace=True)

In [6]:
# Count the rows that repeat an earlier row in every column, report the
# number, then remove them (the first occurrence of each row is kept).
repeated_rows = df.duplicated()
duplicates = repeated_rows.sum()
print(f'Number of complete duplicates: {duplicates}')
df.drop_duplicates(inplace=True)

Number of complete duplicates: 3


In [7]:
# Handle missing values in the categorical session attributes.
r_d_c = ['region', 'device', 'channel']

# Borrow a missing value from another session of the same user.
# ffill() alone only copies from rows that appear EARLIER in the frame, so a
# gap in a user's first row stayed empty even when a later row of that user
# carried the value; the extra bfill() covers that case.
df[r_d_c] = df.groupby('user_id')[r_d_c].transform(lambda x: x.ffill().bfill())

# Whatever is still missing (no session of that user has the value) falls
# back to the most frequent value of the column.
df.fillna({col: df[col].mode()[0] for col in r_d_c}, inplace=True)

### `добавить столбец с итоговой суммой покупки с учетом применения промокода на скидку 10%`

In [8]:
# Final purchase amount: 10% off when promo_code == 1, full revenue otherwise.
# Rows without an order keep NaN because revenue is NaN there.
price_multiplier = np.where(df['promo_code'] == 1, 0.9, 1.0)
df['final_price'] = df['revenue'] * price_multiplier

### `добавить столбец с указанием времени суток визита (утро 06:00-09:59, день 10:00-16:59, вечер 17:00-21:59, ночь 22:00-05:59)`

In [9]:
def get_time_of_day(hour):
    """Return the part-of-day label for an hour value.

    Bands are half-open on the right: 6-10 'morning', 10-17 'day',
    17-22 'evening'. Any value outside those bands is 'night'.
    """
    bands = (
        (6, 10, 'morning'),
        (10, 17, 'day'),
        (17, 22, 'evening'),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    return 'night'

# Label every session by the hour at which it started.
session_start_hour = df['session_start'].dt.hour
df['time_of_day'] = session_start_hour.map(get_time_of_day)

### `добавить столбец “payer” с информацией о том, является ли пользователь платящим или нет.`

In [11]:
# A session counts as paying when it has a revenue value. The flag must be
# computed before the missing revenues are replaced with 0 below.
has_revenue = df['revenue'].notna()
df['payer'] = has_revenue.astype(int)

# Non-paying sessions get revenue 0. Note that astype(int) drops any
# fractional part of the remaining values.
revenue_filled = df['revenue'].fillna(0)
df['revenue'] = revenue_filled.astype(int)

In [12]:
# Write the cleaned frame, including the added columns, to disk.
# index=False keeps the row index out of the file.
df.to_csv('ecom_go.csv', index=False)