In [32]:
# Standard library
import os

# Data Manipulation
import numpy as numpy
import pandas as pd

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

In [33]:
# Notebook-wide pandas display settings: no truncation of columns or rows,
# a 500-character console width with frames kept on one line, and floats
# rendered with three decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 500
pd.options.display.expand_frame_repr = False
pd.options.display.float_format = '{:.3f}'.format

In [34]:
# Location of the training CSV. Defaults to the original local path; set the
# TRAIN_CSV_PATH environment variable to point the notebook at another copy
# of the file without editing the code.
TRAIN_CSV_PATH = os.environ.get("TRAIN_CSV_PATH", "C:/Users/MONSTER/Desktop/train.csv")
data = pd.read_csv(TRAIN_CSV_PATH)

def get_season(month):
    """Map a calendar month number to its quarter label.

    Despite the name, the returned label is a calendar quarter, not a
    meteorological season.

    Parameters
    ----------
    month : int
        Month number, 1 (January) through 12 (December).

    Returns
    -------
    str
        'Q1' for months 1-3, 'Q2' for 4-6, 'Q3' for 7-9, 'Q4' for 10-12.

    Raises
    ------
    ValueError
        If ``month`` is not a whole number in 1..12 (e.g. 0, 13, NaN, None).
        Such values used to be silently labelled 'Q4'.
    """
    # Reject anything outside 1..12 instead of letting it fall through to Q4.
    if month not in range(1, 13):
        raise ValueError(f"month must be an integer in 1..12, got {month!r}")
    # int() normalizes numpy integers / integral floats before the arithmetic,
    # so the label is always 'Q1'..'Q4' and never e.g. 'Q1.0'.
    quarter = (int(month) - 1) // 3 + 1
    return f"Q{quarter}"

# Split the month key (parsed below with format '%Y%m') into calendar
# features, then convert the key itself to a timestamp.
month_key = data['month_id'].astype(str)
data = data.assign(
    year=month_key.str[:4].astype(int),
    month=month_key.str[4:].astype(int),
    season=lambda frame: frame['month'].apply(get_season),
    month_id=pd.to_datetime(month_key, format='%Y%m'),
)

# Fix the column layout and order the rows chronologically with a fresh index.
column_order = [
    'month_id', 'merchant_id', 'year', 'month', 'season',
    'merchant_source_name', 'settlement_period', 'working_type',
    'mcc_id', 'merchant_segment', 'net_payment_count',
]
data = data[column_order].sort_values(by='month_id').reset_index(drop=True)

In [35]:
# Preview the first ten rows of the reordered, date-sorted frame.
data.head(n=10)

Unnamed: 0,month_id,merchant_id,year,month,season,merchant_source_name,settlement_period,working_type,mcc_id,merchant_segment,net_payment_count
0,2020-01-01,merchant_66740,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 6,mcc_130,Segment - 4,6
1,2020-01-01,merchant_12444,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 2,mcc_153,Segment - 4,3
2,2020-01-01,merchant_40154,2020,1,Q1,Merchant Source - 2,Settlement Period - 1,Working Type - 5,mcc_168,Segment - 4,3
3,2020-01-01,merchant_33179,2020,1,Q1,Merchant Source - 3,Settlement Period - 3,Working Type - 2,mcc_25,Segment - 2,787
4,2020-01-01,merchant_16977,2020,1,Q1,Merchant Source - 2,Settlement Period - 1,Working Type - 6,mcc_31,Segment - 4,5
5,2020-01-01,merchant_25991,2020,1,Q1,Merchant Source - 2,Settlement Period - 1,Working Type - 6,mcc_110,Segment - 4,8
6,2020-01-01,merchant_92,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 6,mcc_73,Segment - 4,3
7,2020-01-01,merchant_13117,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 6,mcc_126,Segment - 4,9
8,2020-01-01,merchant_43413,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 6,mcc_42,Segment - 4,14
9,2020-01-01,merchant_26670,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 2,mcc_166,Segment - 4,3


In [36]:
# Per (year, season) summary of the target plus the number of non-null
# merchant_id rows in each group.
(
    data
    .groupby(['year', 'season'])
    .agg({
        'net_payment_count': ['sum', 'mean', 'median', 'min', 'max'],
        'merchant_id': 'count',
    })
)

Unnamed: 0_level_0,Unnamed: 1_level_0,net_payment_count,net_payment_count,net_payment_count,net_payment_count,net_payment_count,merchant_id
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,median,min,max,count
year,season,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2020,Q1,2319586,200.535,5.0,-65,130080,11567
2020,Q2,3565948,259.398,6.0,-11,465672,13747
2020,Q3,4868705,310.049,6.0,0,626546,15703
2020,Q4,6506261,363.62,6.0,-4,953154,17893
2021,Q1,7028182,352.696,6.0,-5,984251,19927
2021,Q2,7973687,381.024,6.0,-8,1160429,20927
2021,Q3,7585057,373.98,5.0,-4,737260,20282
2021,Q4,9856638,457.172,6.0,-62,897476,21560
2022,Q1,7808273,361.394,5.0,-830,717256,21606
2022,Q2,9618174,450.373,5.0,-312,585755,21356


In [37]:
# Recode categorical columns and clean the target. Every statement in this
# cell is safe to re-run: a second execution leaves `data` unchanged.

# Segment column: pool Segments 1-3 into one bucket and relabel Segment 4.
data.loc[data['merchant_segment'].isin(["Segment - 1", "Segment - 2", "Segment - 3"]), 'merchant_segment'] = "otherSegments123"
data.loc[data['merchant_segment'] == "Segment - 4", 'merchant_segment'] = "StandartSegment4"
# Negative values in target: floor them at zero.
data['net_payment_count'] = data['net_payment_count'].clip(lower=0)
# mcc_id to integer: strip the literal 'mcc_' prefix. Casting to str first
# keeps the line working when mcc_id is already numeric (cell re-run), where
# the `.str` accessor would otherwise raise AttributeError; regex=False makes
# the literal match explicit across pandas versions.
data['mcc_id'] = data['mcc_id'].astype(str).str.replace('mcc_', '', regex=False).astype(int)
# Working type: pool types 1, 3 and 4 into one bucket.
data.loc[data['working_type'].isin(["Working Type - 1", "Working Type - 3", "Working Type - 4"]), 'working_type'] = "otherWorkingTypes134"
# Settlement period: pool periods 2 and 3 into one bucket.
data.loc[data['settlement_period'].isin(["Settlement Period - 2", "Settlement Period - 3"]), 'settlement_period'] = "otherSettlementPeriods23"


In [38]:
# Preview the first five rows after the recoding step.
data.head(n=5)

Unnamed: 0,month_id,merchant_id,year,month,season,merchant_source_name,settlement_period,working_type,mcc_id,merchant_segment,net_payment_count
0,2020-01-01,merchant_66740,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 6,130,StandartSegment4,6
1,2020-01-01,merchant_12444,2020,1,Q1,Merchant Source - 1,Settlement Period - 1,Working Type - 2,153,StandartSegment4,3
2,2020-01-01,merchant_40154,2020,1,Q1,Merchant Source - 2,Settlement Period - 1,Working Type - 5,168,StandartSegment4,3
3,2020-01-01,merchant_33179,2020,1,Q1,Merchant Source - 3,otherSettlementPeriods23,Working Type - 2,25,otherSegments123,787
4,2020-01-01,merchant_16977,2020,1,Q1,Merchant Source - 2,Settlement Period - 1,Working Type - 6,31,StandartSegment4,5


In [43]:
# Row counts per settlement_period category.
data.loc[:, "settlement_period"].value_counts()

Settlement Period - 1       268906
otherSettlementPeriods23     22236
Name: settlement_period, dtype: int64