In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive
import os
import statistics
from scipy.stats import levene, shapiro, pointbiserialr, kruskal, ttest_ind, mannwhitneyu
from datetime import datetime
!pip install pingouin
import pingouin as pg




In [2]:
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [3]:
os.chdir('drive/My Drive')

In [5]:
def import_Train():
  Train = pd.read_csv('Train.csv')
  Train['Year'] = Train['transaction_val_dt'].apply(lambda x:int(str(x).split('-')[0]))
  Train['Month'] = Train['transaction_val_dt'].apply(lambda x:int(str(x).split('-')[1]))
  Train['Day'] = Train['transaction_val_dt'].apply(lambda x:int((str(x).split('-')[2].split(' ')[0])))
  return Train

In [6]:
def reindex_train(Train):
  Train = Train.astype({"rem_act_no_dummy": str, "bene_act_no_dummy": str})
  Train['Index'] = Train['rem_act_no_dummy'].str.cat(Train['bene_act_no_dummy'],sep=", ")
  Train = Train.astype({"rem_act_no_dummy": int, "bene_act_no_dummy": int})
  Train = Train.drop(['yearmonth'], axis = 1)
  Train = Train.set_index('Index')
  return Train

In [7]:
def create_train(Train):
  train = Train[~Train.index.duplicated(keep='first')]
  train['ID'] = train.index
  return train

In [8]:
def create_txn_type_num(train, Train):
  Unique_txn_type = Train.groupby('Index').txn_type.unique()
  temp = pd.DataFrame(Unique_txn_type)
  temp['length'] = temp['txn_type'].apply(lambda x:len(x))
  multiple_txn_type = temp[temp.length>1]
  train.replace({'DOMESTIC':1,'CROSS BORDER':0},inplace=True)
  ratios = []
  multiple_index = multiple_txn_type.index
  for i in multiple_index:
    num = 0
    den = 0
    # print(Train.loc[i,'txn_type'].value_counts())
    ratio = Train.loc[i,'txn_type'].value_counts()[0]/(Train.loc[i,'txn_type'].value_counts()[1]+Train.loc[i,'txn_type'].value_counts()[0])
    ratios.append(ratio)
    train.loc[i,'txn_type'] = ratio
  return train

In [10]:
def create_rem_company_ind(train,Train):
  company_ind = pd.DataFrame(Train.groupby('Index').rem_company_ind.unique())
  company_ind.iloc[:,0] = company_ind.iloc[:,0].apply(lambda x:np.sort(x)[0])
  train['rem_company_ind'] = train.ID.apply(lambda x: company_ind.loc[x,'rem_company_ind'])
  return train

In [11]:
def create_avg_txn_amt(train, Train):
  mean = Train.groupby('Index').mean()
  train['Average_txn_amount'] = train.ID.apply(lambda x: mean.loc[x,'txn_amt'])
  return train

In [13]:
def create_var_txn_amount(train,Train):
  var = Train.groupby('Index').var(ddof=0)
  train['var_txn_amount'] = train.ID.apply(lambda x: var.loc[x,'txn_amt'])
  return train

In [14]:
def create_transaction_by_remitter(train,Train):
  rem_count = Train.groupby('rem_act_no_dummy').count()
  train['transaction/remitter'] = train['ID'].apply(lambda x:(train.loc[x,'No_of_transactions']/rem_count.loc[int(x.split(',')[0]),'txn_amt']))
  return train

In [15]:
def create_no_of_txn(train,Train):
  count = Train.groupby('Index').count()
  train['No_of_transactions'] = train.ID.apply(lambda x: count.loc[x,'txn_amt'])
  return train

In [16]:
def create_diff_pair_remitter(train,Train):
  rem_amount = pd.DataFrame(Train.groupby('rem_act_no_dummy').txn_amt.mean())
  train['Diff_pair_remitter'] = train['ID'].apply(lambda x:(train.loc[x,'Average_txn_amount'] - rem_amount.loc[int(x.split(',')[0]),'txn_amt']))
  return train

In [17]:
def create_diff_pair_remitter_var(train,Train):
  # IMP: Need a column - 'var_txn_amount' - var of txn amount of rem ben pair
  rem_var = pd.DataFrame(Train.groupby('rem_act_no_dummy').txn_amt.var(ddof=0))
  rem_var.replace(np.nan,0,inplace = True)
  train['Diff_pair_remitter_var'] = train['ID'].apply(lambda x:(train.loc[x,'var_txn_amount'] - rem_var.loc[int(x.split(',')[0]),'txn_amt']))
  train['Ratio_pair_remitter_var'] = train['ID'].apply(lambda x:(train.loc[x,'var_txn_amount']/rem_var.loc[int(x.split(',')[0]),'txn_amt']))
  return train

In [18]:
def drop_wrong_columns(train):
  train.drop(['txn_amt','transaction_val_dt'],axis=1,inplace=True)
  return train

In [19]:
Train = import_Train()

In [20]:
Train = reindex_train(Train)

In [22]:
train = create_train(Train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
train.head()

Unnamed: 0_level_0,txn_refr_key_dummy,rem_company_id_dummy,rem_act_no_dummy,bene_act_no_dummy,txn_amt,transaction_val_dt,txn_type,rem_company_ind,payroll_ind,Year,Month,Day,ID
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"5000000, 8000000",1,1000000000,5000000,8000000,1.45873,2020-09-17,CROSS BORDER,Consumer,0,2020,9,17,"5000000, 8000000"
"5000200, 8000200",4,1000000020,5000200,8000200,1.269716,2020-09-30,CROSS BORDER,Communications,0,2020,9,30,"5000200, 8000200"
"5000300, 8000300",5,1000000030,5000300,8000300,1.190682,2020-09-22,CROSS BORDER,Industrials,0,2020,9,22,"5000300, 8000300"
"5000500, 8000500",7,1000000050,5000500,8000500,5.657271,2020-09-29,CROSS BORDER,Technology,0,2020,9,29,"5000500, 8000500"
"5000200, 8000600",8,1000000020,5000200,8000600,1.239628,2020-09-29,CROSS BORDER,Communications,0,2020,9,29,"5000200, 8000600"


In [24]:
train = create_txn_type_num(train,Train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [25]:
train.txn_type.value_counts()

0.000000    194545
1.000000    188188
0.500000        18
0.750000        10
0.666667         9
0.857143         5
0.833333         5
0.800000         4
0.875000         3
0.600000         2
0.909091         2
0.777778         1
0.901961         1
0.941176         1
0.615385         1
Name: txn_type, dtype: int64

In [26]:
train = create_rem_company_ind(train,Train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [27]:
train = create_avg_txn_amt(train,Train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
train = create_no_of_txn(train,Train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
train = create_transaction_by_remitter(train,Train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
train = create_diff_pair_remitter(train,Train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
train = create_var_txn_amount(train,Train)

In [None]:
train = create_diff_pair_remitter_var(train,Train)

In [None]:
train = drop_wrong_columns(train)

In [None]:
train.head()

In [None]:
train.to_csv('Ashwin.csv')

In [None]:
from google.colab import files
files.download('Ashwin.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>