# Imports and loading the dataset

In [0]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [0]:
# Relative path of the transaction workbook (resolved against the working directory).
DATASET_PATH = "ANZ synthesised transaction dataset.xlsx"

# Transaction-level table; later cells read it as `df`.
df = pd.read_excel(DATASET_PATH)

### Modifying data to obtain salaries for each customer

In [3]:
# Keep only the salary credits, then average every numeric column per customer.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# asked to average the text columns instead of silently dropping them.
salary_txns = df[df["txn_description"] == "PAY/SALARY"]
df_salaries = salary_txns.groupby("customer_id").mean(numeric_only=True)
df_salaries.head()

Unnamed: 0_level_0,card_present_flag,merchant_code,balance,age,amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CUS-1005756958,,0.0,4718.665385,53,970.47
CUS-1117979751,,0.0,11957.202857,21,3578.65
CUS-1140341822,,0.0,5841.72,28,1916.51
CUS-1147642491,,0.0,8813.467692,34,1711.39
CUS-1196156254,,0.0,23845.717143,34,3903.73


In [0]:
# Attach each customer's mean salary payment to every one of their transactions.
# A single vectorised lookup on customer_id replaces the per-row Python loop.
mean_salary_per_row = df["customer_id"].map(df_salaries["amount"])

# Customers without any PAY/SALARY rows have no entry in df_salaries; fail
# loudly with KeyError, as the row-by-row .loc lookup would have done.
unmatched = mean_salary_per_row.isna()
if unmatched.any():
    missing_ids = df.loc[unmatched, "customer_id"].unique()[:5].tolist()
    raise KeyError(f"no salary payments found for customers: {missing_ids}")

# Casting to int64 truncates the fractional part, matching int() on each value.
# NOTE(review): despite the column name, this is the mean amount of a single
# salary payment, not an annualised figure - confirm the intended definition.
df["annual_salary"] = mean_salary_per_row.astype("int64")

In [5]:
# Collapse the transaction table to one row per customer by averaging the
# numeric columns. annual_salary is constant within a customer, so its mean
# is that same value.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# asked to average the text columns instead of silently dropping them.
df_cus = df.groupby("customer_id").mean(numeric_only=True)
df_cus.head()

Unnamed: 0_level_0,card_present_flag,merchant_code,balance,age,amount,annual_salary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CUS-1005756958,0.8125,0.0,2275.852055,53,222.862603,970
CUS-1117979751,0.826923,0.0,9829.929,21,339.8437,3578
CUS-1140341822,0.815385,0.0,5699.21225,28,212.6325,1916
CUS-1147642491,0.75,0.0,9032.841186,34,245.600169,1711
CUS-1196156254,0.785276,0.0,22272.433755,34,147.145796,3903


# Predictive Analytics

### Linear Regression

In [0]:
# Positional 80/20 split of the per-customer table: the first 80% of rows
# (in df_cus index order) are used for training, the rest for evaluation.
N_train = int(len(df_cus) * 0.8)

# Every averaged column except the target acts as a predictor.
customer_features = df_cus.drop(columns="annual_salary")
customer_target = df_cus["annual_salary"]

X_train, X_test = customer_features.iloc[:N_train], customer_features.iloc[N_train:]
Y_train, Y_test = customer_target.iloc[:N_train], customer_target.iloc[N_train:]

In [0]:
# Linear regression estimator, constructed with scikit-learn's default settings.
linear_reg = LinearRegression()

In [8]:
# Fit on the training customers. fit() returns the estimator itself, so the
# training-set score is read off the same expression and shown as the output.
linear_reg.fit(X_train, Y_train).score(X_train, Y_train)

0.23295376366257825

In [9]:
# Predicted salary for each held-out customer, one value per row of X_test.
linear_reg.predict(X_test)

array([1993.98473311, 2867.39066481, 1944.95959591, 1806.85984885,
       2226.35045442, 2075.34697175, 1813.02987337, 5388.67435983,
       1902.35351608, 2191.90445145, 1713.48134178, 2854.40519949,
       2094.77781158, 3815.34342881, 2249.92922822, 1768.80816189,
       2095.02988288, 1515.18425875, 1782.72752537, 2481.2898546 ])

In [10]:
# R^2 on the held-out customers. A negative value means the model does worse
# than always predicting the mean of Y_test.
linear_reg.score(X_test, Y_test)

-0.31694234980747504

### Decision Tree - Classification and Regression

In [0]:
# Transaction-level predictors for the tree models: four text columns plus age.
tree_feature_columns = ["txn_description", "gender", "age", "merchant_state", "movement"]
df_cat = df[tree_feature_columns]

In [12]:
# Preview the one-hot encoding: each text column expands into one indicator
# column per category, while age is passed through unchanged.
pd.get_dummies(df_cat).head()

Unnamed: 0,age,txn_description_INTER BANK,txn_description_PAY/SALARY,txn_description_PAYMENT,txn_description_PHONE BANK,txn_description_POS,txn_description_SALES-POS,gender_F,gender_M,merchant_state_ACT,merchant_state_NSW,merchant_state_NT,merchant_state_QLD,merchant_state_SA,merchant_state_TAS,merchant_state_VIC,merchant_state_WA,movement_credit,movement_debit
0,26,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1
1,26,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1
2,38,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1
3,40,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1
4,26,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1


In [0]:
# Positional 80/20 split over individual transactions (rows of df).
N_train = int(len(df) * 0.8)

# Encode the full frame once so the train and test slices share one column set.
encoded_features = pd.get_dummies(df_cat)
salary_target = df["annual_salary"]

# NOTE(review): annual_salary is constant per customer and the split is by row,
# so the same customer can appear on both sides of the split - confirm whether
# a per-customer split was intended before trusting the test scores.
X_train, X_test = encoded_features.iloc[:N_train], encoded_features.iloc[N_train:]
Y_train, Y_test = salary_target.iloc[:N_train], salary_target.iloc[N_train:]

#### Classification

In [0]:
# Classification tree; when fitted on Y_train, every distinct integer salary
# value is treated as its own class label.
# random_state is fixed because the tree permutes features at each split and
# can break ties differently between runs; pinning it makes a fresh
# Restart & Run All reproduce the same fitted tree and scores.
decision_tree_class = DecisionTreeClassifier(random_state=0)

In [15]:
# Fit on the training transactions. fit() returns the estimator itself, so the
# training-set accuracy is read off the same expression and shown as the output.
decision_tree_class.fit(X_train, Y_train).score(X_train, Y_train)

0.7882499481004774

In [16]:
# Predicted salary class for each held-out transaction.
decision_tree_class.predict(X_test)

array([1013, 1043, 4132, ..., 4054, 1043,  996])

In [17]:
# Mean accuracy on the held-out transactions: the share of rows whose
# predicted salary value equals Y_test exactly.
decision_tree_class.score(X_test, Y_test)

0.755085097550851

#### Regression

In [0]:
# Regression tree that treats the salary as a continuous target.
# random_state is fixed because the tree permutes features at each split and
# can break ties differently between runs; pinning it makes a fresh
# Restart & Run All reproduce the same fitted tree and scores.
decision_tree_reg = DecisionTreeRegressor(random_state=0)

In [19]:
# Fit on the training transactions. fit() returns the estimator itself, so the
# training-set R^2 is read off the same expression and shown as the output.
decision_tree_reg.fit(X_train, Y_train).score(X_train, Y_train)

0.7468978726536879

In [20]:
# Predicted salary for each held-out transaction.
decision_tree_reg.predict(X_test)

array([1226.42857143, 1043.        , 4132.        , ..., 3345.04761905,
       1043.        , 1626.        ])

In [21]:
# R^2 of the regression tree on the held-out transactions.
decision_tree_reg.score(X_test, Y_test)

0.6730993096697413