In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import sys

In [None]:
# Load the four raw tables: train/test transactions and train/test identity.
# The files are read in the same order as the names on the left-hand side.
ttc, tic, tstc, tstic = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        'train_transaction.csv',
        'train_identity.csv',
        'test_transaction.csv',
        'test_identity.csv',
    )
)

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int8/16/32/64 or float16/32/64 are
    considered; every other column is left untouched. The frame is modified
    in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be shrunk.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits in
        float16 are cast to float16. float16 keeps only ~3 significant decimal
        digits, so values are rounded; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numv = {'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32, np.float64) if use_float16 else (np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        vt = df[col].dtypes
        if str(vt) not in numv:
            continue
        # min()/max() skip NaN; for an empty or all-NaN column both are NaN,
        # every comparison below is False and the column keeps its dtype.
        max_c = df[col].max()
        min_c = df[col].min()
        if str(vt)[:3] == 'int':
            targets, limits_of = int_targets, np.iinfo
        else:
            targets, limits_of = float_targets, np.finfo
        for target in targets:
            limits = limits_of(target)
            # Inclusive bounds: a column that exactly reaches a dtype's limit
            # (e.g. max == 127) still fits in that dtype.
            if min_c >= limits.min and max_c <= limits.max:
                df[col] = df[col].astype(target)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard against a zero starting footprint to avoid dividing by zero.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink the numeric dtypes of all four raw tables, in the same order as before.
ttc, tic, tstc, tstic = map(reduce_mem_usage, (ttc, tic, tstc, tstic))

In [None]:
# Left-join identity data onto transactions so every transaction row is kept,
# whether or not it has a matching identity record.
train = ttc.merge(tic, how='left', on='TransactionID')
test = tstc.merge(tstic, how='left', on='TransactionID')

In [None]:
# Drop the names of the four raw tables now that they are merged into train/test.
del ttc, tic, tstc, tstic

In [None]:
# Per-column count of nulls in train (sorted, largest first) and the same as a
# percentage of rows. Note: the field keyed 'Columns' holds the null COUNTS;
# the column names themselves are the index of the resulting frame.
miss_data = train.isnull().sum().sort_values(ascending=False)
miss_per = miss_data / len(train) * 100
missing_data = pd.concat(
    [miss_data, miss_per],
    axis=1,
    keys=['Columns', 'Missing values percentage'],
)

In [None]:
def delnullcol(dt):
    """Return the names of the columns of ``dt`` in which at least 90% of rows are null."""
    null_share = dt.isnull().sum() / dt.shape[0]
    return null_share[null_share >= 0.9].index.tolist()

In [None]:
# rep_vals: columns of train whose most frequent value (NaN counted as a value)
# covers at least 90% of rows. cols: the raw count of that value per such column.
rep_vals = []
for name in train.columns:
    top_share = train[name].value_counts(dropna=False, normalize=True).values[0]
    if top_share >= 0.9:
        rep_vals.append(name)
cols = [train[name].value_counts(dropna=False).values[0] for name in rep_vals]

In [None]:
def repcols(dt):
    """Return the columns of ``dt`` dominated by a single value.

    A column qualifies when its most frequent value (NaN counted as a value)
    accounts for at least 90% of the rows.
    """
    dominated = []
    for name in dt.columns:
        shares = dt[name].value_counts(dropna=False, normalize=True)
        if shares.values[0] >= 0.9:
            dominated.append(name)
    return dominated

In [None]:
def useless_cols(dt, exep):
    """List the columns of ``dt`` that carry little information.

    A column is flagged when ``delnullcol`` or ``repcols`` reports it
    (at least 90% null, or at least 90% one repeated value). The counts of
    both groups are printed.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame to inspect.
    exep : hashable
        Column label that must never be returned (e.g. the target), whether
        or not it was flagged.

    Returns
    -------
    list
        Flagged column labels, in the order they appear in ``dt.columns``.
    """
    null_cols = delnullcol(dt)
    print("More than 90% null: " + str(len(null_cols)))
    too_many_repeated = repcols(dt)
    print("More than 90% repeated value: " + str(len(too_many_repeated)))
    flagged = set(null_cols) | set(too_many_repeated)
    # discard() instead of list.remove(): the protected column may legitimately
    # not be flagged, and that must not raise ValueError.
    flagged.discard(exep)
    # Rebuild from dt.columns so the result order is deterministic.
    return [name for name in dt.columns if name in flagged]

In [None]:
# Columns of train flagged by useless_cols, with the target 'isFraud' excluded.
cols_to_drop = useless_cols(train, 'isFraud')

In [None]:
def find_Major_values(dt, threshold):
    """Return the relative frequencies in ``dt`` that are at least ``threshold``.

    Frequencies come from ``value_counts(dropna=True, normalize=True)``, so
    NaN is ignored and the result follows that call's ordering. The returned
    list holds the frequencies themselves, not the values they belong to.
    """
    shares = dt.value_counts(dropna = True, normalize = True)
    return [share for share in shares.values if share >= threshold]

In [None]:
def find_Major_Devices(Major_values,dt):
    """Map relative frequencies back to the values of ``dt`` that have them.

    Parameters
    ----------
    Major_values : iterable of float
        Relative frequencies, as produced by ``find_Major_values``.
    dt : pandas.Series
        The series those frequencies were computed from.

    Returns
    -------
    list
        Each value of ``dt`` whose normalized frequency (NaN excluded) equals
        one of ``Major_values``, listed once, in the order first matched.
    """
    Major_Devices = []
    already_added = set()
    t = dt.value_counts(dropna = True, normalize = True)
    for wanted_share in Major_values:
        for label, share in t.items():
            # When several labels share one frequency, Major_values contains it
            # once per label; without this check every tied label would be
            # appended once per tie and its rows duplicated downstream.
            if share == wanted_share and label not in already_added:
                already_added.add(label)
                Major_Devices.append(label)
    return Major_Devices

In [None]:
def find_plot(Major_Devices,d,dt):
    """Collect the rows of ``d`` whose ``dt`` entry is one of ``Major_Devices``.

    Rows are gathered per device, in the order given, and stacked vertically.
    When ``Major_Devices`` is empty the integer sentinel 10 is returned instead
    of a frame; callers test for an int to detect that case.
    """
    if len(Major_Devices) == 0:
        return 10
    per_device = [d.loc[dt == device] for device in Major_Devices]
    return pd.concat(objs = per_device, axis = 0)

In [None]:
# Use seaborn's "whitegrid" style for the plots drawn after this cell.
sns.set(style = "whitegrid")

In [None]:
# Bar chart of the percentage of missing values per column of train.
# The column names are the index of missing_data (its 'Columns' field holds the
# missing COUNTS), so the index is used for x. missing_data is sorted by
# missingness, so relabelling ticks with train.columns would mislabel the bars.
plt.figure(figsize=(100,25))
p = sns.barplot(x = missing_data.index, y = missing_data['Missing values percentage'])
p.set(xlabel='Columns', ylabel='Missing values percentage')
# Rotate the labels so hundreds of column names stay legible.
p.tick_params(axis='x', rotation=90)
p

In [None]:
# One bar per column in rep_vals; bar height is the count of that column's
# most frequent value (taken from cols).
plt.figure(figsize=(100,25))
p2 = sns.barplot(x = rep_vals, y = cols)
p2.set_title("Columns with most repetetive data")
p2.set_xlabel('Columns')
p2.set_ylabel('Number of replitions')
p2

In [None]:
# Preview the first rows of the merged training frame.
train.head()

In [None]:
# Bar plot of TransactionAmt for each isFraud class.
amnt = sns.barplot(x = 'isFraud', y = 'TransactionAmt', data = train)
plt.title("Amount V Fraud")
amnt.set_xticklabels(['Not Fraud','Fraud'])
# The amount is on the y-axis; the x-axis is the fraud class.
amnt.set(xlabel='isFraud', ylabel='Transaction Amount')
amnt

In [None]:
# Count of transactions per ProductCD, split by isFraud.
# x is passed by keyword: in recent seaborn the first positional parameter is
# `data`, so a positional Series together with data= raises a TypeError.
sns.countplot(x='ProductCD', hue='isFraud', data=train)

In [None]:
# For card1..card6: plot counts of the values that make up at least 5% of the
# non-null entries, split by isFraud.
for i in range(1,7):
    feature = 'card'+str(i)
    mv1 = find_Major_values(train[feature], 0.05)
    md1 = find_Major_Devices(mv1, train[feature])
    plothis1 = find_plot(md1, train, train[feature])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is produced.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = feature, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of card number "+str(i))
    p4.set(xlabel='card data of card number '+str(i), ylabel='Count')

In [None]:
# For C1..C14: plot counts of the values that make up at least 5% of the
# non-null entries, split by isFraud.
for i in range(1, 15):
    feature = 'C' + str(i)
    mv1 = find_Major_values(train[feature], 0.05)
    md1 = find_Major_Devices(mv1, train[feature])
    plothis1 = find_plot(md1, train, train[feature])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is produced.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = feature, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of C" + str(i))
    p4.set(xlabel='C data of C' + str(i), ylabel='Count')

In [None]:
# For D1..D15: plot counts of the values that make up at least 5% of the
# non-null entries, split by isFraud.
for i in range(1, 16):
    feature = 'D' + str(i)
    mv1 = find_Major_values(train[feature], 0.05)
    md1 = find_Major_Devices(mv1, train[feature])
    plothis1 = find_plot(md1, train, train[feature])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is produced.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = feature, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of D" + str(i))
    p4.set(xlabel='D data of D' + str(i), ylabel='Count')

In [None]:
# For M1..M9: plot counts of the values that make up at least 5% of the
# non-null entries, split by isFraud.
for i in range(1, 10):
    feature = 'M' + str(i)
    mv1 = find_Major_values(train[feature], 0.05)
    md1 = find_Major_Devices(mv1, train[feature])
    plothis1 = find_plot(md1, train, train[feature])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is produced.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = feature, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of M" + str(i))
    p4.set(xlabel='M data of M' + str(i), ylabel='Count')

In [None]:
# For id_01..id_09: plot counts of the values that make up at least 5% of the
# non-null entries, split by isFraud.
for i in range(1,10):
    feature = 'id_0'+str(i)
    mv1 = find_Major_values(train[feature], 0.05)
    md1 = find_Major_Devices(mv1, train[feature])
    plothis1 = find_plot(md1, train, train[feature])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is produced.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = feature, hue = 'isFraud', data = plothis1)
    # Label with the real column name (id_01, not id_1) so the title and axis
    # label match the plotted column.
    plt.title("Data analysis of "+feature)
    p4.set(xlabel='id data of '+feature, ylabel='Count')

In [None]:
# For id_10..id_38: plot counts of the values that make up at least 5% of the
# non-null entries, split by isFraud.
for i in range(10,39):
    feature = 'id_'+str(i)
    mv1 = find_Major_values(train[feature], 0.05)
    md1 = find_Major_Devices(mv1, train[feature])
    plothis1 = find_plot(md1, train, train[feature])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is produced.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = feature, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of id_"+str(i))
    p4.set(xlabel='id data of id_'+str(i), ylabel='Count')

In [None]:
# Count of transactions per DeviceType, split by isFraud.
# x is passed by keyword: in recent seaborn the first positional parameter is
# `data`, so a positional Series together with data= raises a TypeError.
sns.countplot(x='DeviceType', hue='isFraud', data=train)

In [None]:
# Plot counts of the DeviceInfo values that make up at least 10% of the
# non-null entries, split by isFraud.
mv = find_Major_values(train['DeviceInfo'], 0.1)
md = find_Major_Devices(mv, train['DeviceInfo'])
plothis = find_plot(md, train, train['DeviceInfo'])

# find_plot returns an int sentinel when no value reaches the threshold;
# indexing that sentinel with a column name would fail, so guard as the other
# plotting cells do.
if isinstance(plothis, int):
    print("No DeviceInfo value reaches the 10% threshold; nothing to plot.")
else:
    p3 = sns.countplot(x = 'DeviceInfo', hue = 'isFraud', data = plothis)
    plt.title("Data analysis of majorly used devices")
    p3.set(xlabel='Devices', ylabel='Count')

In [None]:
# Remove the low-information columns from train.
# Note: this cell is not idempotent; re-running it raises KeyError because the
# columns are already gone.
train = train.drop(columns=cols_to_drop)

In [None]:
# Replace +inf with 999 in both frames, then log1p-transform TransactionAmt.
# Note: only positive infinity is replaced, and re-running this cell applies
# log1p a second time.
train, test = (frame.replace(np.inf, 999) for frame in (train, test))

for frame in (train, test):
    frame['TransactionAmt'] = np.log1p(frame['TransactionAmt'])

In [None]:
# Preview train again after the column drop and the TransactionAmt transform.
train.head()

In [None]:
# Split off the target, one-hot encode train with pd.get_dummies,
# then build the feature matrix with remaining NaNs filled with 0.
y_train = train['isFraud']
train = pd.get_dummies(train)
X_train = train.drop(columns='isFraud').fillna(0)

In [None]:
# Drop the name of the encoded frame; X_train and y_train are used from here on.
del train

In [None]:
# Min-max scale columns of X_train one at a time, refitting the scaler on each
# column. q counts the columns processed so far.
q = 0
scaler = MinMaxScaler()
for col in X_train.columns:
    # MinMaxScaler expects 2-D input, so the column becomes an (n_rows, 1) array.
    a = np.array(X_train[col])
    a = a.reshape(-1,1)
    X_train[col] = scaler.fit_transform(a)
    # The loop stops once q reaches 100, i.e. after the first 101 columns;
    # every later column is left unscaled.
    # NOTE(review): confirm the 101-column cutoff is intentional and not a
    # leftover debugging limit.
    if q >= 100:
        break
    else:
        q+=1
        continue

In [None]:
# Drop the temporary column array left over from the scaling loop.
del a

In [None]:
# NOTE(review): vectorizer is not used by any cell visible in this notebook.
vectorizer = CountVectorizer()
# sklearn.externals.joblib was removed from scikit-learn; prefer the standalone
# joblib package and fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the feature matrix. The original dumped an undefined name `x`, which
# raises NameError on a fresh kernel; X_train is the matrix built above.
joblib.dump(X_train, 'dataset.joblib')

In [None]:
# Persist the target series next to the feature file.
joblib.dump(y_train, 'datasety.joblib')

In [None]:
# Drop the in-memory names; both objects were written to disk above.
del X_train, y_train

In [None]:
# Reload the features written earlier. joblib.load unpickles the file, so only
# load files this notebook produced itself.
x_train = joblib.load('dataset.joblib')

In [None]:
# Reload the target written earlier (same unpickling caveat as the features).
y_train = joblib.load('datasety.joblib')

In [None]:
# Fit a seeded logistic regression; the target is truncated to the number of
# rows in x_train so both have the same length.
lr = LogisticRegression(random_state=42, solver="liblinear")
lr.fit(
    X=x_train,
    y=y_train[:np.shape(x_train)[0]],
)

In [None]:
# 3-fold cross-validation of the logistic regression; the cell shows the mean
# fold score.
score = cross_val_score(
    estimator=lr,
    X=x_train,
    y=y_train[:np.shape(x_train)[0]],
    cv=3,
    verbose=3,
)
score.mean()

In [None]:
# Fit a random forest of 1000 trees, each split considering 33% of the features,
# without bootstrap resampling. random_state is fixed (same seed as the logistic
# regression) so the feature subsampling, and therefore the results, are
# reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000, bootstrap=False, max_features=0.33, n_jobs=4, random_state=42)
rf.fit(x_train, y_train[:np.shape(x_train)[0]])

In [None]:
# 3-fold cross-validation of the random forest. The original passed `lr` here,
# which re-scored the logistic regression instead of the forest fitted above.
score = cross_val_score(rf, x_train, y_train[:np.shape(x_train)[0]], cv=3, verbose=3)
score.mean()