In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns



import datetime

from sklearn.preprocessing import LabelEncoder


warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

In [None]:
# Read the two training tables (identity and transaction) in one statement.
train_iden, train_trans = (
    pd.read_csv("../input/ieee-fraud-detection/train_identity.csv"),
    pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv"),
)


In [None]:
# Each variable must hold the table its name says, mirroring the train cell:
# the identity file goes into test_iden and the transaction file into
# test_trans. The later left merge starts from test_trans, so loading the
# files the other way round anchors the join on the wrong table.
test_iden = pd.read_csv("../input/ieee-fraud-detection/test_identity.csv")
test_trans = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")

# NOTE(review): the test identity file is believed to spell its columns
# `id-XX` where the train file uses `id_XX` -- confirm against the data.
# Replacing hyphens is a no-op when none are present and keeps the train and
# test column names aligned for the column-wise steps further down.
test_iden.columns = test_iden.columns.str.replace("-", "_", regex=False)

In [None]:
# Left-join the identity columns onto the transaction rows, keyed by TransactionID.
train_df = train_trans.merge(train_iden, how="left", on="TransactionID")
test_df = test_trans.merge(test_iden, how="left", on="TransactionID")

# EDA 

In [None]:
def missing_data(data):
    """Summarise missing values per column of ``data``.

    Returns a transposed frame with one column per input column and three
    rows: ``Total`` (number of nulls), ``Percent`` (nulls as a percentage of
    the row count) and ``Types`` (the column dtype as a string).
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    pct_missing = n_missing / null_mask.count() * 100
    summary = pd.concat([n_missing, pct_missing], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

In [None]:
# Missing-value summary for the merged training frame; the bare name on the
# last line renders it as the cell output.
missing_train = missing_data(data=train_df)
missing_train

In [None]:
# Missing-value summary for the merged test frame. Display missing_test itself
# so the cell output matches the frame computed on the line above.
missing_test = missing_data(test_df)
missing_test

In [None]:
# Pass the column through the `x` keyword: recent seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the
# variable to count.
sns.countplot(x=train_df['isFraud'])
print("Count Plot for Fraudulent vs legitimate transaction") ## Imbalanced Dataset
# Class balance as percentages (rendered as the cell output).
train_df['isFraud'].value_counts(normalize=True) * 100

In [None]:
startdate = datetime.datetime.strptime("2017-12-01","%Y-%m-%d")

# TransactionDT is treated as an offset in seconds from `startdate`. Convert
# that column in BOTH frames; the key column TransactionID must stay
# untouched. The arithmetic is vectorised instead of a per-row apply.
for trans in (train_trans, test_trans):
    # Skip a frame that lacks the column, and a column that is already
    # datetime, so that re-running this cell is harmless.
    if "TransactionDT" not in trans.columns:
        continue
    if pd.api.types.is_datetime64_any_dtype(trans["TransactionDT"]):
        continue
    trans["TransactionDT"] = startdate + pd.to_timedelta(trans["TransactionDT"], unit="s")


In [None]:
fig, axes = plt.subplots(1, 1, figsize=(16, 6))
# Select isFraud *before* aggregating: only that column is averaged, which
# avoids computing daily means for every other column and avoids errors from
# non-numeric columns on pandas versions where mean() no longer skips them.
daily_fraud_mean = train_trans.set_index("TransactionDT")["isFraud"].resample("D").mean()
daily_fraud_mean.plot(ax=axes).set_ylabel("isFraud mean", fontsize=14)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
axes.set_title("Mean Fraud Transactions per day", fontsize=16);

In [None]:
fig, ax1 = plt.subplots(figsize=(16, 6))
# Left axis: daily mean of isFraud. The column is selected before resampling
# so only it is aggregated (cheaper, and safe when other columns are
# non-numeric).
train_trans.set_index('TransactionDT')['isFraud'].resample('D').mean().plot(ax=ax1, color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.set_ylabel('isFraud mean', color='blue', fontsize=14)

# Right axis (shares the x axis): number of training rows per calendar day.
ax2 = ax1.twinx()
train_trans["TransactionDT"].dt.floor("D").value_counts().sort_index().plot(ax=ax2, color="tab:orange")
ax2.tick_params(axis="y", labelcolor="tab:orange")
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax2.set_ylabel("Number of training examples", color="tab:orange", fontsize=14);

In [None]:
# Let's reduce our RAM usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the first of int8/int16/int32/int64 whose range contains the
    column's min and max (bounds inclusive). Float columns are cast to
    float16 or float32 when their range fits, otherwise to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The float check looks only at the value range, not at precision, so a
    float16/float32 downcast can round values (float16 keeps roughly three
    significant decimal digits).
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits is used.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: the dtype limits themselves are representable,
            # so e.g. a column spanning 0..127 fits int8.
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,  # min/max not comparable (e.g. empty column): keep dtype
            )
        else:
            # NaN min/max (all-missing column) fails every comparison and so
            # falls through to float64.
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink both merged frames with reduce_mem_usage (train first, then test).
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

In [None]:
# The raw per-file frames are not referenced below; unbind their names.
del train_iden, train_trans, test_trans, test_iden

In [None]:
# Percentage of missing values per training column.
n_rows = train_df.shape[0]
null_percent = train_df.isnull().sum() / n_rows * 100

# Columns that are more than 50% missing are marked for removal.
mostly_missing = null_percent > 50
cols_to_drop = np.array(null_percent[mostly_missing].index)

cols_to_drop

In [None]:
# Remove the columns listed in cols_to_drop from both frames.
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

In [None]:
# Separate the label column from the feature columns.
target_col = 'isFraud'
train_y = train_df[target_col]
train_X = train_df.drop(columns=[target_col])

In [None]:
cols = train_X.columns
# Public select_dtypes replaces the private DataFrame._get_numeric_data();
# 'bool' is listed explicitly because 'number' alone does not cover it.
num_cols = train_X.select_dtypes(include=['number', 'bool']).columns
# Keep the frame's own column order so the list (and its printout) is the
# same on every run; a set difference has no stable order.
cat_cols = [col for col in cols if col not in num_cols]
print("Numeric Columns:",num_cols)
print("Categoric Columns:",cat_cols)



In [None]:
# Fill gaps the same way in both frames: the string 'empty' for categorical
# columns and the sentinel -999 for numeric columns.
for frame in (train_X, test_df):
    frame[cat_cols] = frame[cat_cols].fillna('empty')
    frame[num_cols] = frame[num_cols].fillna(-999)
  

    

In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

In [None]:
# Label-encode every column that is object-typed in either frame. The encoder
# is fitted on the train and test values together so both frames share one
# mapping.
for col in train_X.columns:
    if train_X[col].dtype != 'object' and test_df[col].dtype != 'object':
        continue
    train_values = list(train_X[col].values)
    test_values = list(test_df[col].values)
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)
    train_X[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)

In [None]:
# Fit a logistic regression with default settings; fit() returns the estimator
# itself, which is then shown as the cell output.
logreg = LogisticRegression().fit(train_X, train_y)
logreg

In [None]:
# Score the test rows using exactly the training feature columns, in training
# order, so the prediction does not depend on test_df's own column layout
# (a missing feature raises a KeyError here instead of failing inside sklearn).
test_features = test_df[train_X.columns]
submission = pd.DataFrame({
    "TransactionID": test_df["TransactionID"],
    # Column 1 of predict_proba is the second entry of logreg.classes_.
    "isFraud": logreg.predict_proba(test_features)[:, 1],
})


In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv(path_or_buf="mysubmission.csv", index=False)

In [None]:
# Preview the first rows of the submission. The bare name `sample` is not
# defined anywhere in this notebook and raises NameError on a fresh run.
submission.head()