In [59]:
import numpy as np, pandas as pd, os, gc, seaborn as sns
from sklearn import preprocessing
import warnings
warnings.simplefilter('ignore')
from sklearn.linear_model import LogisticRegression

In [60]:
os.listdir('../data/')

['test_transaction.csv',
 'train_identity.csv',
 'test_identity.csv',
 'sample_submission.csv',
 'train_transaction.csv']

In [None]:
%%time
train_identity = pd.read_csv("../data/train_identity.csv")
train_transaction = pd.read_csv("../data/train_transaction.csv")
test_transaction = pd.read_csv("../data/test_transaction.csv")
test_identity = pd.read_csv("../data/test_identity.csv")

In [None]:
print('Dimensions of the Train Identity set:',train_identity.shape)
print('Dimensions of the Train transaction set:',train_transaction.shape)
print('Dimensions of the Test transaction set:',test_transaction.shape)
print('Dimensions of the Test Identity set:',test_identity.shape)

In [None]:
train_identity.head()

In [None]:
train_transaction.head()

In [None]:
%%time
# Merge both the transaction and identity by left
train_data=pd.merge(train_transaction,train_identity,how="left",on="TransactionID")
test_data=pd.merge(test_transaction,test_identity,how="left",on="TransactionID")
dicti={}
for i in range(1,10):
    dicti.update({'id-0'+str(i):'id_0'+str(i)})
for i in range(10,39):
    dicti.update({'id-'+str(i):'id_'+str(i)})
test_data=test_data.rename(columns=dicti)
del dicti

In [None]:
# Print Shapes
print("Train Dataset shape: ", train_data.shape)
print("Test Dataset shape: ", test_data.shape)

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
del train_identity
del train_transaction
del test_transaction
del test_identity

In [None]:
%%time
# Explore Categorical features
print('Training set:')
l1=[];l2=[];
for _ in train_data.columns:
    if train_data[_].dtypes == 'object' :
        value = len(train_data[_].unique())
        l1.append(_)
        l2.append(value)
frame=pd.DataFrame(np.column_stack((np.array(l1),np.array(l2))),columns=['Column_Name','Category_Count'])
frame

In [None]:
%%time 
print('Test set:')
l1=[];l2=[];
for _ in test_data.columns:
    if test_data[_].dtypes == 'object' :
        value = len(test_data[_].unique())
        l1.append(_)
        l2.append(value)
frame=pd.DataFrame(np.column_stack((np.array(l1),np.array(l2))),columns=['Column_Name','Category_Count'])
del l1,l2
frame

In [None]:
perc_fraud = len(train_data[train_data['isFraud']==1].index)*100/train_data.shape[0]
print("Percentage of Fradaulent records in dataset {:.2f}".format(perc_fraud) )

In [None]:
# Check missing data - Many Columns have more than 50% NA/Null records
def missing_data(df) :
    count = df.isnull().sum()
    percent = (df.isnull().sum()) / (df.isnull().count()) * 100
    total = pd.concat([count, percent], axis=1, keys = ['Count', 'Percent'])
    types = []
    for col in df.columns :
        dtypes = str(df[col].dtype)
        types.append(dtypes)
    total['dtypes'] = types
    
    return np.transpose(total)

In [None]:
missing_data(train_data)

In [None]:
missing_data(test_data)

In [None]:
percentage=70

In [None]:
#Here I have dropped columns who has NULL values more than given percentage
null_percent = train_data.isnull().sum()/train_data.shape[0]*100

cols_to_drop = np.array(null_percent[null_percent > percentage].index)

cols_to_drop

In [None]:
# Dropping Columns
train_data = train_data.drop(cols_to_drop, axis=1)
test_data = test_data.drop(cols_to_drop,axis=1)

In [None]:
print(train_data.shape)
print(test_data.shape)

In [None]:
### Fill NaNs

train_data = train_data.fillna(train_data._get_numeric_data().mean())
test_data = test_data.fillna(test_data._get_numeric_data().mean())

In [54]:
train_data = train_data.fillna(1)
test_data = test_data.fillna(0)

In [55]:
train_y = train_data['isFraud']
train_X = train_data.drop('isFraud', axis=1)

In [56]:
# Label Encoding for categorical variables.
for _ in train_X.columns:
    if train_X[_].dtype=='object' or test_data[_].dtype=='object': 
        le = preprocessing.LabelEncoder()
        le.fit(list(train_X[_].values) + list(test_data[_].values))
        train_X[_] = le.transform(list(train_X[_].values))
        test_data[_] = le.transform(list(test_data[_].values))

# Change cell from markdown to code Run only if memory issue arises 
import ray
ray.shutdown()
ray.init(memory=10737418240, object_store_memory=10737418240)

In [57]:
logreg = LogisticRegression()
logreg.fit(train_X, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [58]:
submission = pd.read_csv('../data/sample_submission.csv',index_col='TransactionID')
submission['isFraud'] = logreg.predict_proba(test_data)[:,1]
submission.to_csv('Logreg_submissionp.csv')
submission.head()

Unnamed: 0_level_0,isFraud
TransactionID,Unnamed: 1_level_1
3663549,0.042166
3663550,0.042167
3663551,0.042168
3663552,0.042166
3663553,0.042165
