<a href="https://colab.research.google.com/github/bornhansie/house-price/blob/master/Hannes_Bornman_EDSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
train = pd.read_csv('drive/My Drive/Zindi/training.csv')
test = pd.read_csv('drive/My Drive/Zindi/test.csv')
test_id = test['TransactionId']
y = train['FraudResult'].reset_index(drop=True)

In [0]:
train.drop(['CountryCode','BatchId', 'AccountId', 'SubscriptionId', 'ChannelId', 'TransactionId', 'CustomerId', 'ProviderId', 'ProductId', 'TransactionStartTime', 'FraudResult'], axis = 1, inplace = True)
test.drop(['CountryCode','BatchId', 'AccountId', 'SubscriptionId', 'ChannelId', 'TransactionId', 'CustomerId', 'ProviderId', 'ProductId', 'TransactionStartTime'], axis = 1, inplace = True)

In [68]:
features = pd.concat([train, test], sort=False).reset_index(drop=True)

from scipy.stats import skew
numeric_feats = features.dtypes[features.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)

Unnamed: 0,Skew
Value,54.047119
Amount,53.568064
PricingStrategy,1.636453


In [0]:
features = pd.get_dummies(features)

In [70]:
train_set = features[:len(y)]
test_set = features[len(train_set):]
train_set.head()

Unnamed: 0,Amount,Value,PricingStrategy,CurrencyCode_UGX,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_retail,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
0,1000.0,1000,2,1,1,0,0,0,0,0,0,0,0,0
1,-20.0,20,2,1,0,0,1,0,0,0,0,0,0,0
2,500.0,500,2,1,1,0,0,0,0,0,0,0,0,0
3,20000.0,21800,2,1,0,0,0,0,0,0,0,0,0,1
4,-644.0,644,2,1,0,0,1,0,0,0,0,0,0,0


In [71]:
test_set.head()

Unnamed: 0,Amount,Value,PricingStrategy,CurrencyCode_UGX,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_retail,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
95662,1000.0,1000,4,1,1,0,0,0,0,0,0,0,0,0
95663,2000.0,2000,2,1,0,0,1,0,0,0,0,0,0,0
95664,-50.0,50,2,1,0,0,1,0,0,0,0,0,0,0
95665,3000.0,3000,4,1,1,0,0,0,0,0,0,0,0,0
95666,-60.0,60,2,1,0,0,1,0,0,0,0,0,0,0


In [0]:
X = train_set
test_X = test_set

In [0]:
X_train = X
X_test = test_X
y_train = y

In [0]:
X = pd.concat([X_train, y_train], axis = 1)

In [0]:
non_fraud = X[X['FraudResult']==0]
fraud = X[X['FraudResult']==1]

In [0]:
samples = len(non_fraud)
sampled_fraud = resample(fraud,replace = True, n_samples=samples)

In [0]:
sampled_fraud = pd.concat([non_fraud, sampled_fraud])

In [0]:
y_train = sampled_fraud['FraudResult']
X_train = sampled_fraud.drop('FraudResult', axis = 1)

In [91]:
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
pred= lr.predict(X_train)

In [93]:
from sklearn import metrics

metrics.f1_score(y_train, pred)

0.6666666666666666

In [94]:
df = pd.DataFrame({'TransactionId': test_id, 'FraudResult':lr.predict(X_test)})
df.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,1
1,TransactionId_95109,1
2,TransactionId_47357,1
3,TransactionId_28185,1
4,TransactionId_22140,1


In [0]:
df.to_csv('Hannes_EDSA_submission_6th.csv', index=False)

In [96]:
df.shape

(45019, 2)

In [97]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 400,
                             n_jobs =10,
                             bootstrap = True,
                             max_depth = 10,
                             min_samples_split = 10).fit(X_train, y_train)

rfc_pred = rfc.predict(X_train)

metrics.f1_score(y_train, rfc_pred)

0.9929184022243012

In [99]:
df = pd.DataFrame({'TransactionId': test_id, 'FraudResult':rfc.predict(X_test)})
df.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0
1,TransactionId_95109,0
2,TransactionId_47357,0
3,TransactionId_28185,0
4,TransactionId_22140,0


In [0]:
df.to_csv('Hannes_EDSA_final.csv', index=False)