## Notes

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import scikitplot
import datetime
import seaborn as sns
import datetime
import random
import uuid
sns.set()

import os, sys
# Make modules one directory up importable from this notebook.
# NOTE: '__file__' is a string literal here, so dirname() returns '' and
# p is simply '..'.
p = os.path.join(os.path.dirname('__file__'), '..')
sys.path.append(p)

## Dataset

In [None]:
def get_data(path='../data/fakedata.csv'):
    """Load the dataset and normalise its column names.

    Args:
        path: CSV file to read. Defaults to the notebook's sample data file,
            so existing ``get_data()`` calls behave as before.

    Returns:
        DataFrame indexed by ``id``, with ``id`` also exposed as a regular
        column and the ``class`` column renamed to ``label``.
    """
    df = pd.read_csv(path, index_col='id')
    # Expose the identifier as a regular column as well as the index.
    df['id'] = df.index
    # 'class' is a reserved word (df.class is a syntax error), hence 'label'.
    return df.rename(columns={'class': 'label'})

# Load the working frame used by the cells that follow and preview it.
df = get_data()
df.head()

In [None]:
# Column groups referenced by the plotting and modelling cells.
target = 'label'
numerical = ['amount']
identifiers = ['id', 'user', 'name']
categorical = ['country', 'device']

In [None]:
# Shuffle and sample 50% of the rows (no random_state, so every run differs)
features = df.sample(frac=.5)
# Preview the sample itself rather than the untouched source frame.
features.head()

## Summary Stats

In [None]:
# Only the last expression of a cell is rendered, so the intermediate
# summaries are printed explicitly.
df.info()  # prints its report itself
print(df.describe())
print(df.nunique())
print(df.isnull().sum())  # null count
print(df.label.value_counts())
df.describe(include='O')  # Categorical columns

## Time


### Time Features (Hour, Min, Day)

In [None]:
# POSIX timestamps -> naive datetimes in the machine's local timezone.
df['datetime'] = df.time.apply(datetime.datetime.fromtimestamp)
# Bucket by flooring: rounding would move e.g. 13:00 into the *next* day and
# 10:45 into the 11:00 bucket, disagreeing with the 'hour' feature below.
df['time_hour'] = df.datetime.dt.floor('h')
df['time_day'] = df.datetime.dt.floor('D')
df['hour'] = df.datetime.dt.hour
df['minute'] = df.datetime.dt.minute
df['daywk'] = df.datetime.dt.dayofweek  # Monday=0 ... Sunday=6
df.head()

## Plotting

### Time Series

In [None]:
# Positive rate per day: number of positives divided by number of rows.
grouped = df.groupby('time_day')
positives, totals = grouped[target].sum(), grouped[target].count()
rates = positives / totals
rates.plot(kind='line');

### Categorical (Bar)

In [None]:
# One bar chart of the positive rate per category value, per column.
for col in categorical:
    col_target = df.groupby(col)[target]
    col_rates = col_target.sum() / col_target.count()
    print(col_rates)
    col_rates.plot(kind='bar')
    plt.show()

### Categorical (Count)

In [None]:
# Raw counts per category value, split by target class.
for col in categorical:
    counts = df[col].value_counts()
    print(counts)
    ax = sns.countplot(x=col, hue=target, data=df)
    ax.set_title(col)
    plt.show()

### Histograms (Overlap)

In [None]:
for c in numerical:
    # Keep only values below this column's own 30th percentile (the cutoff
    # used to be taken from 'amount' for every column).
    q = df[c].quantile(q=.3)
    below = df[df[c] < q]
    sns.distplot(below.loc[below[target] == 0, c], bins=20, label='0')
    sns.distplot(below.loc[below[target] == 1, c], bins=20, label='1')
    plt.legend()
    plt.title(c)
    plt.show()  # separate figure per column instead of stacking them all

In [None]:
# Both classes drawn on one shared axis, semi-transparent so they overlap.
fig, ax = plt.subplots(nrows=1, ncols=1)
for label_value, opacity in ((0, .5), (1, .6)):
    subset = df[df.label == label_value]
    subset.hist(column='amount', bins=20, ax=ax, label=str(label_value), alpha=opacity)
plt.legend();

### Histograms (Side-by-side)

In [None]:
# One 'amount' histogram per label value, laid out as columns.
g = sns.FacetGrid(df, col='label')
g.map(plt.hist, 'amount', bins=20, alpha=.8);

In [None]:
# Facetgrid with value combos + categories:
# one 'amount' histogram per (country row, label column) pair.
grid = sns.FacetGrid(df, col='label', row='country')
grid.map(plt.hist, 'amount', bins=20, alpha=.8)
grid.add_legend();

## Covariance

### Correlation Matrix and Heatmap

In [None]:
# Numeric columns only (this includes the derived integer time features).
# Selecting them explicitly keeps corr() working on pandas versions that no
# longer drop string/datetime columns silently.
corr = df.select_dtypes(include=[np.number]).corr()
fig = plt.figure(figsize=(14,14))
sns.heatmap(corr, annot=True);

### Most Correlated Features

In [None]:
# The target column is called 'label' after get_data(); 'class' no longer exists.
# The target itself tops the first list with a correlation of 1.
print(corr[target].nlargest(n=5))   # most positively correlated
corr[target].nsmallest(n=5)         # most negatively correlated

## Aggregation

### Basic

In [None]:
# Multiple statistics at once; transposed so each column becomes a row
df[['amount', 'daywk']].describe().T

In [None]:
# Count occurrences of each distinct value
df['country'].value_counts()

In [None]:
# Mode (most frequent value) of each column
df[['device', 'country']].mode().T

### Custom Aggregations

In [None]:
# Per-user aggregation spec: {source column: [(output name, aggregator), ...]}.
# Lists of (name, func) pairs are used instead of nested dicts because the
# nested-dict "renamer" form was removed from pandas; the resulting columns are
# still a (source column, output name) MultiIndex.
aggregations = {
    # The target column is 'label' after get_data() renames 'class'.
    'label': [
        ('pos', 'sum'),
        ('total', 'count'),
        ('rate', lambda x: x.sum() / x.count()),
    ],
    'amount': [
        ('avg_amt', 'mean'),
        ('med_amt', 'median'),
        ('std_amt', 'std'),
        # NOTE(review): 'mode_smt' looks like a typo for 'mode_amt'; the name is
        # kept unchanged because it becomes an output column name.
        ('mode_smt', lambda x: x.value_counts().index[0]),
    ],
    'country': [
        ('primary_country', lambda x: x.value_counts().index[0]),
        ('n_countries', lambda x: x.nunique()),
    ],
    'device': [
        ('primary_device', lambda x: x.value_counts().index[0]),
        ('n_devices', lambda x: x.nunique()),
    ],
    'datetime': [
        ('account_created', 'min'),
        # 'datetime' holds naive local times (built with fromtimestamp), so it
        # is compared against local now() rather than utcnow().
        ('account_age', lambda x: (datetime.datetime.now() - min(x)).days),
    ],
}

In [None]:
# Build one row per user from the aggregation spec.
users = df.groupby('user', as_index=False).agg(aggregations)
# Drop the outer (source-column) level so only the statistic names remain.
users.columns = users.columns.droplevel()
# NOTE(review): presumably the group key is left with an empty name after
# droplevel; this restores it to 'user' — confirm on the pandas version in use.
users = users.rename(columns={'':'user'})
users.head()

### Assign columns with custom function

In [None]:
# Earliest event per user. The column is selected *before* aggregating so the
# minimum is not computed (and cannot fail) for every other column.
gb = df.groupby('user')['datetime'].min()
def getvalue(x):
    """Return the earliest 'datetime' recorded for user ``x`` (KeyError if unknown)."""
    return gb[x]

df['created'] = df['user'].map(getvalue)
df.head()

### Assign Column with Transform

In [None]:
# Broadcast each user's earliest datetime back onto every one of their rows.
# The string alias avoids the deprecation warning for passing np.min.
df['created'] = df.groupby('user')['datetime'].transform('min')
df.head()

## Group by

### Single column

In [None]:
# What's the positive rate by country?
# The target column is named 'label' after get_data(); 'class' raises KeyError.
df[['country', target]].groupby(
    ['country'], as_index=False).mean().sort_values(
    by=target, ascending=False)

### Multiple Columns

In [None]:
# Median of every numeric column per (country, label) pair; non-numeric
# columns are excluded explicitly because newer pandas raises on them.
df.groupby(['country', 'label']).median(numeric_only=True)

### Loop through groups

In [None]:
# Walk the day-of-week groups of the first 15 rows.
group = df.head(15).groupby('daywk')
for day, rows in group:
    print(rows[categorical])
    top_country = rows.country.mode()
    print('name', day, "primary_ctry", top_country)

### Get Min/Max/Sum for Group

In [None]:
# Multiple columns: per-day minimum of every column, transposed for reading.
# The 'min' alias replaces np.min, which pandas deprecates as a groupby argument.
group = df.groupby('daywk')
group.aggregate('min').T

In [None]:
# Single Column: summary statistics of 'created' within each group
group.created.describe()

## Drop/Rename Columns

In [None]:
# Rename (there and back again, so the frame ends up unchanged).
# Uses 'label' because get_data() already renamed 'class'; renaming a missing
# column is a silent no-op and demonstrates nothing.
df.rename(columns={'label': 'label1'}, inplace=True)
df.rename(columns={'label1': 'label'}, inplace=True)

# Drop
df.drop(labels=['time'], axis=1, inplace=True)
df.head()

## Null Handling

### Drop Null

In [None]:
# Specific column: reload, then drop rows whose 'country' is null
df = get_data()
df = df.dropna(how='any', subset=['country'])

In [None]:
# Any column: drop rows that contain at least one null
df = df.dropna(how='any')

### Default Values

In [None]:
# The three strategies are alternatives: once the first one has filled the
# gaps, the later ones find nothing left to fill.
# Results are assigned back instead of using inplace=True on a column
# selection, which may act on a temporary copy and leave df untouched.

# Constant
df['label'] = df['label'].fillna(value=-1)

# Median
df['label'] = df['label'].fillna(value=df['label'].median())

# Multiple (per-column defaults)
df = df.fillna({
    "label": -1,
    "country": "unkown",
})

### Conditional Values

In [None]:
# Replace out-of-range amounts (above 2000 or below 10) with 1000.
# The two conditions must be OR-ed: no value is both > 2000 and < 10, so the
# previous '&' never replaced anything.
df['amount'] = np.where((df['amount'] > 2000) | (df['amount'] < 10), 1000, df['amount'])

### Bfill / Ffill

In [None]:
# Backward fill first (next valid value), then forward fill whatever is left
# at the tail. Uses the dedicated methods and assigns back: fillna(method=...)
# is deprecated and inplace on a column selection may not reach df.
df['label'] = df['label'].bfill()
df['label'] = df['label'].ffill()

## Encoding

### One-Hot Encoding

In [None]:
# Used when order doesn't matter
df = get_data()
# Replaces 'country' with one indicator column per value, prefixed 'ctr_'
df = pd.get_dummies(df, columns=['country'], prefix=['ctr'])
df.head()

### Ordinal Encoding

In [None]:
# Used when order matters (ordinal)
# Reload first: the one-hot cell before this one replaces 'country' with dummy
# columns, so the column would otherwise be missing here.
df = get_data()
df['country'] = df['country'].astype('category')
# NOTE(review): codes follow the category order pandas infers (presumably
# sorted values), not a meaningful ranking — set categories explicitly if the
# order matters.
df['country'] = df['country'].cat.codes
df.head()

### Binning

In [None]:
# Used to group continuous values into intervals
df = get_data()
# Split 'amount' into 5 bins labelled 0..4
df['amount_band'] = pd.cut(df['amount'], 5, labels=[0,1,2,3,4])
df.head()

## Normalization

### Calculate subgroup statistics

In [None]:
# Standardization by subgroup statistics
# df may have been reloaded since the time features were built, so recreate
# the grouping key when it is missing.
if 'daywk' not in df.columns:
    df['daywk'] = df.time.apply(datetime.datetime.fromtimestamp).dt.dayofweek
group = df.groupby('daywk')
zscore = lambda x: (x - x.mean()) / x.std()
# z-scores are only defined for numeric columns; string columns would raise.
group[numerical].transform(zscore).head()

### Standardization

In [None]:
# Subtract mean and divide by standard deviation
from sklearn.preprocessing import StandardScaler

# Learn mean/std from X and rescale it in a single step.
# NOTE(review): X is expected to be a numeric feature matrix defined earlier — confirm.
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Normalization (Min/Max)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn per-feature min/max from X and rescale it in a single step.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

### Binarization

In [None]:
# Binarization (continuous --> 0/1)
from sklearn.preprocessing import Binarizer
# Values above the threshold become 1, everything else 0.
binarizer = Binarizer(threshold=0.0)
X = binarizer.fit_transform(X)

## Feature Engineering

### Conditional

In [None]:
df = get_data()

# np.where: new 0/1 flag column, 1 where amount exceeds 10
df['new'] = np.where(df['amount'] > 10, 1, 0)

# df.loc: overwrite 'amount' in place for the rows matching the condition
df.loc[ df['amount'] > 10, 'amount'] = 1

df.head()

### Map

In [None]:
# Translate the 0/1 target into text; values missing from the mapping become NaN.
label_names = {0: 'false', 1: 'true'}
df['yo'] = df[target].map(label_names)
df.head()

### Membership

In [None]:
# 1 when the row's country is in the list, else 0
popular = ['cn', 'us']
df['popular_country'] = df.country.isin(popular).astype(int)

## Train/Test Split

### Convert to Numpy

In [None]:
def get_Xy(df, target='label'):
    """Split a frame into a feature matrix and a target vector.

    Args:
        df: DataFrame containing the target column.
        target: Name of the target column (defaults to 'label').

    Returns:
        (X, y): ``X`` is a NumPy array of every column except ``target``, in
        their original order; ``y`` is a NumPy array of the target values.
    """
    # .values replaces DataFrame.as_matrix(), which was removed from pandas.
    X = df.loc[:, df.columns != target].values
    y = df[target].values
    return X, y

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): X and y must already exist (e.g. from get_Xy) — confirm.
# Hold out 20% as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, shuffle=True # optional shuffle
)
# Carve a validation set out of the remaining training data (25% of 80% =
# 20% overall): the modelling cells score on X_val / y_val, which were never
# created before.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, shuffle=True
)

### KFold Split

In [None]:
from sklearn.model_selection import StratifiedKFold
# random_state only has meaning when shuffling; current scikit-learn raises a
# ValueError if it is set together with shuffle=False, so it is omitted.
fold = StratifiedKFold(n_splits=5, shuffle=False)

### Time Series Split

In [None]:
from sklearn.model_selection import TimeSeriesSplit

def get_train_val_test(df, columns):
    """Split a frame chronologically into 60% train, 20% validation, rest test.

    Args:
        df: DataFrame with a 'time' column used for ordering.
        columns: Columns to keep in the returned frames.

    Returns:
        (train, val, test): consecutive, non-overlapping slices of ``df``
        sorted by 'time', so validation and test rows are never older than
        training rows.
    """
    # Sort the frame that was passed in (it used to be ignored in favour of a
    # fresh get_data() reload).
    ordered = df.sort_values(by='time')[columns]
    trn_split = int(len(ordered) * .6)
    val_split = trn_split + int(len(ordered) * .2)

    train = ordered.iloc[:trn_split]
    val = ordered.iloc[trn_split:val_split]
    # Everything left over, so rounding never drops rows.
    test = ordered.iloc[val_split:]

    return train, val, test

### Resample

In [None]:
from sklearn.utils import resample

def get_resampled(df, min_ratio=.1, neg_frac=.5, random_state=None):
    """Rebalance a binary-labelled frame by under- and over-sampling.

    Args:
        df: DataFrame with a 0/1 'label' column.
        min_ratio: Positives to draw, as a fraction of the kept negatives.
        neg_frac: Fraction of negative rows to keep (undersampling).
        random_state: Seed forwarded to both sampling steps for reproducibility.

    Returns:
        DataFrame with the kept negatives followed by the resampled positives.

    Raises:
        ValueError: if positives are requested but ``df`` contains none.
    """
    positive = df.loc[df.label == 1]
    negative = df.loc[df.label == 0].sample(
        frac=neg_frac, random_state=random_state)  # undersample
    n_pos = int(min_ratio * len(negative))
    if n_pos > 0 and positive.empty:
        raise ValueError("cannot oversample: no rows with label == 1")
    # Draw with replacement so n_pos may exceed the number of positives.
    positive = positive.sample(
        n=n_pos, replace=True, random_state=random_state)  # oversample
    return pd.concat([negative, positive])

## Modeling

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularisation strength: lower C = stronger regularisation.
model = LogisticRegression(C=1.0).fit(X_train, y_train)
# Mean accuracy on the validation split.
model.score(X_val, y_val)

In [None]:
# Hard class predictions and per-class probabilities for the validation split
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)
model.coef_ # pos = increase log odds of label

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small forest of 10 trees, fitted and scored in the same way as the other models.
model = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
model.score(X_val, y_val)

In [None]:
# Feature importance (top 10)
# Name the features the same way get_Xy() builds X: every column except the
# target, in order. Slicing off the last column is only right when the target
# happens to be last, and get_data() appends 'id' after it.
# NOTE(review): assumes the model was trained on get_Xy(df) for this df — confirm.
feature_names = df.columns[df.columns != target]
pd.Series(
    model.feature_importances_,
    index=feature_names).sort_values(
    ascending=False).head(10)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Classify by majority vote among the 3 nearest training points.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
model.score(X_val, y_val)

### K-Means

In [None]:
from sklearn.cluster import KMeans

# Unsupervised clustering into 4 groups; the fit call is left disabled, so
# `model` is an unfitted estimator after this cell.
model = KMeans(n_clusters=4)
#model.fit(X_train)

## Grid Search / CV

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# http://scikit-learn.org/stable/modules/cross_validation.html#multimetric-cross-validation
# http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection


### Grid Search

In [None]:
# http://scikit-learn.org/stable/modules/grid_search.html
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module has been removed.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# The estimator to tune ('knn' was previously never defined).
knn = KNeighborsClassifier()
params = {"n_neighbors": np.arange(1,3), "metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn, param_grid=params)
grid.fit(X_train, y_train)
print(grid.best_score_)  # mean cross-validated score of the best combination
print(grid.best_estimator_.n_neighbors)

## Metrics

In [None]:
# Accuracy (on the training data)
# NOTE(review): `model` is whatever estimator was assigned last — confirm it is fitted.
model.score(X_train, y_train)

In [17]:
# Precision/Recall
from sklearn.metrics import precision_score, recall_score

In [18]:
# F1 score
from sklearn.metrics import f1_score

In [None]:
# F2 Score (beta=2 weights recall higher than precision)
# NOTE(review): tst_preds is not defined anywhere in this notebook —
# presumably model.predict(X_test); confirm.
from sklearn.metrics import fbeta_score
fbeta_score(y_test, tst_preds, pos_label=1, beta=2)

In [None]:
# Classification Report (text summary of per-class metrics)
# NOTE(review): tst_preds is not defined anywhere in this notebook — confirm its source.
from sklearn.metrics import classification_report
print(classification_report(y_test, tst_preds))

In [None]:
# Confusion Matrix (plotted with scikit-plot; the sklearn import is not used here)
# NOTE(review): tst_preds is not defined anywhere in this notebook — confirm its source.
from sklearn.metrics import confusion_matrix
scikitplot.metrics.plot_confusion_matrix(y_test, tst_preds);

In [None]:
# ROC Curve - Recall (TPR) against fall-out (FPR = 1 - specificity)
# NOTE(review): tst_probs is not defined anywhere in this notebook —
# presumably model.predict_proba(X_test); confirm.
import scikitplot
scikitplot.metrics.plot_roc_curve(y_test, tst_probs);

In [None]:
# Precision/Recall Curve
# NOTE(review): tst_probs is not defined anywhere in this notebook —
# presumably model.predict_proba(X_test); confirm.
import scikitplot
# The module is imported as `scikitplot`; the `skplt` alias was never defined.
scikitplot.metrics.plot_precision_recall_curve(y_test, tst_probs);

## Similarity

### Distance Metrics

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

# Given a matrix of numerical vectors
# Calculate the similarity of each to all other vectors
# NOTE: cosine_similarity is a similarity (higher = closer), while
# euclidean_distances is a distance (lower = closer) despite the variable name.
cos_sim = cosine_similarity(X, X)
euc_sim = euclidean_distances(X, X)

In [None]:
def get_similar_users(df, simmat, k=5):
    """Rank every row's neighbours from most to least similar.

    Args:
        df: Not used; kept so existing calls keep working.
            NOTE(review): presumably the frame whose rows produced ``simmat``.
        simmat: Square matrix where entry (i, j) scores how similar item j is
            to item i and larger means more similar (e.g. cosine similarity).
            For a distance matrix, pass its negation.
        k: Number of neighbours to keep at each end of the ranking.

    Returns:
        A list with one dict per row of ``simmat`` holding the row position
        ('idx'), the positions and scores of its ``k`` most similar other rows
        ('most_similar_idx', 'most_similar_val', best first) and of its ``k``
        least similar rows ('least_similar_idx', 'least_similar_val').
        Previously the rankings were computed and then discarded.
    """
    simmat = np.asarray(simmat)
    results = []
    for idx, row in enumerate(simmat):
        ranked = np.argsort(row)[::-1]
        # Drop the row itself explicitly; with ties it is not guaranteed to
        # sort into first place.
        ranked = ranked[ranked != idx]
        n_keep = max(0, min(k, len(ranked)))

        most_sim_idx = ranked[:n_keep]
        least_sim_idx = ranked[len(ranked) - n_keep:]
        results.append({
            'idx': idx,
            'most_similar_idx': most_sim_idx,
            'most_similar_val': row[most_sim_idx],
            'least_similar_idx': least_sim_idx,
            'least_similar_val': row[least_sim_idx],
        })
    return results
    
# Rank neighbours for every row of the cosine-similarity matrix
get_similar_users(df, cos_sim)

### KNN