In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from src import find_empty_columns,  find_duplicates

# Load raw data

In [None]:
filename = 'data/raw/targeting_model_data.csv' 
data = pd.read_csv(filename)

In [None]:
data.info()

In [None]:
print('Number of columns: {}'.format(data.shape[1]))
print('Number of rows: {}'.format(data.shape[0]))

In [None]:
# quick look at data 
data.head()

In [None]:
# Composition of the target data. The counts demonstrate that the classes are
# imbalanced, so accuracy alone is not a good metric for assessing model performance.
data['FLOZVPMFT4626A'].value_counts()

# Clean up data

## Find and drop duplicate columns

In [None]:
duplicates = find_duplicates(data)

In [None]:
print('Number of duplicate columns dropped: {}'.format(len(duplicates)))

In [None]:
# data frame excluding dropped columns 
df = data.drop(columns=duplicates)

In [None]:
df.head()

## Drop columns that have > 80% missing values 

In [None]:
def find_empty_columns(df, threshold=0.80):
    """Return the names of columns that consist mostly of missing values.

    A column is reported when its fraction of missing values (number of
    NaN entries divided by the number of rows) is strictly greater than
    ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with columns to search
    threshold : float (optional)
        fraction of missing values above which a column is reported
        (default 0.80)

    Returns
    -------
    empty_columns : list
        column names, in dataframe order, whose missing fraction exceeds
        the threshold.
    """
    n_rows = len(df)
    return [
        name for name in df
        if df[name].isna().sum() / n_rows > threshold
    ]

In [None]:
empty_columns = find_empty_columns(df)

In [None]:
print('Number of mostly empty columns dropped: {}'.format(len(empty_columns)))

In [None]:
# data frame excluding dropped columns 
df = df.drop(columns=empty_columns)

In [None]:
df.head()

## For data type int64 or float64 drop columns with low variance
motivated by the Variance Threshold function https://scikit-learn.org/stable/modules/feature_selection.html

In [None]:
def find_low_variance(data, threshold=0.18):
    """Finds numeric columns with low variance.

    Takes a dataframe as input. Creates a list of the float64 / int64 columns
    whose variance is strictly below the threshold. Columns of any other dtype
    are ignored. These columns can then be dropped from the original dataframe.

    Parameters
    ---------------
    data : pandas.DataFrame
        dataframe with columns to search
    threshold : float (optional)
        variance below which a column is reported (default 0.18)

    Returns
    ---------------
    low_variance_columns : list
        a list of columns with low variance, in dataframe order.
    """
    low_variance_columns = []
    for column in data.columns:
        # only plain float64 / int64 columns are considered
        if data[column].dtype in ('float64', 'int64'):
            if data[column].var() < threshold:
                low_variance_columns.append(column)
    return low_variance_columns

In [None]:
# data frame excluding dropped columns; `df` is rebound (no in-place drop),
# the same way the duplicate / empty column clean-up cells do it
low_var_columns = find_low_variance(df)
df = df.drop(columns=low_var_columns)

In [None]:
print('Number of low variance columns dropped: {}'.format(len(low_var_columns)))

In [None]:
df.head()

## Drop categorical columns with low variance by converting labels to dummy variables and summing their variances

In [None]:
def find_low_var_categories(data, threshold=0.18):
    """Finds categorical columns with low variance.

    Takes a dataframe as input. Each object-dtype column is expanded into
    dummy (indicator) variables; the variances of those indicators are summed
    and the column is reported when the sum is strictly below the threshold.
    These columns can then be dropped from the original dataframe.

    Parameters
    ---------------
    data : pandas.DataFrame
        dataframe with columns to search
    threshold : float (optional)
        summed indicator variance below which a column is reported
        (default 0.18)

    Returns
    ---------------
    low_variance_columns : list
        a list of categorical columns with low variance, in dataframe order.
    """
    low_variance_columns = []
    for column in data.columns:
        if data[column].dtype == 'O':
            # Read from the `data` argument, not from a notebook-level global.
            # dtype=float keeps the indicators numeric so .var() does not
            # depend on the default dummy dtype of the installed pandas.
            dummies = pd.get_dummies(data[column], dtype=float)
            if dummies.var().sum() < threshold:
                low_variance_columns.append(column)
    return low_variance_columns
    

In [None]:
# `df` is rebound (no in-place drop), the same way the duplicate / empty
# column clean-up cells do it
low_v_categories = find_low_var_categories(df)
df = df.drop(columns=low_v_categories)

In [None]:
print('Number of low variance category columns dropped: {}'.format(len(low_v_categories)))

In [None]:
df.head()

## Replace NaN values in numerical columns with median value of series 

In [None]:
def replace_NaN(data):
    """Replace NaN values with the column median in every non-object column.

    Takes a dataframe as input. Iterates through its columns and, for each
    column whose dtype is not object, replaces NaN values with the median of
    that column. The dataframe passed in is modified and also returned.

    Parameters
    ---------------
    data : pandas.DataFrame
        dataframe with columns to search

    Returns
    ---------------
    data : pandas.DataFrame
        the same dataframe object, with NaNs in non-object columns replaced.
    """

    for column in data.columns:
        if data[column].dtype != 'O':
            # Assign the filled column back to the frame. Calling
            # fillna(inplace=True) on data[column] is chained assignment:
            # pandas warns about it and, with Copy-on-Write enabled, the
            # frame itself is left unchanged.
            data[column] = data[column].fillna(data[column].median())

    return data

In [None]:
df= replace_NaN(df)

In [None]:
df.head()

## Find date like columns

In [None]:
def time_like(df):
    """Lists columns that are potentially time like.

    Takes a dataframe as input and looks at every column whose dtype is not
    object. A column is reported when its mean divided by 1000 is greater
    than 1.

    Parameters
    ---------------
    df: pandas.DataFrame
        dataframe with columns to search

    Returns
    ---------------
    time_columns : list
        a list of potential time like columns, in dataframe order.
    """
    return [
        name for name in df
        if df[name].dtype != 'O' and df[name].mean() / 1000 > 1
    ]

In [None]:
time_columns = time_like(df)

In [None]:
df[time_columns].head()

In [None]:
# Visually inspecting items that are not potential time stamps
not_time_columns = ['ibe8588DPLHE7435F', 'ibe8840PMLTL7040B']
real_time_columns = [item for item in time_columns if item not in not_time_columns]

In [None]:
def convert_to_timestamp(df, time_columns):
    """Converts date-like numeric columns to integer timestamps.

    Takes a dataframe and a list of columns as input. For each listed column
    the number of digits of the first value (read by position, so any index
    works) selects the conversion:

    * 4 digits: parsed as a year (``%Y``)
    * 5 digits: parsed as year + month (``%Y%m``)
    * 6 digits: truncated to the first 4 digits and parsed as a year

    The parsed datetimes are stored as int64 nanoseconds since the epoch.
    Columns whose first value has any other number of digits are left
    unchanged. The dataframe passed in is modified and also returned.

    Parameters
    ---------------
    df: pandas.DataFrame
        dataframe with columns to convert
    time_columns: list
        list of time like columns

    Returns
    ---------------
    df: pandas.DataFrame
        the same dataframe object, with converted time columns
    """
    # digit count of the first value -> strptime format
    formats = {4: '%Y', 5: '%Y%m', 6: '%Y'}

    # nothing to inspect or convert in an empty frame
    if len(df) == 0:
        return df

    for column in time_columns:
        # .iloc[0] reads the first row by position; df[column][0] would look
        # up the index label 0 and fail on a frame without that label
        n_digits = len(str(int(df[column].iloc[0])))
        fmt = formats.get(n_digits)
        if fmt is None:
            continue

        as_text = df[column].astype(int).astype(str)
        if n_digits == 6:
            # keep only the year part
            as_text = as_text.str[:4]

        stamps = pd.to_datetime(as_text, format=fmt)
        # pin nanosecond resolution so the integer values do not depend on
        # the datetime unit pandas picks when parsing
        df[column] = stamps.astype('datetime64[ns]').astype(np.int64)

    return df

In [None]:
df = convert_to_timestamp(df, real_time_columns)

In [None]:
df[ real_time_columns].head()

## Save cleaned up dataframe as csv

In [None]:
filename = 'data/interim/cleaned_columns.csv'
df.to_csv(path_or_buf=filename, index=False)

# Load cleaned data 

In [None]:
filename = 'data/interim/cleaned_columns.csv'
df = pd.read_csv(filename)
df.head()

# Preprocess data

In [None]:
# Load data 
# Features: dummy-encoded columns at positions 1-9; target: the column at position 0.
# NOTE(review): the selection is purely positional — TODO confirm that column 0 of
# the cleaned frame is the intended target and that only nine columns are meant to
# be used as features.
X =pd.get_dummies(df.iloc[:,1:10]).values
target = df.iloc[:,0].values

In [None]:
# split the data; stratify on the target so the train and test splits keep
# the same class proportions (the target is imbalanced)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, target, random_state=2, stratify=target)
print(Xtrain.shape, Xtest.shape)

In [None]:
# preprocess data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# the scaling statistics are learned from the training split only,
# then the same transformation is applied to the test split
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [None]:
# verify preprocess is working correctly 
print('Mean of random array: {}'.format(round(Xtrain[:,8].mean())))
print('Std of random array: {}'.format(round(Xtrain[:,8].std())))

# Models 

In [None]:
# label data
target_names = ['0', '1']

## Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

In [None]:
accuracy_score(ytest, ypred)

In [None]:

print(classification_report(ytest, ypred,target_names=target_names))

## Support Vector Machine

## Stochastic Gradient Descent (SGD)
https://scikit-learn.org/stable/tutorial/machine_learning_map/

https://scikit-learn.org/stable/modules/sgd.html#classification


In [None]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="huber", max_iter=20)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

In [None]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
print(classification_report(ytest, ypred,target_names=target_names))

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD classifier again, this time with class_weight='balanced'
clf = SGDClassifier(loss="huber", max_iter=20, class_weight='balanced')
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
# print the accuracy explicitly: a bare expression that is not the last line
# of a cell is evaluated but never displayed
print('Accuracy: {}'.format(accuracy_score(ytest, ypred)))
print(classification_report(ytest, ypred,target_names=target_names))

## Tree-based classifiers

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=11)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
# print the accuracy explicitly: a bare expression that is not the last line
# of a cell is evaluated but never displayed
print('Accuracy: {}'.format(accuracy_score(ytest, ypred)))
print(classification_report(ytest, ypred,target_names=target_names))

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier( n_estimators=100, n_jobs=-1, class_weight='balanced')
clf.fit(Xtrain, ytrain)

In [None]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
print(classification_report(ytest, ypred,target_names=target_names))

### Boosted Trees 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()

In [None]:
clf.fit(Xtrain, ytrain)

In [None]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
print(classification_report(ytest, ypred,target_names=target_names))

## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# train the model
clf = GaussianNB()
clf.fit(Xtrain, ytrain)

In [None]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
print(classification_report(ytest, ypred,target_names=target_names))