In [1]:
# General data science libraries
import pandas as pd
import numpy as np

# Preprocessing, sampling, cross validation, and data manipulation packages
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV

# Machine learning models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import tensorflow_hub as hub # For importing sentence encoder model

# Other libraries
import re
from stop_words import get_stop_words
import gc
import sys

# Functions for Checking Memory Usage

In a big-data environment these functions are useful in checking RAM usage and knowing when to manually use Garbage Collector

In [2]:
# Script to view Memory for Garbage Collection 
def obj_size_fmt(num):
    """Function that allows for human readable memory size of an object
    Parameters: number
    Returns: Size taken up in memort in a human readable way"""
    if num<10**3:
        return "{:.2f}{}".format(num,"B")
    elif ((num>=10**3)&(num<10**6)):
        return "{:.2f}{}".format(num/(1.024*10**3),"KB")
    elif ((num>=10**6)&(num<10**9)):
        return "{:.2f}{}".format(num/(1.024*10**6),"MB")
    else:
        return "{:.2f}{}".format(num/(1.024*10**9),"GB")

def memory_usage():
    """Function that shows a dataframe of top objects memory usage
    Parameters: none
    Returns: Dataframe showing objects taking up the most memory"""
    memory_usage_by_variable=pd.DataFrame({k:sys.getsizeof(v)\
    for (k,v) in globals().items()},index=['Size'])
    memory_usage_by_variable=memory_usage_by_variable.T
    
    memory_usage_by_variable=memory_usage_by_variable.sort_values(by='Size',ascending=False).head(10)
   
    memory_usage_by_variable['Size']=memory_usage_by_variable['Size'].apply(lambda x: obj_size_fmt(x))
    
    return memory_usage_by_variable


# Categorical and Numeric Preprocessing

1. Importing the data from the excel files
2. Dropping useless columns and dealing with null values
3. Encoding categorical variables
4. Ensuring that training and unlabeled dataset have identical columns
5. Changing datatypes as necessary

In [3]:
# Importing training set and unlabeled data
df = pd.read_excel('CAC-2022-Training-Data-Set-New.xlsx')
df_test = pd.read_excel('CAC-2022-Test-Data-Set-New.xlsx')

# Dropping useless columns from both datasets
df.drop(['cdf_seq_no','payment_reporting_category'], axis=1, inplace=True)
df_test.drop(['cdf_seq_no','payment_reporting_category'], axis=1, inplace=True)

# Dealing with null values by creating placeholder variable for both training and unlabeled
df['merchant_cat_code'] = df['merchant_cat_code'].fillna(-1)
df['merchant_cat_code'] = df['merchant_cat_code'].astype(str)
df['db_cr_cd'] = df['db_cr_cd'].fillna('O')

df_test['merchant_cat_code'] = df['merchant_cat_code'].fillna(-1)
df_test['merchant_cat_code'] = df['merchant_cat_code'].astype(str)
df_test['db_cr_cd'] = df['db_cr_cd'].fillna('O')


# Encoding all categorical variables using a column transformer
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False), ['sor','merchant_cat_code', 'db_cr_cd', 'payment_category']),
    remainder='passthrough')

# Applying the transformer to both the training and the unlabled set
transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names())

transformed_test = transformer.fit_transform(df_test)
transformed_df_test = pd.DataFrame(transformed_test, columns=transformer.get_feature_names())



# Dropping the placeholder columns from both datasets
transformed_df.drop(['onehotencoder__x2_O','onehotencoder__x1_-1.0'],axis=1,inplace=True)
transformed_df_test.drop(['onehotencoder__x2_O','onehotencoder__x1_-1.0'],axis=1,inplace=True)



# Ensuring that the training and unlabeled have the same columns by dropping training-set-only columns
transformed_list_x1 = []
for i in transformed_df.columns:
    if i[0:17] == 'onehotencoder__x1':
        transformed_list_x1.append(i)
        
transformed_list_test_x1 = []
for i in transformed_df_test.columns:
    if i[0:17] == 'onehotencoder__x1':
        transformed_list_test_x1.append(i)
        

del_list = np.setdiff1d(transformed_list_x1,transformed_list_test_x1)
transformed_df.drop(del_list,axis=1,inplace=True)

# Double checking to make sure that the number of columns is identical
print(transformed_df.info())
print(transformed_df_test.info())



transformed_df_test = transformed_df_test[transformed_df.columns]
df = transformed_df
df_test = transformed_df_test


# Changing datatype of 'is_international' column to int
df['is_international'] = df['is_international'].astype(int)
df_test['is_international'] = df_test['is_international'].astype(int)

print(df_test.isna().sum().sum())
# Dropping all null values and reseting index on training and unlabeled
df.dropna(inplace= True)
df.reset_index(drop=True, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 291 entries, onehotencoder__x0_BK to Category
dtypes: object(291)
memory usage: 88.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 291 entries, onehotencoder__x0_BK to Category
dtypes: object(291)
memory usage: 22.2+ MB
None
10837


# Text Preprocessing

1. Selecting text columns to manipulate
2. Making all letters lowercase
3. Removing special characters and numbers
4. Removing all stop-words

In [4]:
# Importing stop-words and naming text columns
stop_words = get_stop_words('en')
list_of_text = ['trans_desc', 'default_brand', 'default_location', 'qrated_brand', 'coalesced_brand']

def remove_stopWords(s):
    """Removes the stopwords aka'common words' from a string
    Parameters: string (input string of any length)
    Returns: string (output string with all english stopwords removed)"""
    s = ' '.join(word for word in s.split() if word not in stop_words)
    return s


def preprocess_text(dataframe):
    """Preprocesses text columns within a datframe
    Parameters: Dataframe with text columns
    Returns: Dataframe with text columns that are all lowercase, void of numbers, special characters and stopwords"""
    for col in list_of_text:
        dataframe[col] = dataframe[col].astype(str)
        dataframe.loc[:,col] = dataframe[col].apply(lambda x : str.lower(x))
        dataframe.loc[:,col] = dataframe[col].apply(lambda x : " ".join(re.findall('[\w]+',x)))
        dataframe.loc[:,col] = dataframe[col].apply(lambda x: remove_stopWords(x))
        dataframe[col] = dataframe[col].str.replace('\d+', '')
    return dataframe

In [5]:
df = preprocess_text(df)
df_test = preprocess_text(df_test)

  dataframe[col] = dataframe[col].str.replace('\d+', '')


# Garbage Collection

Removing large unnecessary objects that are taking up memory space to increase efficiency

In [6]:
# Garbage collection
del transformed_df
del transformed
del transformed_df_test
del transformed_test
gc.collect()
memory_usage()

Unnamed: 0,Size
df,342.92MB
df_test,93.08MB
del_list,10.12KB
transformed_list_x1,3.22KB
_iii,2.75KB
_i3,2.75KB
transformed_list_test_x1,2.47KB
stop_words,1.62KB
_i4,1.18KB
_ii,1.18KB


# Using Universal Sentence Encoder

1. Creating new dataframes of only text columns
2. Loading Universal-Sentence-Encoder-Large from TensorFlow Hub
3. Encoding all of the text for each column

In [7]:
#creating dataframes of text
df_text = df[['trans_desc', 'default_brand', 'default_location', 'qrated_brand', 'coalesced_brand']]
df_test_text = df_test[['trans_desc', 'default_brand', 'default_location', 'qrated_brand', 'coalesced_brand']]

# Downloading tensorflow universal sentence encoder model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [8]:
def encode(text_df):
    for i in list_of_text:
        embeddings = embed(text_df[i])
        #create list from np arrays
        encoded= np.array(embeddings).tolist()
        #add lists as dataframe column
        text_df[i] = encoded
        #check dataframe
        print(f'Done with {i}')

    return text_df

df_text = encode(df_text)
df_test_text = encode(df_test_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df[i] = encoded


Done with trans_desc
Done with default_brand
Done with default_location
Done with qrated_brand
Done with coalesced_brand
Done with trans_desc
Done with default_brand
Done with default_location
Done with qrated_brand
Done with coalesced_brand


# Combining Text Columns with Numeric and Categorical Columns

1. Manipulate Encoded 'Text' from being a column of 512 dimension vectors so that each dimension of each vector is an individual column
2. Combine these newly created columns with the previously created categorical and numeric columns

 **This will increase the dimension of the dataframe to over 2,800**

In [10]:
for i in list_of_text:
    # Looping through columns and making each dimension of the vector into its own column
    column_names = []
    for j in range(512):
        col = f"{i}{j}"
        column_names.append(col)
    
    split_df = pd.DataFrame(df_text[i].tolist(), columns = column_names)
    split_df_test = pd.DataFrame(df_test_text[i].tolist(), columns = column_names)
    
    df_text = pd.concat([df_text, split_df], axis = 1)
    df_test_text = pd.concat([df_test_text, split_df_test], axis = 1)

# Garbage Collection for memory management
del split_df
del split_df_test
gc.collect()

# Concatenating x and y
df_text.drop(labels=list_of_text, axis=1, inplace=True)
df_test_text.drop(labels=list_of_text, axis=1, inplace=True)

df.drop(labels=list_of_text, axis=1, inplace=True)
df_test.drop(labels=list_of_text, axis=1, inplace=True)


# Creating final dataframe for training and labeling
final_df = pd.concat([df_text, df], axis=1)
del df_text
gc.collect()


final_df_test = pd.concat([df_test_text, df_test], axis=1)
del df_test_text
gc.collect()


0

In [11]:
# Testing to make sure the dataframes are consistent
print(final_df.info())
print(final_df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36667 entries, 0 to 36666
Columns: 2846 entries, trans_desc0 to Category
dtypes: float64(2560), int64(1), object(285)
memory usage: 796.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 2846 entries, trans_desc0 to Category
dtypes: float64(2560), int64(1), object(285)
memory usage: 217.1+ MB
None


# Preliminary Model Training and Testing

1. Create x and y variables by dropping the target variable 'Category'
2. Split the labeled data into training and testing with a 80/20 split
3. Loop through 6 base classification algorithms and analyze the accuracy
4. Choose the top two best performing algorithms (Logistic Regression and Multi-Layer Perceptron) to optimize hyperparameters

In [12]:
# Creating x and y
y = final_df['Category'].values
x = final_df.drop(labels=['Category'], axis=1)

# Creating unlabeled x
x_unlabeled_df = final_df_test.drop(labels=['Category'], axis=1)

# Splitting data with 80/20 split and scaling data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_unlabeled = scaler.transform(x_unlabeled_df)

# Manual Garbage Collection
del final_df
gc.collect()

0

In [13]:
names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "Feed Forward Neural Network",
    "Gaussian Naive Bayes",
    "Logistic Regression",
]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(),
    GaussianNB(),
    LogisticRegression(),
]

# Looping through classifiers and reporting accuracy
for name, clf in zip(names, classifiers):
        clf.fit(x_train, y_train)
        score = clf.score(x_test, y_test)
        print(f'Accuracy of {name}: {score}')


Accuracy of Nearest Neighbors: 0.7114807744750478
Accuracy of Decision Tree: 0.5775838560130897
Accuracy of Random Forest: 0.6994818652849741
Accuracy of Feed Forward Neural Network: 0.7688846468502863
Accuracy of Gaussian Naive Bayes: 0.10389964548677393
Accuracy of Logistic Regression: 0.7428415598581947


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Optimization of Logistic Regression Hyperparameters
1. Create map of hyperparameters (specifically solvers and c-values)
2. Utilize GridSearch with Cross Validation to report the best performing model

In [14]:
scaler = StandardScaler()
scale_x = scaler.fit_transform(x)
del x
gc.collect()
# Logistic Regression
model = LogisticRegression()
solvers = ['lbfgs', 'liblinear']
penalty = ['l2']
c_values = [0.1, 0.01]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=2, scoring='accuracy',n_jobs=-1, verbose=10)
grid_result = grid_search.fit(scale_x, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  2.2min remaining:  6.5min
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:  2.2min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:  2.2min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:  9.0min remaining:  5.4min
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:  9.0min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 13.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 13.0min finished


Best: 0.777675 using {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.728393 (0.004329) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.761257 (0.001848) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.761093 (0.001466) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.777675 (0.000061) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}


# Optimization of Neural Network Hyperparameters

1. Create a parameter space with different amount of nodes in hidden layers
2. Utilize GridSearch with Cross Validation to report the best performing model

**Because the best performing model is a 400 neuron single layer MLP, use this to predict unlabeled values**

In [15]:
clf = MLPClassifier()

# Define parameter space
parameter_space = {
    'hidden_layer_sizes': [(500,), (400,), (300,) ,(300,200)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001]
}

# Utilize GridSearch
clf_grid = GridSearchCV(clf, parameter_space, cv=3,n_jobs=-1, verbose=10)
clf_grid.fit(x_train, y_train)
print(clf_grid.score(x_test, y_test))
print(clf_grid.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed: 10.3min remaining: 30.8min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed: 10.4min remaining: 14.5min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed: 11.6min remaining:  8.3min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed: 13.3min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 14.1min finished


0.7949277338423779
{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (400,), 'solver': 'adam'}


# Using Optimized Model to Predict Unlabeled Data

1. Train the model selected (from previous cell) on training data and report testing accuracy
2. Use this model to predict unlabeled data
3. Append predictions to dataframe
4. Output newly labeled dataframe to excel file

In [21]:
# Training Model
clf = MLPClassifier(activation='relu', alpha=.0001, hidden_layer_sizes=(400,), solver='adam')
clf.fit(x_train, y_train)
print(f'Testing Accuracy: {clf.score(x_test, y_test)}')

# Predicting unlabeled data
predictions = clf.predict(x_unlabeled)

Testing Accuracy: 0.7912462503408781


In [22]:
print(predictions)

['Communication Services' 'Communication Services' 'Retail Trade' ...
 'Travel' 'Travel' 'Retail Trade']


In [23]:
predict_data = pd.read_excel('CAC-2022-Test-Data-Set-New.xlsx')

In [24]:
# Append Predictions to Dataframe
predict_data.drop(labels=['Category'], axis=1)
predict_data['Category'] = predictions

predict_data.head()

Unnamed: 0,sor,cdf_seq_no,trans_desc,merchant_cat_code,amt,db_cr_cd,payment_reporting_category,payment_category,is_international,default_brand,default_location,qrated_brand,coalesced_brand,Category
0,HH,T20131230990668080055738,CHECK CRD PURCHASE 11/11 PACKAGE EXPRESS ...,,10.35,D,Card,Check Card,False,PACKAGE EXPRESS,LACEY WA,Package Express,Package Express,Communication Services
1,HH,T201302289918775816,RECUR DEBIT CRD PMT11/11 YP *FRMLY AT&T AD ...,,36.0,D,Card,Debit Card,False,YP *FRMLY AT&T AD,111-111-1111 CA,At And T,At And T,Communication Services
2,HH,T20130726991361190218055,CHECK CRD PURCHASE 11/11 NORMAN G JENSEN IN ...,,27.0,D,Card,Check Card,False,NORMAN G JENSEN IN,111-1111111 MN,Norman G Jensen,Norman G Jensen,Retail Trade
3,HH,T201208319924922772,CHECK CRD PUR RTRN 11/11 TWILIO ...,,20.0,C,Card,Check Card,False,TWILIO,SAN FARANSICO CA,Twilio,Twilio,Communication Services
4,HH,T20131230990638080027066,CHECK CRD PURCHASE 11/11 AT&T D11K 1111 ...,,325.78,D,Card,Check Card,False,AT&T D11K 1111,FORT WORTH TX,At And T,At And T,Communication Services


In [25]:
# Export Dataframe
predict_data.to_excel('CAC-2022-Predicted-Data-Set.xlsx')