# Data preparation and data cleaning

In [18]:
import numpy as np
import pandas as pd

# Read the new train and test datasets from their CSV files.
# Both files are read with the same options (low_memory=False).
train, test = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("train2.csv", "test2.csv")
)

In [19]:
# An earlier check of the unique-value counts of the object columns showed
# that "PA_PQ_TAG" has the second most unique object values in the datasets.
# List its distinct non-null values (sorted) for the train and test datasets
# and show the first ten and the last ten of each.

PA_PQ_TAG_train = sorted(set(train["PA_PQ_TAG"].dropna()))
# sep=" \n " puts each label and each list on its own line.
print(
    "Train dataset",
    "PA_PQ_TAG_train[:10]:",
    PA_PQ_TAG_train[:10],
    "PA_PQ_TAG_train[-10:]:",
    PA_PQ_TAG_train[-10:],
    sep=" \n ",
)
print()
PA_PQ_TAG_test = sorted(set(test["PA_PQ_TAG"].dropna()))
print(
    "Test dataset",
    "PA_PQ_TAG_test[:10]:",
    PA_PQ_TAG_test[:10],
    "PA_PQ_TAG_test[-10:]:",
    PA_PQ_TAG_test[-10:],
    sep=" \n ",
)
print()

Train dataset 
 PA_PQ_TAG_train[:10]: 
 ['Green Channel', 'PQ with Account', 'PQ with Coapp', 'PQ with Dedupe', 'PQ with KYC', 'PQ with Neg dedupe', 'PQ with Ref dedupe', 'PQ_WITH_INCOME_1000K', 'PQ_WITH_INCOME_100K', 'PQ_WITH_INCOME_103K'] 
 PA_PQ_TAG_train[-10:]: 
 ['PQ_WITH_INCOME_91K', 'PQ_WITH_INCOME_92K', 'PQ_WITH_INCOME_93K', 'PQ_WITH_INCOME_94K', 'PQ_WITH_INCOME_95K', 'PQ_WITH_INCOME_96K', 'PQ_WITH_INCOME_97K', 'PQ_WITH_INCOME_98K', 'PQ_WITH_INCOME_99K', 'STP']

Test dataset 
 PA_PQ_TAG_test[:10]: 
 ['Green Channel', 'PQ with Account', 'PQ with Coapp', 'PQ with Dedupe', 'PQ with KYC', 'PQ with Neg dedupe', 'PQ with Ref dedupe', 'PQ_WITH_INCOME_1000K', 'PQ_WITH_INCOME_100K', 'PQ_WITH_INCOME_104K'] 
 PA_PQ_TAG_test[-10:]: 
 ['PQ_WITH_INCOME_88K', 'PQ_WITH_INCOME_89K', 'PQ_WITH_INCOME_90K', 'PQ_WITH_INCOME_91K', 'PQ_WITH_INCOME_92K', 'PQ_WITH_INCOME_93K', 'PQ_WITH_INCOME_95K', 'PQ_WITH_INCOME_96K', 'PQ_WITH_INCOME_99K', 'STP']



In both the train and test datasets, the variable `PA_PQ_TAG` has many values of the form `PQ_WITH_INCOME_\d+K`. The remaining values, which do not match this pattern, are the same in both datasets. Next, I replace every `PQ_WITH_INCOME_\d+K` value with its numeric part (the `\d+`), and the other values with the codes -999.0, -998.0, -997.0, etc.

In [20]:
# Replace each "PQ_WITH_INCOME_\d+K" value with its numeric part ("\d+") as a float.
# Replace the other values with the codes -999.0, -998.0, -997.0, etc.
# Write the new train and test datasets to CSV files.

def replace_patag(dataframe):
    """Encode the "PA_PQ_TAG" column of ``dataframe`` as floats, in place.

    Every value is first converted with ``str()`` (so a missing value read as
    NaN becomes the string "nan") and is then encoded as follows:

    * A value containing ``PQ_WITH_INCOME_<digits>K`` becomes
      ``float(<digits>)``, e.g. "PQ_WITH_INCOME_100K" -> 100.0.
    * The remaining distinct values are sorted with Python's default string
      ordering, and the j-th one (0-based) becomes ``-999.0 + j``.

    Rows are processed by position, so the frame may carry any index (the
    index itself is left untouched).

    Notes
    -----
    * The codes for non-income values are derived from the frame passed in,
      so two frames receive matching codes only when they contain the same
      set of non-income values.
    * The function is not idempotent: on a second call the already-encoded
      numbers no longer match the income pattern and are re-encoded as
      categories.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a "PA_PQ_TAG" column. It is modified in place.

    Returns
    -------
    None
    """
    import re

    # The capture group holds the digits of the income token itself.
    income_pattern = re.compile(r"PQ_WITH_INCOME_(\d+)K")

    # Pull the column out once as plain strings; this avoids per-row label
    # lookups, which only work when the index is exactly 0..n-1.
    tags = [str(value) for value in dataframe["PA_PQ_TAG"].tolist()]
    income_matches = [income_pattern.search(tag) for tag in tags]

    # Distinct non-income values, sorted, each mapped to -999.0, -998.0, ...
    non_income_K = sorted(
        {tag for tag, match in zip(tags, income_matches) if not match}
    )
    non_income_codes = {tag: -999.0 + j for j, tag in enumerate(non_income_K)}

    PA_PQ_TAG = [
        float(match.group(1)) if match else non_income_codes[tag]
        for tag, match in zip(tags, income_matches)
    ]

    # Replace the whole column so it ends up with a numeric dtype.
    dataframe["PA_PQ_TAG"] = PA_PQ_TAG
    
# Encode PA_PQ_TAG in each dataset (train first, then test) and write the
# result to its CSV file without the index column.
for dataset, csv_path in ((train, "train3.csv"), (test, "test3.csv")):
    replace_patag(dataset)
    dataset.to_csv(csv_path, index=None)

Now, we can stop here and take a break.