In [1]:
# %load assign_labels.py
import pandas as pd
import random

# Seed Python's RNG so the generated labels are the same on every run
random.seed(100)


# Read the unlabeled train and test splits
train=pd.read_csv("us-train-dataset.csv")
test=pd.read_csv("us-test-dataset.csv")

# Label every row 0 or 1 by rounding one uniform draw from [0,1] per row.
# All train rows are labeled before any test row, which fixes the draw order.
train_labels=[]
for _ in range(len(train)):
    train_labels.append(round(random.uniform(0,1)))
train['label']=train_labels

test_labels=[]
for _ in range(len(test)):
    test_labels.append(round(random.uniform(0,1)))
test['label']=test_labels

# Persist the labeled frames (index column omitted)
train.to_csv("train.csv",index=False)
test.to_csv("test.csv",index=False)

In [2]:
# %load model.py
import random
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import spacy
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing  import FunctionTransformer
from joblib import dump, load
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

nlp=spacy.load("en_core_web_sm")

def transform(X):
    """Flatten a frame of per-cell vectors into one 2-D feature array.

    The 'Building_Number' column becomes a single leading column. Every
    object-dtype column of X is stacked row-wise into a 2-D block, and the
    blocks are joined side by side after the leading column.

    Returns the resulting numpy array (one row per row of X).
    """
    # Column vector of building numbers, shape (n_rows, 1)
    number_column=np.asarray(X['Building_Number']).reshape(-1,1)

    # One 2-D block per object-dtype column, in column order
    vector_blocks=[]
    for name in X:
        if X[name].dtype==object:
            vector_blocks.append(np.vstack(X[name]))
    stacked_vectors=np.concatenate(vector_blocks,axis=1)

    return np.hstack([number_column,stacked_vectors])

def Word2Vec(X):
    """Replace every non-float cell of X with its spaCy document vector.

    X is wrapped in a ``pandas.DataFrame`` and walked row by row. Each cell
    that is not a float is passed to the module-level ``nlp`` pipeline and
    replaced by the resulting ``.vector``. Float cells are left untouched; the
    column holding the first float cell encountered is renamed to
    ``'Building_Number'``.

    Returns the converted DataFrame.
    """
    X=pd.DataFrame(X)
    unnamed=True
    # enumerate() supplies the row *position* that .iloc expects; the label
    # yielded by iterrows() only equals the position for a default RangeIndex.
    for position,(_,row) in enumerate(X.iterrows()):
        # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
        for col,data in row.items():
            # isinstance also recognises numpy.float64 (a subclass of float),
            # so numeric cells are never handed to the text pipeline.
            if not isinstance(data,float):
                row[col]=nlp(data).vector
            elif unnamed:
                # Only the first float cell seen triggers the rename.
                X.rename(columns={col:'Building_Number'},inplace=True)
                unnamed=False
        X.iloc[position]=row
    return X

def train_model(X_train,y_train):
    """Assemble the classification pipeline and fit it on the training data.

    Steps, in order: column preprocessing (drop two columns, constant-fill
    missing values, pass everything else through), string-to-vector
    conversion via ``Word2Vec``, reshaping via ``transform``, and an
    AdaBoost classifier.

    Returns the fitted pipeline.
    """
    dropped_columns=['Country','Address']
    string_columns=['Building_Name', 'City', 'Recipient', 'Street_Name','Zip_Code', 'State']
    numeric_columns=['Building_Number']

    # Missing strings become a single space, missing numbers become 0
    string_imputer=SimpleImputer(strategy="constant",fill_value=" ")
    numeric_imputer=SimpleImputer(strategy="constant",fill_value=0)

    preprocessing=ColumnTransformer(
        [
            ('Drop','drop',dropped_columns),
            ("Filling null values for string columns",string_imputer,string_columns),
            ("Filling null values for numeric columns",numeric_imputer,numeric_columns),
        ],
        remainder='passthrough',
    )
    classifier=AdaBoostClassifier(random_state=10,n_estimators=1300)

    steps=[
        ('Data Preprocessing',preprocessing),
        ('String to vector',FunctionTransformer(Word2Vec)),
        ('Changing input shape',FunctionTransformer(transform)),
        ('Classifier',classifier),
    ]
    return Pipeline(steps=steps).fit(X_train,y_train)

def save_model(model):
    """Serialize ``model`` with joblib to the file 'AdaBoost.joblib'."""
    model_path='AdaBoost.joblib'
    dump(model,model_path)

def load_model():
    """Read back and return the object stored in 'AdaBoost.joblib'.

    NOTE(review): joblib files are pickle-based, so only load files that
    were produced by a trusted source.
    """
    model_path='AdaBoost.joblib'
    restored=load(model_path)
    return restored

def main():
    """Train or load the classifier, then report on its predictions.

    Reads ``train.csv`` and ``test.csv``. The training file is shuffled
    (``random_state=10``); the first 100 shuffled rows are used for training
    and the remaining rows for validation. The model is either fitted and
    saved, or loaded from disk, and is then used to predict on the train,
    validation and test sets.

    Behaviour is controlled by module-level flags that must be defined
    before calling: ``train``, ``displayFileCount``, ``displayTPTN``,
    ``predictForRandomRecord``, ``displayRealFakeAddresses`` and
    ``displayClassificationReport``.

    Returns:
        The array of predictions for the rows of ``test.csv``.

    Raises:
        Exception: if ``train`` is false and the saved model cannot be loaded.
    """

    # Loading data
    train_data=pd.read_csv("train.csv")
    test_data=pd.read_csv("test.csv")

    if displayFileCount:
        # Displaying number of records
        print()
        print("Total number of records in train.csv -",len(train_data))
        print("Total number of records in test.csv -",len(test_data))
        print("Total records -",len(train_data)+len(test_data))

    # Splitting data into train and validation
    train_data=train_data.sample(frac=1,random_state=10)

    # First 100 shuffled rows train the model; the rest are held out
    val_data=train_data[100:]
    train_data=train_data[:100]

    # Separating inputs and labels
    X_train=train_data.drop(columns=['label'])
    y_train=train_data['label']

    X_val=val_data.drop(columns=['label'])
    y_val=val_data['label']

    X_test=test_data.drop(columns=['label'])
    y_test=test_data['label']

    # Getting model
    if train:
        print()
        print("Training model")
        ABC=train_model(X_train,y_train)
        save_model(ABC)
    else:
        print()
        try:
            ABC=load_model()
            print("Loaded model")
        except Exception as error:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate, and chain the original error so the cause stays visible.
            raise Exception("No trained model available. Change the option 'train' to True.") from error

    # Getting predictions from model
    predictions_train=ABC.predict(X_train)
    predictions_val=ABC.predict(X_val)
    predictions_test=ABC.predict(X_test)

    if displayTPTN:
        print()
        # Computing total number of real addresses which are True
        print("Total number of real addresses which are True from train set -",sum((y_train==True)&(predictions_train==y_train)))
        print("Total number of real addresses which are True from val set -",sum((y_val==True)&(predictions_val==y_val)))
        print("Total number of real addresses which are True from test set -",sum((y_test==True)&(predictions_test==y_test)))

        # Computing total number of fake addresses which are False
        print("Total number of fake addresses which are False from train set -",sum((y_train==False)&(predictions_train==y_train)))
        print("Total number of fake addresses which are False from val set -",sum((y_val==False)&(predictions_val==y_val)))
        print("Total number of fake addresses which are False from test set -",sum((y_test==False)&(predictions_test==y_test)))

    if predictForRandomRecord:
        print()
        print("Prediction for a randomly selected record")
        # Selecting a random record and giving its prediction
        idx=random.randint(0,len(test_data)-1)
        record=test_data.iloc[idx]
        # `record` is a Series, so the label is removed by index label;
        # Series.drop(columns=...) would leave the Series unchanged.
        x=record.drop(labels=['label'])
        y=record['label']
        print(pd.DataFrame({
            'Address':x['Address'],
            'Prediction':predictions_test[idx],
            'Actual':y
        },index=[0]))

    if displayRealFakeAddresses:
        print()
        # Displaying identified real and fake addresses
        real=pd.concat([train_data[predictions_train==True].Address,val_data[predictions_val==True].Address,test_data[predictions_test==True].Address])
        fake=pd.concat([train_data[predictions_train==False].Address,val_data[predictions_val==False].Address,test_data[predictions_test==False].Address])
        print("Real Addresses")
        print(real)
        print()
        print("Fake addresses")
        print(fake)

    if displayClassificationReport:
        # The report pools the train, validation and test sets together
        preds=np.hstack([predictions_train,predictions_val,predictions_test])
        actuals=np.hstack([y_train,y_val,y_test])
        print()
        print("Confusion Matrix")
        # Pin the label order so the matrix is always 2x2, even if one class
        # happens to be absent from both actuals and predictions.
        cm=confusion_matrix(actuals,preds,labels=[0,1])
        # Displaying confusion matrix
        print(pd.DataFrame(cm,columns=['Predicted: NO','Predicted: YES'],index=['Actual: NO','Actual: YES']))
        print()
        TN=cm[0][0]
        FN=cm[1][0]
        FP=cm[0][1]
        TP=cm[1][1]
        # Displaying TP, FP, FN, TN
        print("True Positives(TP) -",TP)
        print("False Positives(FP) -",FP)
        print("False Negatives(FN) -",FN)
        print("True Negatives(TN) -",TN)
        print()
        # Displaying accuracy
        print("Accuracy Score -",accuracy_score(actuals,preds))
    return predictions_test

if __name__=="__main__":
    # ------------------------------------------------------------------
    # Run options: main() reads these as module-level globals.
    # Flip them to match the purpose of the run.
    # ------------------------------------------------------------------
    train=True                        # True: fit and save a model; False: load the saved one
    displayFileCount=True             # print record counts of train.csv / test.csv
    displayTPTN=True                  # print correctly classified real / fake counts per split
    predictForRandomRecord=True       # print the prediction for one random test record
    displayRealFakeAddresses=True     # print addresses predicted real and predicted fake
    displayClassificationReport=True  # print confusion matrix, TP/FP/FN/TN and accuracy

    # Legend for the 0/1 outputs, preceded by an empty line
    print("","The model outputs","True (1) - Real Address","False (0) - Fake Address",sep="\n")
    x = main()


The model outputs
True (1) - Real Address
False (0) - Fake Address

Total number of records in train.csv - 120
Total number of records in test.csv - 24
Total records - 144

Training model

Total number of real addresses which are True from train set - 54
Total number of real addresses which are True from val set - 4
Total number of real addresses which are True from test set - 5
Total number of fake addresses which are False from train set - 46
Total number of fake addresses which are False from val set - 5
Total number of fake addresses which are False from test set - 9

Prediction for a randomly selected record
                                             Address  Prediction  Actual
0  C/O PAT S SPAGHETTI, LEXINGTON, TN 38351, Unit...           0       0

Real Addresses
45    Four Saints Recreation Center, 527 Mikes St,NE...
79    506 S Murdock Hwy # 301,DEERFIELD BEACH,FL,334...
56         6428 E Sidney Blvd,SARASOTA,FL,34243-2249,US
99    2515 Hancock Point Ct # J,WINTER PARK,FL,3

In [3]:
# Test-set predictions returned by main() in the cell above
x

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0], dtype=int64)

In [4]:
# Number of test-set predictions returned by main()
len(x)

24

In [7]:
# Prediction for the test record at position 8
x[8]

0

In [8]:
# Labeled test DataFrame created in the first cell (us-test-dataset.csv plus the 'label' column)
test

Unnamed: 0,Address,Building_Name,Building_Number,City,Recipient,Street_Name,Zip_Code,State,Country,label
0,"223 NW STATE STRE, DOVER, DE 19001, United States",,223.0,DOVER,,NW STATE STRE,19001,DE,United States,0
1,"216 LIBERTY MT ROAD, CHATTANOOGA, TN 37405, Un...",,216.0,CHATTANOOGA,,LIBERTY MT ROAD,37405,TN,United States,1
2,"HIGHWAY 11-E, RUSSELLVILLE, TN 00000, United S...",,,RUSSELLVILLE,,HIGHWAY 11-E,00000,TN,United States,1
3,"7913 HWY 51 N, MILLINGTON, TN 38053, United St...",,7913.0,MILLINGTON,,HWY 51 N,38053,TN,United States,1
4,"PO BOX 481, BALDWIN, GA, 30511-0484,, United S...",,,BALDWIN,,,30511-0484,GA,United States,0
5,"PO BOX 457, NORTONVILLE, KS 660600452,United S...",,,NORTONVILLE,,,660600452,KS,United States,0
6,"GULLY RD PO BOX 62, SO POMFRET, VT, 05067,, Un...",,,,,GULLY RD,05067,VT,United States,1
7,"721 JAMES MCSHANE BLVD PO BOX 129, SALISBURY, ...",,721.0,SALISBURY,,JAMES MCSHANE BLVD,28145-1127,NC,United States,0
8,"C/O ABC CORP SYSTEM, KNOXVILLE, TN 37902, Unit...",,,KNOXVILLE,C/O ABC CORP SYSTEM,,37902,TN,United States,1
9,"C/O J MCKEEN RTE, COLUMBIA, TN 00000, United S...",,,COLUMBIA,C/O J MCKEEN RTE,,00000,TN,United States,0
