# Set up the TensorFlow Universal Sentence Encoder (USE) model and download the Kaggle W2 data

In [8]:
import os
import pandas as pd
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Folder containing the extracted W2 PDF files; placeholder value that must be
# replaced with a real path before the notebook is run.
kaggle_w2_folder = 'enter-path-here'
# TF-Hub URL of the multilingual Universal Sentence Encoder (version 3) that
# get_semantics() loads.
use_model_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'

In [16]:
def download_kaggle_dataset():
    """Download the zipped fake-W2 dataset file from Kaggle.

    Writes the Kaggle username and key into the process environment,
    authenticates a ``KaggleApi`` client and requests the file
    ``W2_Multi_Clean_DataSet_02.ZIP`` of the dataset
    ``mcvishnu1/fake-w2-us-tax-form-dataset`` into ``datasets/fake_w2``.
    """
    # Placeholder credentials: they must be replaced before running, and any
    # values already present in the environment are overwritten.
    os.environ.update({
        'KAGGLE_USERNAME': 'username-here',
        'KAGGLE_KEY': 'insert-key-here',
    })

    # NOTE(review): the import is kept after the environment is populated,
    # presumably because the kaggle package reads credentials on import - confirm.
    from kaggle.api.kaggle_api_extended import KaggleApi

    client = KaggleApi()
    client.authenticate()

    client.dataset_download_file(
        'mcvishnu1/fake-w2-us-tax-form-dataset',
        'W2_Multi_Clean_DataSet_02.ZIP',
        'datasets/fake_w2')

def extract_kaggle_dataset():
    """Unpack the downloaded W2 archive into the working directory.

    Changes the process working directory to the configured target folder,
    prints the archive listing and extracts every member there.
    """
    from zipfile import ZipFile

    archive_path = "C:/datasets/fake_w2/W2_Multi_Clean_DataSet_02.ZIP"

    # extractall() is called without a target, so it unpacks into the
    # current working directory; switch to the destination folder first.
    os.chdir('enter-path-here')

    with ZipFile(archive_path, 'r') as archive:
        archive.printdir()
        print('Extracting all the files now...')
        archive.extractall()
        print('Done!')

def get_text(path):
    """Return the text extracted from the first page of the PDF at ``path``.

    Only page index 0 is read; any further pages are ignored.
    """
    from PyPDF2 import PdfReader

    first_page = PdfReader(path).pages[0]
    return first_page.extract_text()

def get_pdf_files():
    """List the entry names found in ``kaggle_w2_folder``.

    The names are returned unfiltered, exactly as ``os.listdir`` reports them.
    """
    folder_entries = os.listdir(kaggle_w2_folder)
    return folder_entries

def extract_save_text_from_pdfs():
    """Extract the first-page text of every PDF in ``kaggle_w2_folder`` and save it.

    Every entry returned by ``get_pdf_files()`` whose name ends with ``.pdf``
    is read with ``get_text``; the results are collected into a one-column
    DataFrame (``pdf_as_raw_text``), written to CSV without the index, and
    returned.

    Returns:
        pandas.DataFrame: one row per PDF, column ``pdf_as_raw_text``.
    """
    # Build each path with os.path.join so a folder configured without a
    # trailing separator still yields "<folder>/<file>" rather than
    # "<folder><file>"; a folder that already ends with a separator gives the
    # same path as plain concatenation did.
    pdfs_as_text_list = [
        get_text(os.path.join(kaggle_w2_folder, pdf_name))
        for pdf_name in get_pdf_files()
        if pdf_name.endswith(".pdf")
    ]

    df = pd.DataFrame(
        pdfs_as_text_list,
        columns=['pdf_as_raw_text'])

    # Placeholder output path: replace with the real CSV destination.
    df.to_csv(
        'enter-path-here',
        index=False)

    return df

# One-off data preparation: download the archive, unpack it, then convert the
# PDFs to text. Each step relies on the files produced by the previous one,
# so the order of the calls matters.
print('downloading kaggle dataset as zip')
download_kaggle_dataset()

print('extracting data from zipped file')
extract_kaggle_dataset()

print('extracting text from pdfs')
extract_save_text_from_pdfs()

# The CSV has already been written by extract_save_text_from_pdfs() at this
# point; the messages below only explain why the cell can be skipped later.
print('saving csv to disk, to skip re-running the downloading/unzipping/extracting of PDFs to text everytime the code runs')

print('skip this cell after running it once')

downloading kaggle dataset as zip
extracting data from zipped file
extracting text from pdfs
saving csv to disk, to skip re-running the downloading/unzipping/extracting of PDFs to text everytime the code runs
skip this cell after running it once


# TensorFlow Semantics using Universal Sentence Encoder & Semantics EDA

In [10]:
import sweetviz as sv
import pandas as pd

def get_semantics(text_list):
    """Run ``text_list`` through the TF-Hub model at ``use_model_url``.

    The model is loaded with ``hub.load`` on every call and then invoked on
    ``text_list``; the model output is returned unchanged.
    """
    encoder = hub.load(use_model_url)
    embeddings = encoder(text_list)
    return embeddings

# Load the extracted PDF text. pd.read_csv returns a DataFrame, so despite its
# name `text_list` is a DataFrame rather than a Python list.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
print('reading csv')
text_list = pd.read_csv('C:/Users/SUPREME/Documents/SurePrep/pdfs_as_raw_text.csv')

# Encode the text with the TF-Hub model and cast the result to a float
# DataFrame so sweetviz can profile it.
print('getting semantics')
semantics = get_semantics(text_list)
semantic_features = pd.DataFrame(semantics).astype(float)
# Build the sweetviz report with pairwise analysis enabled and write it to
# report.html without opening a browser window.
analyze_report = sv.analyze(semantic_features,pairwise_analysis='on')
analyze_report.show_html('report.html', open_browser=False)

reading csv
getting semantics


  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad

Report report.html was generated.


# Semantics Classification Model

# Methods

In [11]:
def get_semantics(text_list):
    """Run ``text_list`` through the TF-Hub model at ``use_model_url``.

    The model is loaded with ``hub.load`` on every call and then invoked on
    ``text_list``; the model output is returned unchanged.
    """
    encoder = hub.load(use_model_url)
    embeddings = encoder(text_list)
    return embeddings
    
def get_cluster_ids(semantics, n_clusters=4, random_state=1):
    """Cluster the embeddings with KMeans and return one cluster id per row.

    Args:
        semantics: array-like of embeddings, one row per document.
        n_clusters: number of clusters to form (defaults to 4, the value the
            notebook previously hard-coded).
        random_state: seed for centroid initialisation. It defaults to 1, the
            seed used for the train/test split and the classifier in this
            notebook, so that cluster ids are reproducible between runs;
            pass ``None`` to get unseeded behaviour.

    Returns:
        The fitted model's ``labels_`` array.
    """
    # Without a seed KMeans may assign different ids to the same groups on
    # each run, which changes the saved labels and the downstream report.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(semantics)
    return kmeans.labels_

def merge(cluster_ids,list):
    """Pair each entry of ``list.values`` with its cluster id.

    ``list`` must expose ``.values`` (e.g. a pandas Series or DataFrame) and
    have the same length as ``cluster_ids``; otherwise an AssertionError is
    raised. Returns a Python list of ``(text, cluster_id)`` tuples in input
    order.
    """
    assert len(cluster_ids) == len(list)

    pairs = []
    for label, text in zip(cluster_ids, list.values):
        pairs.append((text, label))
    return pairs

def save(clustered_semantics):
    """Write the ``(text, cluster_id)`` rows to ``clustered_semantics.csv``.

    The rows are wrapped in a DataFrame with the columns ``text`` and
    ``cluster_id`` and saved without the index.
    """
    # NOTE(review): absolute, machine-specific output path.
    frame = pd.DataFrame(clustered_semantics, columns=['text','cluster_id'])
    frame.to_csv(
        'C:/Users/SUPREME/Documents/SurePrep/clustered_semantics.csv',
        index=False)

def shuffle_get_features_labels(semantic_features_with_cluster_ids):
    """Shuffle the (features, cluster id) pairs and split them into X and y.

    Returns a tuple ``(features, labels)``: ``features`` is a list with one
    plain Python list per row, ``labels`` is the ``cluster_id`` column's
    ``.values`` array, both in the shuffled order.
    """
    frame = pd.DataFrame(
        semantic_features_with_cluster_ids,
        columns=['semantic_features','cluster_id'])
    # shuffle() is called without a seed, so the row order differs per run.
    shuffled = shuffle(frame)

    features = [list(row) for row in shuffled.semantic_features.values]
    labels = shuffled.cluster_id.values
    return features, labels

def merge_semantic_features_with_cluster_ids(semantic_features,cluster_ids):
    """Pair every row of ``semantic_features.values`` with its cluster id.

    Both inputs must have the same length, otherwise an AssertionError is
    raised. Returns a list of ``(row, cluster_id)`` tuples in input order.
    """
    assert len(semantic_features) == len(cluster_ids)

    rows = semantic_features.values
    return list(zip(rows, cluster_ids))

def print_model_results(y_test,y_predictions):
    """Print the confusion matrix, micro-averaged F1 and classification report.

    Args:
        y_test: true labels of the held-out samples.
        y_predictions: labels predicted for the same samples.
    """
    print('confusion matrix')
    matrix = confusion_matrix(y_test, y_predictions)
    print(matrix)

    print('f1-score')
    micro_f1 = f1_score(y_test, y_predictions, average='micro')
    print(micro_f1)

    print("Classification Report :")
    # NOTE(review): these display names are hard-coded for four classes;
    # confirm that they correspond to the label ids in the intended order.
    class_names = ['adp1','adp2','irs1','irs2']
    report = classification_report(
        y_test,
        y_predictions,
        target_names=class_names)
    print(report)

# Model Training/Predictions

In [12]:
# Load the extracted PDF text. pd.read_csv returns a DataFrame, so despite its
# name `text_list` is a DataFrame rather than a Python list.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
print('reading csv')
text_list = pd.read_csv('C:/Users/SUPREME/Documents/SurePrep/pdfs_as_raw_text.csv')

# Encode the text; `semantics` is the raw model output and
# `semantic_features` the same values as a float DataFrame.
print('getting semantics')
semantics = get_semantics(text_list)
semantic_features = pd.DataFrame(semantics).astype(float)

# Cluster the raw embeddings and attach a cluster id to every feature row.
print('clustering semantics')
cluster_ids = get_cluster_ids(semantics)
semantic_features_with_cluster_ids = merge_semantic_features_with_cluster_ids(
    semantic_features,
    cluster_ids)

# Persist the original text next to its cluster id for later inspection.
print('saving text with cluster Ids')
text_with_cluster_ids = merge(cluster_ids,text_list)
save(text_with_cluster_ids)

# X holds one feature list per document, y the matching cluster id.
# The labels are the cluster assignments computed from the same embeddings,
# so the classifier below is trained to reproduce the clustering.
print('shuffling df')
X,y = shuffle_get_features_labels(semantic_features_with_cluster_ids)

# Hold out 20% of the rows for evaluation; the split is seeded.
print('splitting data for train & test')
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=1)

# Random forest with trees limited to depth 5 and a fixed seed.
classifier = RandomForestClassifier(
    max_depth=5, 
    random_state=1)

classifier.fit(X_train,y_train)

# Predictions on the held-out rows; consumed by print_model_results().
y_predictions = classifier.predict(X_test)

reading csv
getting semantics
clustering semantics
saving text with cluster Ids
shuffling df
splitting data for train & test


# Classification Model Results

In [15]:
# Report the confusion matrix, micro F1 and per-class metrics on the held-out
# rows; requires y_test and y_predictions from the training cell.
print_model_results(y_test,y_predictions)

confusion matrix
[[52  0  0  0]
 [ 0 49  0  0]
 [ 0  0 52  0]
 [ 0  0  0 47]]
f1-score
1.0
Classification Report :
              precision    recall  f1-score   support

        adp1       1.00      1.00      1.00        52
        adp2       1.00      1.00      1.00        49
        irs1       1.00      1.00      1.00        52
        irs2       1.00      1.00      1.00        47

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

