In [1]:
import glob

In [2]:
import datetime

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
# Names of the 62 columns used for the data frame. 
main_names=['Code', 'Value'] 
dsn_names =["Type","Version","Month","Year","APEN","APET"]
indiv_names=["Id","Children"]
contract_names=["Status","Libelle Emploi","Nature","Numero","Unite Mesure","Modalite Temps","Emplois Multiples",
              "Employeurs Multiples"]
remuneration_names=["Gross Salary","Gross Salary after Extras"]
activite_names=["Worked","Absent"]
extras_names=["Extra Hours"]
primes_names=["Primes Count","Primes Sum"]
autres_el_types_names=["Autres Types Count","Sum Types Autres","Min Types Autres","Max Types Autres"]
autres_el_values_names=["Autres Values Count","Sum Values Autres","Min Values Autres","Max Values Autres"]
base_types_names=["Base Types Count","Sum Types Base","Min Types Base","Max Types Base"]
base_values_names= ["Base Values Count","Sum Values Base","Mean Values Base","Median Values Base"]
composant_types_names=["Composant Types Count","Sum Types Composant","Min Types Composants","Max Types Composants"]
composant_values_names=["Composant Values Count","Sum Values Composants","Mean Values Composants","Median Values Composants"]
cotisation_types_names=["Cotisations Type Count","Sum Types Cotisations Indiv","Min Types Cotisations Indiv","Max Types Cotisation Indiv"]
cotisation_assiette_names=["Cotisation Assiette Count","Sum Cotisation Assiette","Mean Cotisation Assiette","Median Cotisation Assiette"]
cotisation_values_names=["Cotisation Value Count","Sum Cotisation Value","Mean Cotisation Value","Median Cotisation Value"]
versement_names=["Remuneration Nette Fiscale","Montant Net Verse"]


In [6]:
# Codes used for the function read_codes
dsn_codes=["S20.G00.05.002","S20.G00.05.004","S20.G00.05.005","S21.G00.06.003","S21.G00.11.002"]
indiv_codes=["S21.G00.30.001","S21.G00.30.021"]
contract_codes=["S21.G00.40.002","S21.G00.40.006","S21.G00.40.007","S21.G00.40.009", "S21.G00.40.011", 
                "S21.G00.40.014","S21.G00.40.036","S21.G00.40.037"]
versement_codes=["S21.G00.50.002","S21.G00.50.004"]

In [7]:
# Types used for the function read_types
remuneration_types=["010","001"]
activite_types=["01","02"]

In [8]:
# Function which reads the codes and returns the values respectively. Also transforms the date and time data.  
def read_codes(codes,data,colnames):
    values=[]
    for code in codes:
        data_value=data[data["Code"]==code]["Value"]
        if data_value.values.any():
            if code !="S20.G00.05.005":
                values.append(data_value.iloc[0])
            else: 
                date = pd.to_datetime(data_value,format="%d%m%Y")
                values.append(date.dt.month.iloc[0])
                values.append(date.dt.year.iloc[0])
        else: 
            values.append(0)
    return pd.DataFrame([values],columns=colnames)

In [9]:
# Function used to split the data by individual, contract etc. 
def split(data,code,):
    b= data['Code'].eq(code).cumsum()
    a = data.groupby(b).cumcount()
    return data.set_index([a, b]).unstack().drop(0, axis=1, level=1).sort_index(axis=1, level=1)

In [10]:
# Function used to read specific types and respective values. Note that types are given as a value of a code.  
def read_type(type_data,value_data,types,colnames):
    frame = pd.concat([type_data,value_data],axis=1)
    frame.columns=["Type","Value"]
    frame=frame[frame["Type"].isin(types)].drop_duplicates("Type",keep="first")
    fin_frame = pd.DataFrame([np.zeros(len(types))],columns=types)
    columns = frame["Type"].values.tolist()
    fin_frame.loc[0, columns] = frame["Value"].values.astype(float)
    fin_frame.columns=colnames
    return fin_frame

In [11]:
# Function used to calculate statistics of types and values in order to aggregate data such as contributions.
def stats(data,actions,colnames):
    results=[]
    if data.any():
        for index, action in enumerate(actions):
            action = 'data.'+"astype(float)."+ action +"()"
            results.append(eval(action))
    else: 
        results = np.zeros(len(actions))
    return pd.DataFrame([results],columns=colnames)

In [12]:
# Building the current data frame from the dsn files.

all_files = glob.glob("*.dsn")
final_df = pd.DataFrame([])

# Iterate over each DSN.
for filename in all_files:
    df = pd.read_csv(filename, index_col=False, header=None, names = main_names, quotechar="'",encoding='latin-1')
    
#   Check DSN is of right type (DSN mensuelle).
    dsn_nature=df[df["Code"]=="S20.G00.05.001"]["Value"]  
    if dsn_nature.eq("01").bool():
        
#       Add DSN info.
        dsn=read_codes(dsn_codes,df[df["Code"].isin(dsn_codes)],dsn_names)
    
#       Split per individual.
        indivs_df = split(df,"S21.G00.30.001")
    
#       Iterate over each individual.
        indexes = range(0, indivs_df.shape[1], 2)
        for index1 in indexes:
            indiv_df= pd.DataFrame(indivs_df.iloc[:,index1:index1+2].values,columns=main_names)
            
#           Add individual info.
            indiv=read_codes(indiv_codes,indiv_df[indiv_df["Code"].isin(indiv_codes)],indiv_names)
    
#           Split contract info.
            contracts_df = split(indiv_df,'S21.G00.40.001')
    
#           Split remuneration info.
            remunerations_df=split(indiv_df,'S21.G00.50.001')
    
#           Iterate over each contract.
            indexes = range(0, contracts_df.shape[1], 2)
            for index2 in indexes:
                contract_df = pd.DataFrame(contracts_df.iloc[:,index2:index2+2].values,columns=main_names)
                
#               Truncate contact info at the 50 mark in order to get 40 to 50 info only.
                if contract_df.index[contract_df["Code"]=="S21.G00.50.001"].any():
                    contract_df=contract_df.iloc[0:contract_df.index[contract_df["Code"]=="S21.G00.50.001"].tolist()[0],:]
                else:
                    contract_df=contract_df.dropna()
                    
#               Remuneration data frame(50 onward).
                remuneration_df = pd.DataFrame(remunerations_df.iloc[:,index2:index2+2].values,columns=main_names)
    
#               Combine contracts and their remunaration info.
                contrem_df=pd.concat([contract_df,remuneration_df],axis=0)
    
#               Add contract info.
                contract=read_codes(contract_codes,contrem_df[contrem_df["Code"].isin(contract_codes)],contract_names)
    
#               Add remuneration info.
                remuneration_type=contrem_df[contrem_df["Code"]=="S21.G00.51.011"]["Value"].reset_index(drop=True)
                remuneration_value=contrem_df[contrem_df["Code"]=="S21.G00.51.013"]["Value"].reset_index(drop=True)
                remuneration = read_type(remuneration_type,remuneration_value,remuneration_types,remuneration_names)
                heures_sup=stats(contrem_df[contrem_df["Code"]=="S21.G00.51.012"]["Value"],["sum"],extras_names)
                
#               Add activity info.
                activity_type = contrem_df[contrem_df["Code"]=="S21.G00.53.001"]["Value"].reset_index(drop=True)
                activity_value = contrem_df[contrem_df["Code"]=="S21.G00.53.002"]["Value"].reset_index(drop=True)
                activity = read_type(activity_type,activity_value,activite_types,activite_names)
            
#               Add primes info.
                primes = stats(contrem_df[contrem_df["Code"]=="S21.G00.52.002"]["Value"],["count","sum"],primes_names)
    
#               Add autres elements du revenu brut info.
                autres_el_types=stats(contrem_df[contrem_df["Code"]=="S21.G00.54.001"]["Value"],["count","sum","min","max"],autres_el_types_names)
                autres_el_values=stats(contrem_df[contrem_df["Code"]=="S21.G00.54.002"]["Value"],["count","sum","min","max"],autres_el_values_names)
        
#               Add base assujettie info.
                base_types=stats(contrem_df[contrem_df["Code"]=="S21.G00.78.001"]["Value"],["count","sum","min","max"],base_types_names)
                base_values=stats(contrem_df[contrem_df["Code"]=="S21.G00.78.004"]["Value"],["count","sum","mean","median"],base_values_names)
        
#               Add composant base info.
                composant_types=stats(contrem_df[contrem_df["Code"]=="S21.G00.79.001"]["Value"],["count","sum","min","max"],composant_types_names)
                composant_values=stats(contrem_df[contrem_df["Code"]=="S21.G00.79.004"]["Value"],["count","sum","mean","median"],composant_values_names)
        
#               Add cotisation individuelle info.
                cotisation_types=stats(contrem_df[contrem_df["Code"]=="S21.G00.81.001"]["Value"],["count",'sum',"min","max"],cotisation_types_names)
                cotisation_assiette=stats(contrem_df[contrem_df["Code"]=="S21.G00.81.003"]["Value"],["count",'sum',"mean","median"],cotisation_assiette_names)
                cotisation_values=stats(contrem_df[contrem_df["Code"]=="S21.G00.81.004"]["Value"],["count",'sum',"mean","median"],cotisation_values_names)
            
#               Add versement individu info.
                versement = read_codes(versement_codes,contrem_df[contrem_df["Code"].isin(versement_codes)],versement_names).astype(float)

#               Add filename.
                fil = pd.DataFrame([filename],columns=["Filename"])
    
#               Build row of final data frame.
                final = pd.concat([fil,dsn,indiv,contract,remuneration,heures_sup,activity,primes,autres_el_types,autres_el_values,
                                   base_types,base_values,composant_types,composant_values,cotisation_types,
                                   cotisation_assiette,cotisation_values,versement],axis=1)
    
#               Append row to final data frame.
                final_df = pd.concat([final_df,final],axis=0)
    
# Reset index and print final data frame.
final_df=final_df.reset_index(drop=True)
final_df

Unnamed: 0,Filename,Type,Version,Month,Year,APEN,APET,Id,Children,Status,...,Cotisation Assiette Count,Sum Cotisation Assiette,Mean Cotisation Assiette,Median Cotisation Assiette,Cotisation Value Count,Sum Cotisation Value,Mean Cotisation Value,Median Cotisation Value,Remuneration Nette Fiscale,Montant Net Verse
0,MAG001_0317_000002.dsn,03,2,3,2017,4120A,4120A,1740899352909,0,07,...,3.0,3469.57,1156.523333,1642.53,4.0,242.96,60.740000,49.605,1365.11,1413.29
1,MAG001_0317_000002.dsn,03,2,3,2017,4120A,4120A,1621099121118,0,07,...,4.0,7065.85,1766.462500,2272.10,5.0,282.84,56.568000,58.850,1869.12,1956.60
2,MAG001_0317_000002.dsn,03,2,3,2017,4120A,4120A,1630899127018,0,06,...,2.0,5951.80,2975.900000,2975.90,3.0,446.39,148.796667,102.670,2418.14,2401.33
3,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1740899352909,0,07,...,4.0,8128.64,2032.160000,2612.03,8.0,472.98,59.122500,62.555,1947.47,2326.89
4,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1621099121118,0,07,...,4.0,6572.83,1643.207500,2107.76,8.0,118.40,14.800000,50.480,1636.62,2296.69
5,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1630899127018,0,06,...,1.0,2931.50,2931.500000,2931.50,5.0,598.45,119.690000,75.340,2388.76,2613.03
6,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1751299352806,0,07,...,4.0,5988.59,1497.147500,1927.21,8.0,-8.48,-1.060000,46.155,1398.56,1676.93
7,MAG001_0417_000001.dsn,01,1,4,2017,4120A,4120A,1740899352909,0,07,...,3.0,5394.75,1798.250000,2551.10,4.0,376.82,94.205000,77.045,2156.98,2192.02
8,MAG001_0417_000001.dsn,01,1,4,2017,4120A,4120A,1621099121118,0,07,...,4.0,6922.21,1730.552500,2224.22,5.0,109.50,21.900000,57.600,1859.79,1912.89
9,MAG001_0417_000001.dsn,01,1,4,2017,4120A,4120A,1630899127018,0,06,...,2.0,5925.40,2962.700000,2962.70,3.0,444.41,148.136667,102.210,2407.59,2355.16


In [13]:
# Previous month data frame

#Information that is used to identify the previous data frame.
keep= ["Month","Year","APEN","APET","Id","Status","Libelle Emploi","Nature","Numero","Unite Mesure",
        "Modalite Temps","Emplois Multiples","Employeurs Multiples"]

# Copy the current frame to previous.
previous_df = final_df.copy()

# Change the month to the previous month while taking care of the case where the month is January.
previous_df.loc[previous_df['Month'] >= 2, 'Month'] -=1
previous_df.loc[previous_df["Month"]==1,"Year"] -=1
previous_df.loc[previous_df["Month"]==1,"Month"]=12

# Set all the values, other than those kept for identification, in the previous data frame to zero. 
previous_df[previous_df.columns.difference(keep)]=0

# Filter the contracts to give the only the non-faulty version as previous data.
final_df["Version"]=final_df["Version"].astype(int)
new_df = final_df.loc[final_df.groupby(keep)['Version'].idxmax()]

# If it exists, merge the previous data to the previous data frame.
previous_df=previous_df.set_index(keep)
temp = new_df.set_index(keep)
previous_df.update(temp)
previous_df.reset_index(inplace=True)

# Add previous data frame to current data frame.
previous_df=previous_df[final_df.columns]
previous_df.columns="Previous " + previous_df.columns
dataframe= pd.concat([final_df,previous_df],axis=1)
dataframe

Unnamed: 0,Filename,Type,Version,Month,Year,APEN,APET,Id,Children,Status,...,Previous Cotisation Assiette Count,Previous Sum Cotisation Assiette,Previous Mean Cotisation Assiette,Previous Median Cotisation Assiette,Previous Cotisation Value Count,Previous Sum Cotisation Value,Previous Mean Cotisation Value,Previous Median Cotisation Value,Previous Remuneration Nette Fiscale,Previous Montant Net Verse
0,MAG001_0317_000002.dsn,03,2,3,2017,4120A,4120A,1740899352909,0,07,...,3.0,3151.42,1050.473333,1490.21,3.0,181.50,60.500000,51.410,1260.82,1280.38
1,MAG001_0317_000002.dsn,03,2,3,2017,4120A,4120A,1621099121118,0,07,...,4.0,6958.12,1739.530000,2236.19,4.0,200.38,50.095000,25.575,1862.12,1923.81
2,MAG001_0317_000002.dsn,03,2,3,2017,4120A,4120A,1630899127018,0,06,...,2.0,5932.00,2966.000000,2966.00,2.0,391.52,195.760000,195.760,2410.22,2366.70
3,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1740899352909,0,07,...,3.0,5460.09,1820.030000,2583.77,7.0,463.50,66.214286,69.500,2209.72,2266.99
4,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1621099121118,0,07,...,3.0,4729.13,1576.376667,2239.79,7.0,222.36,31.765714,60.240,1894.46,1963.11
5,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1630899127018,0,06,...,1.0,2968.00,2968.000000,2968.00,5.0,593.49,118.698000,76.280,2421.41,2473.06
6,MAG001_0119_000001.dsn,01,1,1,2019,4120A,4120A,1751299352806,0,07,...,3.0,3939.30,1313.100000,1868.10,7.0,-39.57,-5.652857,50.260,1553.80,1634.74
7,MAG001_0417_000001.dsn,01,1,4,2017,4120A,4120A,1740899352909,0,07,...,3.0,3469.57,1156.523333,1642.53,4.0,242.96,60.740000,49.605,1365.11,1413.29
8,MAG001_0417_000001.dsn,01,1,4,2017,4120A,4120A,1621099121118,0,07,...,4.0,7065.85,1766.462500,2272.10,5.0,282.84,56.568000,58.850,1869.12,1956.60
9,MAG001_0417_000001.dsn,01,1,4,2017,4120A,4120A,1630899127018,0,06,...,2.0,5951.80,2975.900000,2975.90,3.0,446.39,148.796667,102.670,2418.14,2401.33


In [14]:
# Difference data frame 

# Change the column names of the previous data frame to that of the current data frame in order to apply the subtraction.
previous_df.columns = final_df.columns

# Identify the data points which will not be part of the difference data frame.
keep=["Filename","Type","Version","Month","Year","APEN","APET","Id","Status","Libelle Emploi","Nature","Numero","Unite Mesure",
        "Modalite Temps","Emplois Multiples","Employeurs Multiples"]

# Ensure that all data points are numbers and take the difference between previous and current month for each data point.
final_df["Type"]=final_df["Type"].astype(int)
sub = final_df.drop(keep,axis=1)-previous_df.drop(keep,axis=1)

In [25]:
# Encoding the data and finalizing the data frame

# Data points which need to be encoded.
keep=["Filename","APEN","APET","Id","Status","Libelle Emploi","Nature","Numero","Unite Mesure",
        "Modalite Temps","Emplois Multiples","Employeurs Multiples"]

# Encode the data points above.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
X= final_df[keep].astype(str).apply(lambda x: x.str.lower())
enc.fit(X)
enc.categories_ 
Y=pd.DataFrame(enc.transform(X),columns=keep)

# Put together the final data frame with the non-encoded, encoded, and difference data.
Z= final_df[["Month","Year","Type","Version"]]
final=pd.concat([Z,Y,sub],axis=1)

# Drop the contracts which appear in a suspicious dsn but aren't faulty.
final = final.drop_duplicates([col for col in list(final.columns) if col not in ('Filename', "Type","Version")])
final

Unnamed: 0,Month,Year,Type,Version,Filename,APEN,APET,Id,Status,Libelle Emploi,...,Cotisation Assiette Count,Sum Cotisation Assiette,Mean Cotisation Assiette,Median Cotisation Assiette,Cotisation Value Count,Sum Cotisation Value,Mean Cotisation Value,Median Cotisation Value,Remuneration Nette Fiscale,Montant Net Verse
0,3,2017,3,2,9.0,0.0,0.0,28.0,2.0,29.0,...,0.0,318.15,106.050000,152.32,1.0,61.46,0.240000,-1.805,104.29,132.91
1,3,2017,3,2,9.0,0.0,0.0,4.0,2.0,25.0,...,0.0,107.73,26.932500,35.91,1.0,82.46,6.473000,33.275,7.00,32.79
2,3,2017,3,2,9.0,0.0,0.0,5.0,1.0,5.0,...,0.0,19.80,9.900000,9.90,1.0,54.87,-46.963333,-93.090,7.92,34.63
3,1,2019,1,1,3.0,0.0,0.0,28.0,2.0,29.0,...,1.0,2668.55,212.130000,28.26,1.0,9.48,-7.091786,-6.945,-262.25,59.90
4,1,2019,1,1,3.0,0.0,0.0,4.0,2.0,25.0,...,1.0,1843.70,66.830833,-132.03,1.0,-103.96,-16.965714,-9.760,-257.84,333.58
5,1,2019,1,1,3.0,0.0,0.0,5.0,1.0,5.0,...,0.0,-36.50,-36.500000,-36.50,0.0,4.96,0.992000,-0.940,-32.65,139.97
6,1,2019,1,1,3.0,0.0,0.0,32.0,2.0,25.0,...,1.0,2049.29,184.047500,59.11,1.0,31.09,4.592857,-4.105,-155.24,42.19
7,4,2017,1,1,12.0,0.0,0.0,28.0,2.0,29.0,...,0.0,1925.18,641.726667,908.57,0.0,133.86,33.465000,27.440,791.87,778.73
8,4,2017,1,1,12.0,0.0,0.0,4.0,2.0,25.0,...,0.0,-143.64,-35.910000,-47.88,0.0,-173.34,-34.668000,-1.250,-9.33,-43.71
9,4,2017,1,1,12.0,0.0,0.0,5.0,1.0,5.0,...,0.0,-26.40,-13.200000,-13.20,0.0,-1.98,-0.660000,-0.460,-10.55,-46.17


In [28]:
# Random forest 

# Data points needed to label the data as suspicious or unsuspicious.
keep= ["Month","Year","APEN","APET","Id","Status","Libelle Emploi","Nature","Numero","Unite Mesure",
        "Modalite Temps","Emplois Multiples","Employeurs Multiples"]

# Labelling unsuspicious data as 0 and suspicious data as 1.
final.loc[final.groupby(keep)['Version'].idxmax(), 'Type'] = 0
final.loc[final.index.drop(final.groupby(keep)['Version'].idxmax()),"Type"]=1

# Spliting the predictors and targets.
X=final.drop(["Type","Version"],axis=1)
y=final.loc[:,"Type"]

# Split to training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

# Random forest classifier training  
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_classifier = RandomForestClassifier(n_estimators=50, random_state=0, class_weight="balanced")
rf_classifier.fit(X_train, y_train)

# Random forest classifier prediction & results

# Predictions (1 & 0)
y_predicted = rf_classifier.predict(X_test)

# Predictions (probabilities per class)
y_predict_prob = rf_classifier.predict_proba(X_test)

# Accuracy score
accuracy = rf_classifier.score(X_test, y_test)
print('accuracy: {}%'.format(round(accuracy*100, 1)))

# Various scores report
cf_report = classification_report(y_test, y_predicted)
print(cf_report)

accuracy: 97.0%
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       162
           1       1.00      0.17      0.29         6

   micro avg       0.97      0.97      0.97       168
   macro avg       0.99      0.58      0.64       168
weighted avg       0.97      0.97      0.96       168



In [29]:
# Threshold changes

for threshold in np.linspace(0.1, 0.5, 5):
    
    # Classication probabilities for each class
    y_predict = [1 if class_prob[1] >= threshold else 0 for class_prob in y_predict_prob]
    
    # Calculate scores of metrics (precision, recall etc)
    print(threshold, classification_report(y_test, y_predict))

0.1               precision    recall  f1-score   support

           0       0.99      0.85      0.91       162
           1       0.14      0.67      0.23         6

   micro avg       0.84      0.84      0.84       168
   macro avg       0.56      0.76      0.57       168
weighted avg       0.96      0.84      0.89       168

0.2               precision    recall  f1-score   support

           0       0.97      0.96      0.97       162
           1       0.22      0.33      0.27         6

   micro avg       0.93      0.93      0.93       168
   macro avg       0.60      0.65      0.62       168
weighted avg       0.95      0.93      0.94       168

0.30000000000000004               precision    recall  f1-score   support

           0       0.97      0.97      0.97       162
           1       0.17      0.17      0.17         6

   micro avg       0.94      0.94      0.94       168
   macro avg       0.57      0.57      0.57       168
weighted avg       0.94      0.94      0.94   

In [30]:
# Naive Bayes

# Split to training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

# Naive Bayes classifier training  
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

# Predictions (1 & 0)
y_predicted = clf.predict(X_test)

# Various scores report
cf_report = classification_report(y_test, y_predicted)
print(cf_report)

              precision    recall  f1-score   support

           0       0.97      0.88      0.93       162
           1       0.10      0.33      0.15         6

   micro avg       0.86      0.86      0.86       168
   macro avg       0.53      0.61      0.54       168
weighted avg       0.94      0.86      0.90       168



In [31]:
# Support Vector

# Split to training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

# Support vector classifier training  
from sklearn.svm import SVC
clf= SVC(gamma="auto")
clf.fit(X_train, y_train)

# Predictions (1 & 0)
y_predicted = clf.predict(X_test)

# Various scores report
cf_report = classification_report(y_test, y_predicted)
print(cf_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       162
           1       0.00      0.00      0.00         6

   micro avg       0.96      0.96      0.96       168
   macro avg       0.48      0.50      0.49       168
weighted avg       0.93      0.96      0.95       168



  'precision', 'predicted', average, warn_for)


In [153]:
# List of data points used to decode the data frame
keep=["Filename","APEN","APET","Id","Status","Libelle Emploi","Nature","Numero","Unite Mesure",
        "Modalite Temps","Emplois Multiples","Employeurs Multiples"]

# Decode 
final[keep]=enc.inverse_transform(final[keep])

# List the suspicious file names
final.loc[final["Type"]==1,"Filename"]

# Find the suspicious file and its corresponding unsuspicious file within the difference data frame. 
# The filenames are plugged in manually based on the answers given by the previous line.
doc1 = final[final["Filename"]=="talta3_0318_000001.dsn"]
doc2 = final[final["Filename"]=="talta3_0318_000002.dsn"]

# Compare the suspicious and unsuspicious file if they differ.
for column in doc1.columns:
    if (doc1[column].values!=doc2[column].values).any():
        print(doc1[column])
        print(doc2[column])

10     mag001_0317_000001.dsn
11     mag001_0317_000001.dsn
12     mag001_0317_000001.dsn
71     mag001_0917_000001.dsn
84     mag001_0718_000001.dsn
85     mag001_0718_000001.dsn
86     mag001_0718_000001.dsn
87     mag001_0718_000001.dsn
131    mag001_0419_000001.dsn
133    mag001_0117_000001.dsn
134    mag001_0117_000001.dsn
135    mag001_0117_000001.dsn
136    mag001_0117_000001.dsn
190    talta1_0119_000001.dsn
331    talta2_0519_000001.dsn
423    talta2_0418_000003.dsn
549    talta3_0219_000001.dsn
559    talta3_0219_000001.dsn
843    talta3_1118_000001.dsn
875    talta3_0318_000001.dsn
Name: Filename, dtype: object

In [150]:
# List of data points used to un-encode the data frame
keep=["Filename","APEN","APET","Id","Status","Libelle Emploi","Nature","Numero","Unite Mesure",
        "Modalite Temps","Emplois Multiples","Employeurs Multiples"]

# Un-encoding 
final[keep]=enc.inverse_transform(final[keep])

# List the suspicious file names
final.loc[final["Type"]==1,"Filename"]

# Find the suspicious file and its corresponding unsuspicious file within the current data frame.
# The filenames are plugged in manually based on the answers given by the previous line.
doc1 = final_df[final_df["Filename"]=="TALTA3_0318_000001.dsn"]
doc2 = final_df[final_df["Filename"]=="TALTA3_0318_000002.dsn"]

# Compare the suspicious and unsuspicious file if they differ.
for column in doc1.columns:
    if (doc1[column].values!=doc2[column].values).any():
        print(doc1[column])
        print(doc2[column])


872    TALTA3_0318_000001.dsn
873    TALTA3_0318_000001.dsn
874    TALTA3_0318_000001.dsn
875    TALTA3_0318_000001.dsn
876    TALTA3_0318_000001.dsn
877    TALTA3_0318_000001.dsn
878    TALTA3_0318_000001.dsn
Name: Filename, dtype: object
892    TALTA3_0318_000002.dsn
893    TALTA3_0318_000002.dsn
894    TALTA3_0318_000002.dsn
895    TALTA3_0318_000002.dsn
896    TALTA3_0318_000002.dsn
897    TALTA3_0318_000002.dsn
898    TALTA3_0318_000002.dsn
Name: Filename, dtype: object
872    1
873    1
874    1
875    1
876    1
877    1
878    1
Name: Type, dtype: int64
892    3
893    3
894    3
895    3
896    3
897    3
898    3
Name: Type, dtype: int64
872    1
873    1
874    1
875    1
876    1
877    1
878    1
Name: Version, dtype: int64
892    2
893    2
894    2
895    2
896    2
897    2
898    2
Name: Version, dtype: int64
872    2762.84
873    2689.15
874    2698.31
875    2713.17
876       0.00
877    2668.86
878       0.00
Name: Gross Salary after Extras, dtype: float64
892    27