### Setting up the environment:

In [72]:
#1# Importing libraries: 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

RSEED=42

### base model
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#2# Visualizing the dataset:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Exploring the data:

In [73]:
df_fraud = pd.read_csv('./data/Merge_Frauddet_new.csv')
df_fraud.head(10)

Unnamed: 0,ID,invoice_date,tarif_type,counter_number,counter_statue,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,months_number,counter_type,client_catg,region,creation_date,target
0,0,24/03/2014,11,1335667.0,0,8,1,82,0,0,0,4,ELEC,11,101,31/12/1994,0
1,0,29/03/2013,11,1335667.0,0,6,1,1200,184,0,0,4,ELEC,11,101,31/12/1994,0
2,0,23/03/2015,11,1335667.0,0,8,1,123,0,0,0,4,ELEC,11,101,31/12/1994,0
3,0,13/07/2015,11,1335667.0,0,8,1,102,0,0,0,4,ELEC,11,101,31/12/1994,0
4,0,17/11/2016,11,1335667.0,0,9,1,572,0,0,0,12,ELEC,11,101,31/12/1994,0
5,0,17/07/2017,11,1335667.0,0,9,1,314,0,0,0,8,ELEC,11,101,31/12/1994,0
6,0,07/12/2018,11,1335667.0,0,9,1,541,0,0,0,12,ELEC,11,101,31/12/1994,0
7,0,19/03/2019,11,1335667.0,0,9,1,585,0,0,0,8,ELEC,11,101,31/12/1994,0
8,0,22/07/2011,11,1335667.0,0,9,1,1200,186,0,0,4,ELEC,11,101,31/12/1994,0
9,0,22/11/2011,11,1335667.0,0,6,1,1082,0,0,0,4,ELEC,11,101,31/12/1994,0


In [74]:
df_fraud.describe()

Unnamed: 0,ID,tarif_type,counter_number,counter_statue,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,months_number,client_catg,region,target
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,103836.467,20.136,120919976164.004,0.042,7.315,1.0,409.745,107.568,20.413,54.06,46.322,11.987,209.995,0.767
std,32065.556,13.482,1646702365322.078,0.375,1.36,0.083,588.275,1169.328,163.644,979.218,2594.754,6.159,103.552,2.661
min,0.0,8.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,101.0,0.0
25%,104718.0,11.0,121473.0,0.0,6.0,1.0,79.0,0.0,0.0,0.0,4.0,11.0,103.0,0.0
50%,112518.0,11.0,490475.0,0.0,6.0,1.0,272.0,0.0,0.0,0.0,4.0,11.0,301.0,0.0
75%,120486.0,40.0,1112725.0,0.0,9.0,1.0,601.0,0.0,0.0,0.0,4.0,11.0,308.0,0.0
max,128439.0,45.0,27381100000000.0,5.0,9.0,50.0,98889.0,819886.0,45360.0,343568.0,294406.0,51.0,399.0,10.0


In [75]:
#add a new colum with total consumption level
df_fraud.eval('sum_consu = consommation_level_1 + consommation_level_2 + consommation_level_3 + consommation_level_4',inplace=True)

### Adjusting the data: 

In [76]:
#1# Checking for missing values: According with the documentation, the data has no NaN values:

df_fraud.isnull().sum()
# The Data is complete. 

ID                      0
invoice_date            0
tarif_type              0
counter_number          0
counter_statue          0
reading_remarque        0
counter_coefficient     0
consommation_level_1    0
consommation_level_2    0
consommation_level_3    0
consommation_level_4    0
months_number           0
counter_type            0
client_catg             0
region                  0
creation_date           0
target                  0
sum_consu               0
dtype: int64

In [77]:
df_fraud

column_to_check = 'target'
df_org_filtered = df_fraud[df_fraud[column_to_check] != 0]

df_org_filtered.head()

Unnamed: 0,ID,invoice_date,tarif_type,counter_number,counter_statue,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,months_number,counter_type,client_catg,region,creation_date,target,sum_consu
1384,100032,24/02/2017,11,393848.0,0,9,1,781,0,0,0,8,ELEC,11,308,07/11/2012,10,781
1385,100032,21/06/2016,11,393848.0,0,9,1,382,0,0,0,4,ELEC,11,308,07/11/2012,10,382
1386,100032,26/10/2018,11,393848.0,0,9,1,373,0,0,0,4,ELEC,11,308,07/11/2012,10,373
1387,100032,21/06/2018,11,393848.0,0,9,1,328,0,0,0,4,ELEC,11,308,07/11/2012,10,328
1388,100032,23/02/2018,11,393848.0,0,9,1,703,0,0,0,4,ELEC,11,308,07/11/2012,10,703


In [78]:
#2# Getting the Dummies for X_train: 

#X_train = pd.get_dummies(data=X_train, columns=["counter_statue", "client_catg", "reading_remarque"], dtype=float)
#X_train.info()

categorical_columns= ['counter_statue', 'reading_remarque','client_catg']
df_fraud_dummy = pd.get_dummies(df_fraud, columns=categorical_columns, drop_first=True, dtype=int)
df_fraud_dummy

Unnamed: 0,ID,invoice_date,tarif_type,counter_number,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,months_number,...,counter_statue_1,counter_statue_2,counter_statue_3,counter_statue_4,counter_statue_5,reading_remarque_7,reading_remarque_8,reading_remarque_9,client_catg_12,client_catg_51
0,0,24/03/2014,11,1335667.000,1,82,0,0,0,4,...,0,0,0,0,0,0,1,0,0,0
1,0,29/03/2013,11,1335667.000,1,1200,184,0,0,4,...,0,0,0,0,0,0,0,0,0,0
2,0,23/03/2015,11,1335667.000,1,123,0,0,0,4,...,0,0,0,0,0,0,1,0,0,0
3,0,13/07/2015,11,1335667.000,1,102,0,0,0,4,...,0,0,0,0,0,0,1,0,0,0
4,0,17/11/2016,11,1335667.000,1,572,0,0,0,12,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,128439,26/06/2007,40,42475.000,1,106,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1048571,128439,24/10/2007,11,464740.000,1,476,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1048572,128439,06/05/2015,40,42475.000,1,63,0,0,0,4,...,0,0,0,0,0,0,1,0,0,0
1048573,128439,06/05/2015,11,464740.000,1,459,0,0,0,4,...,0,0,0,0,0,0,0,1,0,0


In [79]:
# possible baseline models
# coefficient vs which consumption level the client is 
##

In [80]:
#2# Adjusting the data type:

#A# Changed the type of the categorical data from int64 to categorical:
df_fraud_dummy= df_fraud_dummy.astype({"counter_statue_1": "category","counter_statue_2": "category","counter_statue_3": "category","counter_statue_4": "category","counter_statue_5": "category", "counter_type": 'category', 'reading_remarque_7': 'category', "reading_remarque_8" : "category", "reading_remarque_9" : "category", "client_catg_12" : "category", "client_catg_51" : "category",'target': 'category'})

#B# # Changed invoice_date and creation_date to date formate:
df_fraud_dummy['invoice_date'] = pd.to_datetime(df_fraud_dummy['invoice_date'])
df_fraud_dummy['creation_date'] = pd.to_datetime(df_fraud_dummy['creation_date'])
df_fraud_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 25 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   ID                    1048575 non-null  int64         
 1   invoice_date          1048575 non-null  datetime64[ns]
 2   tarif_type            1048575 non-null  int64         
 3   counter_number        1048575 non-null  float64       
 4   counter_coefficient   1048575 non-null  int64         
 5   consommation_level_1  1048575 non-null  int64         
 6   consommation_level_2  1048575 non-null  int64         
 7   consommation_level_3  1048575 non-null  int64         
 8   consommation_level_4  1048575 non-null  int64         
 9   months_number         1048575 non-null  int64         
 10  counter_type          1048575 non-null  category      
 11  region                1048575 non-null  int64         
 12  creation_date         1048575 non-null  da

In [81]:
# changed value target from 10 to 1 
# changed strings to cate (ELEC= 0 and Gas =1)

df_fraud_dummy["target"].replace(10, 1, inplace=True)
df_fraud_dummy["counter_type"].replace("ELEC", 0 ,inplace=True)
df_fraud_dummy["counter_type"].replace("GAZ", 1 ,inplace=True)


In [82]:
df_cons = df_fraud_dummy.drop(['ID', 'invoice_date', 'counter_number', 'region','creation_date', 'counter_statue_1',
       'counter_statue_2', 'counter_statue_3', 'counter_statue_4',
       'counter_statue_5', 'reading_remarque_7', 'reading_remarque_8',
       'reading_remarque_9', 'client_catg_12' ,"client_catg_51",],axis=1)

In [83]:
df_fraud_dummy.columns

Index(['ID', 'invoice_date', 'tarif_type', 'counter_number',
       'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
       'consommation_level_3', 'consommation_level_4', 'months_number',
       'counter_type', 'region', 'creation_date', 'target', 'sum_consu',
       'counter_statue_1', 'counter_statue_2', 'counter_statue_3',
       'counter_statue_4', 'counter_statue_5', 'reading_remarque_7',
       'reading_remarque_8', 'reading_remarque_9', 'client_catg_12',
       'client_catg_51'],
      dtype='object')

In [84]:
column_to_check = 'target'
df_con_filtered = df_cons[df_cons[column_to_check] != 0]


In [85]:
df_fraud_dummy.drop(['ID', 'invoice_date', 'counter_number', 'creation_date'],axis=1 ,inplace=True)

In [86]:
df_fraud_dummy.head()

Unnamed: 0,tarif_type,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,months_number,counter_type,region,target,...,counter_statue_1,counter_statue_2,counter_statue_3,counter_statue_4,counter_statue_5,reading_remarque_7,reading_remarque_8,reading_remarque_9,client_catg_12,client_catg_51
0,11,1,82,0,0,0,4,0,101,0,...,0,0,0,0,0,0,1,0,0,0
1,11,1,1200,184,0,0,4,0,101,0,...,0,0,0,0,0,0,0,0,0,0
2,11,1,123,0,0,0,4,0,101,0,...,0,0,0,0,0,0,1,0,0,0
3,11,1,102,0,0,0,4,0,101,0,...,0,0,0,0,0,0,1,0,0,0
4,11,1,572,0,0,0,12,0,101,0,...,0,0,0,0,0,0,0,1,0,0


In [99]:
column_to_check = 'target'
df_filtered = df_fraud_dummy[df_fraud_dummy[column_to_check] != 0]

df_filtered.head()

Unnamed: 0,tarif_type,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,months_number,counter_type,region,target,...,counter_statue_1,counter_statue_2,counter_statue_3,counter_statue_4,counter_statue_5,reading_remarque_7,reading_remarque_8,reading_remarque_9,client_catg_12,client_catg_51
1384,11,1,781,0,0,0,8,0,308,1,...,0,0,0,0,0,0,0,1,0,0
1385,11,1,382,0,0,0,4,0,308,1,...,0,0,0,0,0,0,0,1,0,0
1386,11,1,373,0,0,0,4,0,308,1,...,0,0,0,0,0,0,0,1,0,0
1387,11,1,328,0,0,0,4,0,308,1,...,0,0,0,0,0,0,0,1,0,0
1388,11,1,703,0,0,0,4,0,308,1,...,0,0,0,0,0,0,0,1,0,0


In [88]:
#sns.pairplot(df_con_filtered)

In [89]:

df_org_filtered.nunique()

ID                      1757
invoice_date            3263
tarif_type                10
counter_number          3482
counter_statue             6
reading_remarque           3
counter_coefficient        2
consommation_level_1    2569
consommation_level_2    2733
consommation_level_3     793
consommation_level_4    2262
months_number             33
counter_type               2
client_catg                3
region                    24
creation_date           1343
target                     1
sum_consu               5123
dtype: int64

In [90]:
unique_counts = df_org_filtered['reading_remarque'].value_counts()
unique_counts

reading_remarque
6    38962
9    26089
8    15342
Name: count, dtype: int64

In [91]:
df_org_filtered['reading_remarque'].unique()

array([9, 6, 8], dtype=int64)

In [92]:
# **1. FP-Growth (Frequent Pattern Growth):

# Data Structure: FP-Growth uses a data structure called the FP-Tree (Frequent Pattern Tree) to efficiently mine frequent itemsets.
# Algorithm: It is based on a divide-and-conquer strategy. It recursively builds an FP-Tree and mines frequent itemsets from it.
# Pruning: FP-Growth is known for its efficiency due to its ability to prune the search space effectively. It avoids generating candidate itemsets explicitly.
# Complexity: It is generally more efficient than Apriori for large datasets and high support thresholds because it reduces the need to scan the database multiple times.
# Memory Usage: FP-Growth can be more memory-efficient than Apriori, especially for sparse datasets.
# Advantage: FP-Growth is often preferred when dealing with large datasets or high-dimensional data.

In [104]:
df_filtered.columns


Index(['tarif_type', 'counter_coefficient', 'consommation_level_1',
       'consommation_level_2', 'consommation_level_3', 'consommation_level_4',
       'months_number', 'counter_type', 'region', 'target', 'sum_consu',
       'counter_statue_1', 'counter_statue_2', 'counter_statue_3',
       'counter_statue_4', 'counter_statue_5', 'reading_remarque_7',
       'reading_remarque_8', 'reading_remarque_9', 'client_catg_12',
       'client_catg_51'],
      dtype='object')

In [None]:
df_filtered.drop(['target',]axis=1) 

In [None]:
'consommation_level_1',
       'consommation_level_2', 'consommation_level_3', 'consommation_level_4'

In [None]:
categorical_columns= ['tarif_type', 'counter_coefficient','months_number', 'region']
df_filtered_dummy = pd.get_dummies(df_filtered, columns=categorical_columns, drop_first=True, dtype=int)
df_filtered_dummy

In [None]:
df_filtered.drop(['target',]axis=1) 

In [None]:
column_to_check_eletric = 'counter_type'
df_filtered_eletric = df_filtered[df_filtered[column_to_check_eletric] == 0]
df_filtered_eletric.nunique()

In [None]:
column_to_check_gas = 'counter_type'
df_filtered_gas = df_filtered[df_filtered[column_to_check_gas] == 1]
df_filtered_gas.nunique()

In [None]:
# change numer into cate
data = {'Age': [22, 35, 47, 65, 29, 21, 39, 54, 52, 27]}
df = pd.DataFrame(data)

# Define bin edges and labels
bin_edges = [0, 18, 30, 45, 60, float('inf')]  # Custom bin edges
bin_labels = ['Child', 'Young Adult', 'Middle-Aged', 'Senior', 'Elderly']

# Use pd.cut to create categorical bins
df['Age Category'] = pd.cut(df['Age'], bins=bin_edges, labels=bin_labels)

# Display the DataFrame with categorical values
print(df)

In [None]:
df_fraud_dummy.columns

Index(['ID', 'invoice_date', 'counter_number', 'counter_statue',
       'reading_remarque', 'consommation_level_1', 'consommation_level_2',
       'consommation_level_3', 'consommation_level_4', 'counter_type',
       ...
       'region_308', 'region_309', 'region_310', 'region_311', 'region_312',
       'region_313', 'region_371', 'region_372', 'region_379', 'region_399'],
      dtype='object', length=544)

In [None]:
#pip install mlxtend
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

In [None]:
df_fraud_dummy.columns

Index(['ID', 'invoice_date', 'counter_number', 'counter_statue',
       'reading_remarque', 'consommation_level_1', 'consommation_level_2',
       'consommation_level_3', 'consommation_level_4', 'counter_type',
       ...
       'region_308', 'region_309', 'region_310', 'region_311', 'region_312',
       'region_313', 'region_371', 'region_372', 'region_379', 'region_399'],
      dtype='object', length=544)

In [None]:
tarif_type   (11    44996) / (40    25930)
counter_coefficient of (1 or 10)
avg consumption of 688 ()
number of months before dedection 
months_number
region
counter_statue (how well the counter is working from fine 0 till 5) (0    78513)
reading_remarque 

SyntaxError: invalid syntax. Perhaps you forgot a comma? (930754051.py, line 1)

In [None]:
dataset = [
    ['milk', 'bread', 'nuts'],
    ['milk', 'bread', 'diapers', 'beer'],
    ['milk', 'bread', 'diapers'],
    ['milk', 'bread', 'nuts'],
    ['diapers', 'beer'],
]

# Convert the dataset to a one-hot encoded DataFrame
df = pd.DataFrame(dataset, columns=['item1', 'item2', 'item3', 'item4', 'item5'])
encoded_df = pd.get_dummies(df)





# Use FP-Growth to find frequent itemsets
frequent_itemsets = fpgrowth(encoded_df, min_support=0.4, use_colnames=True)

# Print frequent itemsets
print("Frequent Itemsets:")
print(frequent_itemsets)

# Use Association Rules to generate rules from frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Print association rules
print("\nAssociation Rules:")
print(rules)


ValueError: 5 columns passed, passed data had 4 columns

In [None]:
In logistic regression, if you want to emphasize or give more importance to a specific feature or feature value, you can achieve this by adjusting the feature itself or its representation in the dataset. There are several strategies to do this:

Feature Engineering:

Create new features or modify existing ones to highlight the importance of certain values or ranges. For example, you can create binary indicator variables that represent whether a specific feature value meets a certain condition. This can help the logistic regression model focus on that condition.
Feature Selection:

If you want to focus on a specific feature, you can perform feature selection techniques to choose only that feature or a subset of features. This essentially gives more importance to the selected feature(s) in the modeling process.
Feature Transformation:

Transform the feature in a way that amplifies its impact. For instance, you can apply a non-linear transformation like taking the square or logarithm of the feature to emphasize certain values.
Feature Scaling:

Adjust the scaling of a feature to give more importance to specific value ranges. For example, if values in a feature are between 0 and 1, you can scale them to be between 0 and 10 to magnify their impact.
Sample Weighting:

Assign different weights to individual samples based on the values of a specific feature. Samples with certain feature values can be given higher or lower weights to influence the model's focus.
Interaction Terms:

Create interaction terms between the feature of interest and other features. This can help capture the combined effects of multiple features and their interactions.
Regularization:

Use regularization techniques like L1 (Lasso) or L2 (Ridge) regularization to penalize or encourage specific feature coefficients. Regularization can help control the focus of the model by shrinking or zeroing out less important features.
Domain Knowledge:

Incorporate domain knowledge into your modeling process. If you have expert knowledge that suggests a specific feature or value is critical, ensure that this knowledge is reflected in your feature engineering or preprocessing steps.
The choice of which strategy to use depends on the nature of your data, the specific problem you're trying to solve, and your domain expertise. Be cautious when adjusting feature values or emphasizing specific features, as this can introduce bias into your model if not done carefully. It's essential to evaluate the impact of such adjustments through thorough testing and validation.