In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

## VERİ SETİ YÜKLEME

In [30]:
train = pd.read_csv("train_dataset.csv")
test = pd.read_csv("submission_data_x.csv")

train.head()

Unnamed: 0,ID,item1,item2,item3,item4,item5,item6,item7,item8,item9,...,Nbr_of_prod_purchas17,Nbr_of_prod_purchas18,Nbr_of_prod_purchas19,Nbr_of_prod_purchas20,Nbr_of_prod_purchas21,Nbr_of_prod_purchas22,Nbr_of_prod_purchas23,Nbr_of_prod_purchas24,Nb_of_items,fraud_flag
0,79815,COMPUTER PERIPHERALS ACCESSORIES,,,,,,,,,...,,,,,,,,,1,0
1,22598,BEDROOM FURNITURE,BEDROOM FURNITURE,SERVICE,SERVICE,,,,,,...,,,,,,,,,4,0
2,63665,LIVING DINING FURNITURE,,,,,,,,,...,,,,,,,,,1,0
3,31312,COMPUTERS,,,,,,,,,...,,,,,,,,,1,0
4,30742,COMPUTERS,,,,,,,,,...,,,,,,,,,1,0


In [31]:
test.head()

Unnamed: 0,ID,item1,item2,item3,item4,item5,item6,item7,item8,item9,...,Nbr_of_prod_purchas16,Nbr_of_prod_purchas17,Nbr_of_prod_purchas18,Nbr_of_prod_purchas19,Nbr_of_prod_purchas20,Nbr_of_prod_purchas21,Nbr_of_prod_purchas22,Nbr_of_prod_purchas23,Nbr_of_prod_purchas24,Nb_of_items
0,38100,TELEVISIONS HOME CINEMA,TELEVISIONS HOME CINEMA,AUDIO ACCESSORIES,,,,,,,...,,,,,,,,,,3
1,13409,BEDROOM FURNITURE,BEDROOM FURNITURE,,,,,,,,...,,,,,,,,,,2
2,56447,TELEVISIONS HOME CINEMA,FULFILMENT CHARGE,,,,,,,,...,,,,,,,,,,2
3,70271,COMPUTERS,,,,,,,,,...,,,,,,,,,,1
4,11531,COMPUTERS,COMPUTER SOFTWARE,FULFILMENT CHARGE,,,,,,,...,,,,,,,,,,3


## VERİ İNCELEME:

#### Veri

In [32]:
print("info:",train.info(),"\n") #dtypes: float64(47), int64(5), object(95)
print("shape:",train.shape,"\n") #shape: (74230, 147)
print("NaN values:",train.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74230 entries, 0 to 74229
Columns: 147 entries, ID to fraud_flag
dtypes: float64(47), int64(5), object(95)
memory usage: 83.3+ MB
info: None 

shape: (74230, 147) 

NaN values: ID                           0
item1                        0
item2                    38451
item3                    63919
item4                    70554
                         ...  
Nbr_of_prod_purchas22    74184
Nbr_of_prod_purchas23    74194
Nbr_of_prod_purchas24    74200
Nb_of_items                  0
fraud_flag                   0
Length: 147, dtype: int64


-For each line of the dataset there are 147 columns, in which 144 are grouped in 6 categories:
• item,
• cash_price,
• make,
• model,
• goods_code,
• Nbr_of_prod_purchas

-For each of these categories there are 24 instances that will either be filled with relevant information when an item exists in the basket, or null when it does not.

For example, if an application has 3 items in the basket there will be information in the columns item1 to item3, cash_price1 to cash_price3, make1 to make3, model1 to model3, goods_code1 to goods_code3 and Nbr_of_prod_purchas1 to Nbr_of_pro_purchas3; but the remaining columns of these categories will be null.

Çok fazla feature olduğu için şu an feature selection vs yapmayacağım. Önceliğim doldularabildiğim boşlukları doldurmak eleyebildiğim satırları elemek ve train etmeye hazır bir veri oluşturmak.

#### Boşluk Doldurma:

In [33]:
train.select_dtypes(include = ['float']).head() 

Unnamed: 0,cash_price2,cash_price3,cash_price4,cash_price5,cash_price6,cash_price7,cash_price8,cash_price9,cash_price10,cash_price11,...,Nbr_of_prod_purchas15,Nbr_of_prod_purchas16,Nbr_of_prod_purchas17,Nbr_of_prod_purchas18,Nbr_of_prod_purchas19,Nbr_of_prod_purchas20,Nbr_of_prod_purchas21,Nbr_of_prod_purchas22,Nbr_of_prod_purchas23,Nbr_of_prod_purchas24
0,,,,,,,,,,,...,,,,,,,,,,
1,550.0,30.0,0.0,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [34]:
train.select_dtypes(include = ['int']).head()

Unnamed: 0,ID,cash_price1,Nbr_of_prod_purchas1,Nb_of_items,fraud_flag
0,79815,369,1,1,0
1,22598,839,1,4,0
2,63665,4099,1,1,0
3,31312,1149,1,1,0
4,30742,1187,1,1,0


In [35]:
train.select_dtypes(include = ['object']).head()

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,item8,item9,item10,...,goods_code14,goods_code15,goods_code16,goods_code17,goods_code18,goods_code19,goods_code20,goods_code21,goods_code22,goods_code23
0,COMPUTER PERIPHERALS ACCESSORIES,,,,,,,,,,...,,,,,,,,,,
1,BEDROOM FURNITURE,BEDROOM FURNITURE,SERVICE,SERVICE,,,,,,,...,,,,,,,,,,
2,LIVING DINING FURNITURE,,,,,,,,,,...,,,,,,,,,,
3,COMPUTERS,,,,,,,,,,...,,,,,,,,,,
4,COMPUTERS,,,,,,,,,,...,,,,,,,,,,


#### Dimension Azaltma ve Boşluk Doldurma

In [36]:
train["item1"].value_counts()

COMPUTERS                                    37942
TELEVISIONS HOME CINEMA                      10734
COMPUTER PERIPHERALS ACCESSORIES              9459
LIVING DINING FURNITURE                       3313
TELEPHONES, FAX MACHINES & TWO-WAY RADIOS     2318
                                             ...  
MEN S ACCESSORIES                                1
TOSHIBA PORTABLE HARD DRIVE                      1
JEWELLERY & WATCHES                              1
MENS UNDERWEAR & SOCKS                           1
BLANK MEDIA & MEDIA STORAGE                      1
Name: item1, Length: 128, dtype: int64

In [37]:
# train-test setinde aynı değerler farklı int değer almasın diye train test birleştirip laber encoder yapacağım.
birlesik_veri = pd.concat([train, test], axis=0)
#birleşecek kolonlar
birlesik_item = birlesik_veri.iloc[:,1:25]
birlesik_cash = birlesik_veri.iloc[:,25:49]
birlesik_make = birlesik_veri.iloc[:,49:73]
birlesik_model = birlesik_veri.iloc[:,73:97]
birlesik_gc = birlesik_veri.iloc[:,97:121]
birlesik_nbr = birlesik_veri.iloc[:,121:145]

#kolonları azaltmak için toplamlarını alacağım ve standard scaler uygulayacağım

label_encoder = LabelEncoder()
def go(data,var):
    data = data.astype(str)
    data.fillna("0",inplace=True)
    data = data.apply(label_encoder.fit_transform)
    data = data.astype(int)
    data[var] = data.sum(axis=1)
    data = data.drop(data.iloc[:,0:24],axis=1)
    return data

#kolonlar:
item = go(birlesik_item,"item")
cash = go(birlesik_cash,"cash")
make = go(birlesik_make,"make")
model = go(birlesik_model,"model")
gc = go(birlesik_gc,"gc")
nbr = go(birlesik_gc,"nbr")

In [38]:
#Birleştirelim
id = birlesik_veri.drop(birlesik_veri.iloc[:,1:],axis=1)
fraud_flag = birlesik_veri.drop(birlesik_veri.iloc[:,0:146],axis=1)
Nb_of_items = birlesik_veri.iloc[:,145]
yeni_veri = pd.concat([item, cash, make, model, gc, nbr,Nb_of_items],axis=1, ignore_index=True)
yeni_veri.columns = ["item","cash","make","model","gc","nbr","Nb_of_items"]
#Standard-Scaler
scaler = StandardScaler()
yeni_veri = scaler.fit_transform(yeni_veri)
yeni_veri = pd.DataFrame(yeni_veri)
yeni_veri.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.31,0.65,0.2,0.5,0.92,0.92,-0.52
1,-1.5,-0.35,-0.53,-0.82,-2.41,-2.41,1.53
2,0.79,0.72,1.34,1.18,-0.92,-0.92,-0.52
3,0.33,-0.6,0.2,0.25,0.37,0.37,-0.52
4,0.33,-0.56,0.2,0.25,0.37,0.37,-0.52


In [39]:
#id ve fraud_flag ekleme:
yeni_veri.reset_index(inplace=True, drop=True)
id.reset_index(inplace=True, drop=True)
fraud_flag.reset_index(inplace=True, drop=True)
son_veri = pd.concat([id,yeni_veri,fraud_flag],axis=1,ignore_index=True)
son_veri.columns = ["ID","item","cash","make","model","gc","nbr","Nb_of_items","fraud_flag"]
son_veri.head()

Unnamed: 0,ID,item,cash,make,model,gc,nbr,Nb_of_items,fraud_flag
0,79815,0.31,0.65,0.2,0.5,0.92,0.92,-0.52,0.0
1,22598,-1.5,-0.35,-0.53,-0.82,-2.41,-2.41,1.53,0.0
2,63665,0.79,0.72,1.34,1.18,-0.92,-0.92,-0.52,0.0
3,31312,0.33,-0.6,0.2,0.25,0.37,0.37,-0.52,0.0
4,30742,0.33,-0.56,0.2,0.25,0.37,0.37,-0.52,0.0


In [40]:
#train-test setlerini tekrardan ayıralım
train_new = son_veri.iloc[:len(train), :]
test_new = son_veri.iloc[len(train):, :]
#test den fraud kolonunu çıkaralım
test_new = test_new.drop(columns=["fraud_flag"],axis=1)

#### MODEL

In [41]:
train_new.head()

Unnamed: 0,ID,item,cash,make,model,gc,nbr,Nb_of_items,fraud_flag
0,79815,0.31,0.65,0.2,0.5,0.92,0.92,-0.52,0.0
1,22598,-1.5,-0.35,-0.53,-0.82,-2.41,-2.41,1.53,0.0
2,63665,0.79,0.72,1.34,1.18,-0.92,-0.92,-0.52,0.0
3,31312,0.33,-0.6,0.2,0.25,0.37,0.37,-0.52,0.0
4,30742,0.33,-0.56,0.2,0.25,0.37,0.37,-0.52,0.0


In [42]:
test_new.head()

Unnamed: 0,ID,item,cash,make,model,gc,nbr,Nb_of_items
74230,38100,-0.21,0.92,-0.65,-0.91,-0.33,-0.33,0.85
74231,13409,-0.98,-0.29,0.27,0.17,-1.84,-1.84,0.16
74232,56447,0.31,-1.36,0.47,-0.01,0.48,0.48,0.16
74233,70271,0.33,-0.66,0.2,0.26,0.66,0.66,-0.52
74234,11531,-1.48,-1.73,-1.0,-1.36,-0.44,-0.44,0.85


In [43]:
train_new = train_new.drop(columns=["ID"],axis=1)
y = train_new["fraud_flag"]
X = train_new.drop(columns=["fraud_flag"],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=12)

cls = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None )
models,predictions = cls.fit(X_train, X_test, y_train, y_test)
models

  0%|          | 0/29 [00:00<?, ?it/s]

 97%|█████████▋| 28/29 [01:56<00:03,  3.82s/it]

[LightGBM] [Info] Number of positive: 849, number of negative: 58535
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1444
[LightGBM] [Info] Number of data points in the train set: 59384, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014297 -> initscore=-4.233321
[LightGBM] [Info] Start training from score -4.233321


100%|██████████| 29/29 [01:57<00:00,  4.05s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.61,0.65,0.65,0.74,0.12
RandomForestClassifier,0.99,0.56,0.56,0.98,7.39
ExtraTreeClassifier,0.98,0.55,0.55,0.98,0.14
ExtraTreesClassifier,0.99,0.55,0.55,0.98,4.83
KNeighborsClassifier,0.99,0.55,0.55,0.98,1.47
BaggingClassifier,0.99,0.55,0.55,0.98,2.53
DecisionTreeClassifier,0.98,0.54,0.54,0.98,0.38
XGBClassifier,0.99,0.53,0.53,0.98,1.73
LGBMClassifier,0.99,0.52,0.52,0.98,0.86
PassiveAggressiveClassifier,0.99,0.5,0.5,0.98,0.15


In [44]:
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report

model = NearestCentroid()
model.fit(X_train, y_train)

print(f"Training Set Score : {model.score(X_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(X_test, y_test) * 100} %")

print(f"Model Classification Report : \n{classification_report(y_test, model.predict(X_test))}")


Training Set Score : 59.87302977232925 %
Test Set Score : 60.864879428802375 %
Model Classification Report : 
              precision    recall  f1-score   support

         0.0       0.99      0.61      0.75     14641
         1.0       0.02      0.70      0.05       205

    accuracy                           0.61     14846
   macro avg       0.51      0.65      0.40     14846
weighted avg       0.98      0.61      0.74     14846



In [45]:
test_id = test_new["ID"]
test_new = test_new.drop(columns=['ID'])
test_new.head()

Unnamed: 0,item,cash,make,model,gc,nbr,Nb_of_items
74230,-0.21,0.92,-0.65,-0.91,-0.33,-0.33,0.85
74231,-0.98,-0.29,0.27,0.17,-1.84,-1.84,0.16
74232,0.31,-1.36,0.47,-0.01,0.48,0.48,0.16
74233,0.33,-0.66,0.2,0.26,0.66,0.66,-0.52
74234,-1.48,-1.73,-1.0,-1.36,-0.44,-0.44,0.85


In [46]:
# %60 sonuç aldığım model için olasılıkların çıktısı:
y_pred = model.predict(test_new)
from sklearn.metrics.pairwise import euclidean_distances

# Merkez uzaklığına bağlı olasılık çıkartma:
class_centers = model.centroids_
distances = euclidean_distances(test_new, class_centers)
max_distance = np.max(distances)
probabilities = (max_distance - distances) / max_distance

predicted_classes = probabilities.argmax(axis=1)

max_probabilities = probabilities.max(axis=1)

predictions_df = pd.DataFrame({'Max_Probability': max_probabilities})
predictions_df.head()

Unnamed: 0,Max_Probability
0,0.94
1,0.9
2,0.95
3,0.96
4,0.9


In [47]:
test_id = pd.DataFrame(test_id,columns=["ID"])
test_id = test_id.reset_index(drop=True)

sub_df = pd.concat([test_id,predictions_df],axis=1,ignore_index=True)
sub_df = sub_df.reset_index(drop=True)
sub_df.columns = ["ID","fraud_flag"]
sub_df

Unnamed: 0,ID,fraud_flag
0,38100,0.94
1,13409,0.90
2,56447,0.95
3,70271,0.96
4,11531,0.90
...,...,...
18553,16785,0.93
18554,11514,0.78
18555,3012,0.98
18556,96385,0.89


In [48]:
# sub_df.to_csv("submission.csv",index=False)