In [16]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer

In [18]:
import warnings
import sklearn

# Silence library warnings so cell outputs stay readable.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# `sklearn.config_context(...)` only builds a context manager; unless it is
# entered with `with`, no setting changes. `set_config` applies the option
# globally, so transformers return pandas objects for the rest of the notebook.
sklearn.set_config(transform_output="pandas")

<contextlib._GeneratorContextManager at 0x2226029e5c0>

In [19]:
# Load the three competition files from the working directory.
# low_memory=False disables chunked parsing, which can otherwise yield
# mixed-dtype columns.
train, test, submission = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [20]:
# Integer code for each label string: 'UA' becomes -1, the HU* labels 0..6.
encoding_map = {
    'UA': -1,
    'HU06': 0,
    'HU07': 1,
    'HU11': 2,
    'HU12': 3,
    'HU14': 4,
    'HU15': 5,
    'HU19': 6,
}

# A label missing from the map becomes NaN and makes astype(int) raise.
label_codes = train["LABEL"].map(encoding_map)
train["LABEL"] = label_codes.astype(int)

In [21]:
# Stack train on top of test (row-wise) under a fresh 0..n-1 index so that
# both parts go through the same preprocessing.
all_data = pd.concat((train, test), ignore_index=True)

Türkiye'deki şehirleri ve bunların ait olduğu bölgeleri içeren bir sözlük oluşturduk.

In [22]:
# Province code -> geographical region of Türkiye; 0 stands for "abroad".
# NOTE(review): assumes IL holds the standard province plate code (1-81) --
# confirm against the data dictionary. The province names in the comments
# follow that assumption.
city_to_region = {
    0: "Yurtdisi",
    1: "Akdeniz",             # Adana
    2: "Güneydoğu Anadolu",   # Adıyaman
    3: "Ege",                 # Afyonkarahisar
    4: "Doğu Anadolu",        # Ağrı
    5: "Karadeniz",           # Amasya
    6: "İç Anadolu",          # Ankara
    7: "Akdeniz",             # Antalya
    8: "Karadeniz",           # Artvin
    9: "Ege",                 # Aydın
    10: "Marmara",            # Balıkesir
    11: "Marmara",            # Bilecik
    12: "Doğu Anadolu",       # Bingöl
    13: "Doğu Anadolu",       # Bitlis
    14: "Karadeniz",          # Bolu
    15: "Akdeniz",            # Burdur
    16: "Marmara",            # Bursa
    17: "Marmara",            # Çanakkale
    18: "İç Anadolu",         # Çankırı
    19: "Karadeniz",          # Çorum
    20: "Ege",                # Denizli
    21: "Güneydoğu Anadolu",  # Diyarbakır
    22: "Marmara",            # Edirne
    23: "Doğu Anadolu",       # Elazığ
    24: "Doğu Anadolu",       # Erzincan
    25: "Doğu Anadolu",       # Erzurum
    26: "İç Anadolu",         # Eskişehir
    27: "Güneydoğu Anadolu",  # Gaziantep
    28: "Karadeniz",          # Giresun
    29: "Karadeniz",          # Gümüşhane
    30: "Doğu Anadolu",       # Hakkari
    31: "Akdeniz",            # Hatay
    32: "Akdeniz",            # Isparta
    33: "Akdeniz",            # Mersin
    34: "Marmara",            # İstanbul
    35: "Ege",                # İzmir
    36: "Doğu Anadolu",       # Kars
    37: "Karadeniz",          # Kastamonu
    38: "İç Anadolu",         # Kayseri
    39: "Marmara",            # Kırklareli
    40: "İç Anadolu",         # Kırşehir
    41: "Marmara",            # Kocaeli
    42: "İç Anadolu",         # Konya
    43: "Ege",                # Kütahya
    44: "Doğu Anadolu",       # Malatya
    45: "Ege",                # Manisa
    46: "Akdeniz",            # Kahramanmaraş
    47: "Güneydoğu Anadolu",  # Mardin
    48: "Ege",                # Muğla
    49: "Doğu Anadolu",       # Muş
    50: "İç Anadolu",         # Nevşehir
    51: "İç Anadolu",         # Niğde
    52: "Karadeniz",          # Ordu
    53: "Karadeniz",          # Rize
    54: "Marmara",            # Sakarya
    55: "Karadeniz",          # Samsun
    56: "Güneydoğu Anadolu",  # Siirt
    57: "Karadeniz",          # Sinop
    58: "İç Anadolu",         # Sivas
    59: "Marmara",            # Tekirdağ
    60: "Karadeniz",          # Tokat
    61: "Karadeniz",          # Trabzon
    62: "Doğu Anadolu",       # Tunceli
    63: "Güneydoğu Anadolu",  # Şanlıurfa
    64: "Ege",                # Uşak
    65: "Doğu Anadolu",       # Van
    66: "İç Anadolu",         # Yozgat
    67: "Karadeniz",          # Zonguldak
    68: "İç Anadolu",         # Aksaray
    69: "Karadeniz",          # Bayburt
    70: "İç Anadolu",         # Karaman
    71: "İç Anadolu",         # Kırıkkale
    72: "Güneydoğu Anadolu",  # Batman
    73: "Güneydoğu Anadolu",  # Şırnak
    74: "Karadeniz",          # Bartın
    75: "Doğu Anadolu",       # Ardahan
    76: "Doğu Anadolu",       # Iğdır
    77: "Marmara",            # Yalova
    78: "Karadeniz",          # Karabük
    79: "Güneydoğu Anadolu",  # Kilis
    80: "Akdeniz",            # Osmaniye
    81: "Karadeniz",          # Düzce
}

# Missing province codes default to 34, then IL is stored as a categorical.
il_filled = all_data['IL'].fillna(34)
all_data['IL'] = il_filled.astype("category")

# Derive the region feature by looking each province code up in city_to_region.
all_data['Regions'] = all_data['IL'].map(city_to_region)

In [23]:
# Month name as it appears in FLAG -> month number 1..12.
month_mapping = dict(zip(
    (
        'OCAK', 'SUBAT', 'MART', 'NISAN', 'MAYIS', 'HAZIRAN',
        'TEMMUZ', 'AGUSTOS', 'EYLUL', 'EKIM', 'KASIM', 'ARALIK',
    ),
    range(1, 13),
))

# Encode the FLAG column with the mapping above, then order the rows by
# month number and renumber the index.
# NOTE(review): after this sort, train and test rows are interleaved whenever
# they share a FLAG value, so row position no longer tells them apart.
all_data['FLAG'] = all_data['FLAG'].map(month_mapping).astype(int)
all_data = all_data.sort_values(by='FLAG', ascending=True).reset_index(drop=True)

In [24]:
# SORU_GELIR_CVP uses a decimal comma: render every value as text, swap the
# comma for a dot, then parse as float (missing values round-trip via 'nan').
gelir_as_text = all_data["SORU_GELIR_CVP"].astype(str).str.replace(",", ".", regex=False)
all_data["SORU_GELIR_CVP"] = gelir_as_text.astype(float)

In [25]:
cat_cols = ['PP_UYRUK', 'SORU_MEDENI_HAL_CVP', 'SORU_YATIRIM_KARAKTERI_CVP', 'SORU_EGITIM_CVP', 'PP_MUSTERI_SEGMENTI', 'Regions', 'PP_MESLEK']

num_cols = ['PP_YAS', 'SORU_YATIRIM_KARAKTERI_RG', 'SORU_EGITIM_RG', 'SORU_GELIR_CVP', 'SORU_GELIR_RG', 'SORU_COCUK_SAYISI_RG', 
            'SORU_MEDENI_HAL_RG', 'SORU_COCUK_SAYISI_CVP','BES_AYRILMA_TALEP_ADET', 'ODEMEME_TALEP_ADET', 'HAYAT_AYRILMA_TALEP_ADET', 
            'BILGI_TALEP_ADET', 'VADE_TUTAR_0', 'ODEME_TUTAR_0', 'VADE_TUTAR_1', 'ODEME_TUTAR_1', 'VADE_TUTAR_2', 'ODEME_TUTAR_2', 
            'VADE_TUTAR_3', 'ODEME_TUTAR_3', 'VADE_TUTAR_4', 'ODEME_TUTAR_4', 'VADE_TUTAR_5', 'ODEME_TUTAR_5', 'VADE_TUTAR_6', 'ODEME_TUTAR_6', 
            'VADE_TUTAR_7', 'ODEME_TUTAR_7', 'VADE_TUTAR_8', 'ODEME_TUTAR_8', 'VADE_TUTAR_9', 'ODEME_TUTAR_9', 'VADE_TUTAR_10','ODEME_TUTAR_10', 
            'VADE_TUTAR_11', 'ODEME_TUTAR_11', 'SON_AY_KATKI_MIKTARI', 'SON_AY_KATKI_ADET', 'SON_CEYREK_KATKI_MIKTARI','SON_CEYREK_KATKI_ADET', 
            'SON_SENE_KATKI_MIKTARI', 'SON_SENE_KATKI_ADET', 'ANAPARA', 'GETIRI', 'BU01', 'BU02', 'BU03', 'BU04', 'BU05', 'BU06', 'BU07', 
            'BU08', 'BU09', 'BU10', 'BU11', 'BU12', 'BU13', 'BU14', 'BU15', 'BU16', 'BU17', 'BU18', 'BU19', 'BU20', 'BU21','BU22', 'BU23', 
            'BU24', 'HU01', 'HU02', 'HU03', 'HU04', 'HU05', 'HU06', 'HU07', 'HU10', 'HU11', 'HU12', 'HU13', 'HU14', 'HU15', 'HU16', 'HU17',
            'HU18', 'HU19', 'AKTIF_ILK_POLICE_RG']

In [26]:
# Store every categorical feature with object dtype, one column at a time.
for cat_col in cat_cols:
    all_data[cat_col] = all_data[cat_col].astype(object)

In [27]:
# Constant-fill imputers: categorical gaps become the literal 'MISSING',
# numeric gaps become 0.
cat_imputer = SimpleImputer(
    strategy='constant',
    fill_value='MISSING',
)
num_imputer = SimpleImputer(
    strategy='constant',
    fill_value=0,
)

In [28]:
# Fill the gaps of each column group with its imputer, then write the results
# back. The two groups share no columns, so the steps are independent.
num_filled = num_imputer.fit_transform(all_data[num_cols])
cat_filled = cat_imputer.fit_transform(all_data[cat_cols])

all_data[num_cols] = num_filled
all_data[cat_cols] = cat_filled

In [29]:
def rare_encoder(dataframe, cat_cols, rare_perc):
    """Return a copy of ``dataframe`` with infrequent categories replaced by 'Rare'.

    For every column listed in ``cat_cols``, a value whose share of all rows
    (occurrence count divided by the number of rows) is below ``rare_perc``
    is replaced by the string 'Rare'. Columns without such a value are left
    untouched. The input frame itself is never modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns to process.
    cat_cols : iterable of str
        Names of the columns to inspect.
    rare_perc : float
        Share threshold; values strictly below it count as rare.

    Returns
    -------
    pandas.DataFrame
        The modified copy.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)

    for col in cat_cols:
        shares = encoded[col].value_counts() / n_rows
        rare_labels = shares[shares < rare_perc].index
        if len(rare_labels) == 0:
            # Nothing rare in this column: keep it exactly as it is.
            continue
        is_rare = encoded[col].isin(rare_labels)
        encoded[col] = np.where(is_rare, 'Rare', encoded[col])

    return encoded

# Collapse categories that cover less than 0.5% of the rows into 'Rare'.
all_data = rare_encoder(dataframe=all_data, cat_cols=cat_cols, rare_perc=0.005)

In [30]:
# PP_MESLEK is target-encoded in a later cell, so it is excluded from one-hot
# encoding. The membership check keeps this cell re-runnable: an unguarded
# list.remove() raises ValueError on the second execution.
if 'PP_MESLEK' in cat_cols:
    cat_cols.remove('PP_MESLEK')

# Replace the remaining categorical columns with their dummy indicators.
one_hot_features = pd.get_dummies(all_data[cat_cols])
all_data_final = pd.concat([all_data.drop(columns=cat_cols), one_hot_features], axis=1)

`Meslek` özelliğine target encoding uyguladık.

In [31]:
from category_encoders import TargetEncoder

# Target vector for the encoder.
# NOTE(review): rows that came from test.csv presumably have no LABEL (NaN
# after the concat) -- confirm.
y = all_data_final['LABEL']
has_label = y.notna()

# NOTE(review): LABEL is a multi-class code (-1..6), so the encoder replaces
# each profession with the mean of those codes -- confirm this is intended.
encoder = TargetEncoder()

# Fit on labelled rows only, so the learned statistics do not depend on how the
# encoder treats NaN targets; then encode PP_MESLEK for every row.
encoder.fit(all_data_final.loc[has_label, ['PP_MESLEK']], y[has_label])
all_data_final['PP_MESLEK'] = encoder.transform(all_data_final[['PP_MESLEK']])['PP_MESLEK']

#### PCA

In [32]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Collect every column whose name starts with 'HU' and every column whose name
# starts with 'BU', side by side.
hu_features = all_data_final.filter(regex='^HU')
bu_features = all_data_final.filter(regex='^BU')
combined_binary_features = pd.concat([hu_features, bu_features], axis=1)

# Standardise the columns before PCA so each one contributes on the same scale.
scaler = StandardScaler()
combined_binary_features_scaled = scaler.fit_transform(combined_binary_features)

# Keep exactly two principal components (a fixed count, not a variance target).
# random_state makes the projection reproducible when scikit-learn picks a
# randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
combined_binary_features_pca = pca.fit_transform(combined_binary_features_scaled)

# Names of the two new component columns.
new_feature_names = ['PC_1', 'PC_2']

# np.asarray keeps the assignment positional whether scikit-learn returned an
# ndarray or a DataFrame (pandas output mode).
all_data_final[new_feature_names] = np.asarray(combined_binary_features_pca)

In [33]:
# The combined frame was re-sorted by FLAG after train and test were
# concatenated, so the first len(train) rows are not guaranteed to be the
# training rows. Split on label presence instead of row position.
# NOTE(review): assumes rows from test.csv carry no LABEL (NaN) -- the assert
# below fails loudly if that assumption does not hold.
is_train_row = all_data_final['LABEL'].notna()
assert is_train_row.sum() == train.shape[0], "labelled row count does not match the size of train"

train_preped = all_data_final.loc[is_train_row]
test_preped = all_data_final.loc[~is_train_row]

In [34]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

# Persist both preprocessed frames without the index column.
train_preped.to_csv(output_dir / 'train_preped.csv', index=False)
test_preped.to_csv(output_dir / 'test_preped.csv', index=False)

In [35]:
# Relative frequency of each label code (value_counts skips NaN by default).
train_preped['LABEL'].value_counts(normalize=True)

LABEL
-1.0    0.976605
 4.0    0.012611
 1.0    0.004372
 0.0    0.003727
 6.0    0.000890
 3.0    0.000793
 2.0    0.000515
 5.0    0.000487
Name: proportion, dtype: float64