# 1. Kütüphanelerin Yüklenmesi

In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

## 2. Veri Setini Yükleme

In [122]:
# Customer Churn verisetini yükleyelim
dataset = load_dataset("d0r1h/customer_churn")
df = dataset["train"].to_pandas()

# Pandas ayarlarını yaparak tüm sütunların görünmesini sağlıyoruz
pd.set_option('display.max_columns', None)

# İlk 10 satıra bakalım
df.head(10)

Unnamed: 0,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,internet_option,last_visit_time,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,18,F,XW0DQ7H,Village,Platinum Membership,17-08-2017,No,xxxxxxxx,Gift Vouchers/Coupons,?,Wi-Fi,16:08:02,17,300.63,53005.25,17,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,0
1,32,F,5K0N3X1,City,Premium Membership,28-08-2017,?,CID21329,Gift Vouchers/Coupons,Desktop,Mobile_Data,12:38:13,16,306.34,12838.38,10,,Yes,No,Yes,Solved,Quality Customer Care,0
2,44,F,1F2TCL3,Town,No Membership,11-11-2016,Yes,CID12313,Gift Vouchers/Coupons,Desktop,Wi-Fi,22:53:21,14,516.16,21027.0,22,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,1
3,37,M,VJGJ33N,City,No Membership,29-10-2016,Yes,CID3793,Gift Vouchers/Coupons,Desktop,Mobile_Data,15:57:50,11,53.27,25239.56,6,567.66,No,Yes,Yes,Unsolved,Poor Website,1
4,31,F,SVZXCWB,City,No Membership,12-09-2017,No,xxxxxxxx,Credit/Debit Card Offers,Smartphone,Mobile_Data,15:46:44,20,113.13,24483.66,16,663.06,No,Yes,Yes,Solved,Poor Website,1
5,13,M,PSG1LGF,City,Gold Membership,08-01-2016,No,xxxxxxxx,Gift Vouchers/Coupons,?,Wi-Fi,6:46:07,23,433.62,13884.77,24,722.27,Yes,No,Yes,Unsolved,No reason specified,0
6,21,M,R3CX1EA,Town,Gold Membership,19-03-2015,Yes,CID24708,Gift Vouchers/Coupons,Desktop,Mobile_Data,11:40:04,10,55.38,8982.5,28,756.21,Yes,No,Yes,Solved in Follow-up,No reason specified,0
7,42,M,4UJ1551,,No Membership,12-07-2016,?,CID56614,Credit/Debit Card Offers,Both,Fiber_Optic,7:52:43,19,429.11,44554.82,24,568.08,No,Yes,Yes,Unsolved,Poor Product Quality,1
8,44,M,0481QNQ,Village,Silver Membership,14-12-2016,No,xxxxxxxx,Without Offers,Smartphone,Fiber_Optic,6:50:10,15,191.07,18362.31,20,,Yes,No,Yes,Solved in Follow-up,Poor Customer Service,0
9,45,F,ZHP4MCR,Town,No Membership,30-11-2016,No,xxxxxxxx,Gift Vouchers/Coupons,?,Wi-Fi,19:10:16,10,97.31,19244.16,28,706.23,No,Yes,Yes,No Information Available,Poor Customer Service,1


In [123]:
# Veri hakkında bilgi alalım
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           36992 non-null  int64  
 1   gender                        36992 non-null  object 
 2   security_no                   36992 non-null  object 
 3   region_category               31564 non-null  object 
 4   membership_category           36992 non-null  object 
 5   joining_date                  36992 non-null  object 
 6   joined_through_referral       36992 non-null  object 
 7   referral_id                   36992 non-null  object 
 8   preferred_offer_types         36704 non-null  object 
 9   medium_of_operation           36992 non-null  object 
 10  internet_option               36992 non-null  object 
 11  last_visit_time               36992 non-null  object 
 12  days_since_last_login         36992 non-null  int64  
 13  a

## 3. Geçersiz Değerlerin Kontrolü ve Düzeltilmesi

In [124]:
# Analiz için kullanılmayacak sütunları düşürüyoruz
df.drop(['security_no', 'referral_id', 'last_visit_time'], axis=1, inplace=True)

In [125]:
# Veri setindeki 'null' veya '?' gibi geçersiz değerleri 'NaN' ile değiştireceğiz
invalid_values_count = df.isin(['null', '?']).sum().sum()
print(f"Toplam geçersiz değer sayısı: {invalid_values_count}")

Toplam geçersiz değer sayısı: 10831


In [126]:
# 'null' ve '?' değerlerini NaN ile değiştirelim
df.replace({'null': np.nan, '?': np.nan}, inplace=True)

## 4. Eksik Verilerin Kontrolü ve Doldurulması

In [127]:
# Eksik değerlerin hangi sütunlarda olduğunu kontrol edelim
df.isnull().sum()

age                                0
gender                             0
region_category                 5428
membership_category                0
joining_date                       0
joined_through_referral         5438
preferred_offer_types            288
medium_of_operation             5393
internet_option                    0
days_since_last_login              0
avg_time_spent                     0
avg_transaction_value              0
avg_frequency_login_days           0
points_in_wallet                3443
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
churn_risk_score                   0
dtype: int64

In [128]:
# Eksik verileri, ilgili sütunun mod veya medyanı ile dolduruyoruz
df['region_category'] = df['region_category'].fillna(df['region_category'].mode()[0])
df['joined_through_referral'] = df['joined_through_referral'].fillna(df['joined_through_referral'].mode()[0])
df['preferred_offer_types'] = df['preferred_offer_types'].fillna(df['preferred_offer_types'].mode()[0])
df['medium_of_operation'] = df['medium_of_operation'].fillna(df['medium_of_operation'].mode()[0])
df['points_in_wallet'] = df['points_in_wallet'].fillna(df['points_in_wallet'].median())

## 5.. Tarih Formatı ve Kodlama

In [129]:
# Tarih sütununu doğru formatta datetime türüne çeviriyoruz
df['joining_date'] = pd.to_datetime(df['joining_date'], format='%d-%m-%Y')

In [130]:
# 'joined_through_referral' sütununu binary olarak kodluyoruz
df['joined_through_referral'] = df['joined_through_referral'].map({'Yes': 1, 'No': 0})

In [131]:
# 'avg_frequency_login_days' sütununu sayısal değerlere dönüştürüyoruz
df['avg_frequency_login_days'] = pd.to_numeric(df['avg_frequency_login_days'], errors='coerce')

# Eksik olan 'avg_frequency_login_days' verisini medyan ile dolduruyoruz
df['avg_frequency_login_days'] = df['avg_frequency_login_days'].fillna(df['avg_frequency_login_days'].median())

# Veri tipi değişikliklerinin sonucunu kontrol ediyoruz

In [132]:
df.isnull().sum()

age                             0
gender                          0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

## 6. Kategorik Verilerin İncelenmesi

In [133]:
# Kategorik sütunların benzersiz değerlerini görmek için fonksiyon yazıyoruz
def print_unique_col_values(df):
    for column in df:
        if df[column].dtype == 'object':  # Sadece kategorik sütunlar
            print(f'{column}:')
            unique_values = df[column].unique()  # Benzersiz değerler
            for i, value in enumerate(unique_values, start=1):
                print(f'{i}. {value}')
            print()  # Sütunlar arasına boşluk eklemek için

print_unique_col_values(df)

gender:
1. F
2. M
3. Unknown

region_category:
1. Village
2. City
3. Town

membership_category:
1. Platinum Membership
2. Premium Membership
3. No Membership
4. Gold Membership
5. Silver Membership
6. Basic Membership

preferred_offer_types:
1. Gift Vouchers/Coupons
2. Credit/Debit Card Offers
3. Without Offers

medium_of_operation:
1. Desktop
2. Smartphone
3. Both

internet_option:
1. Wi-Fi
2. Mobile_Data
3. Fiber_Optic

used_special_discount:
1. Yes
2. No

offer_application_preference:
1. Yes
2. No

past_complaint:
1. No
2. Yes

complaint_status:
1. Not Applicable
2. Solved
3. Solved in Follow-up
4. Unsolved
5. No Information Available

feedback:
1. Products always in Stock
2. Quality Customer Care
3. Poor Website
4. No reason specified
5. Poor Product Quality
6. Poor Customer Service
7. Too many ads
8. User Friendly Website
9. Reasonable Price



## 7. Kategorik Verilerin Sayısal Verilere Dönüştürülmesi

In [134]:
# Kategorik verileri sayısal verilere dönüştürüyoruz
df['gender'] = df['gender'].map({'F': 0, 'M': 1, 'Unknown': 2})

df['region_category'] = df['region_category'].map({'Town': 0, 'City': 1, 'Village': 2})

df['medium_of_operation'] = df['medium_of_operation'].map({'Desktop': 0, 'Smartphone': 1, 'Both': 2})

df['internet_option'] = df['internet_option'].map({'Wi-Fi': 0, 'Mobile_Data': 1, 'Fiber_Optic': 2})

df['used_special_discount'] = df['used_special_discount'].map({'No': 0, 'Yes': 1})

df['offer_application_preference'] = df['offer_application_preference'].map({'No': 0, 'Yes': 1})

df['past_complaint'] = df['past_complaint'].map({'No': 0, 'Yes': 1})

In [135]:
df['membership_category'] = df['membership_category'].map({
    'No Membership': 0, 
    'Gold Membership': 1, 
    'Basic Membership': 2, 
    'Silver Membership': 3, 
    'Premium Membership': 4, 
    'Platinum Membership': 5
})

In [136]:
df['preferred_offer_types'] = df['preferred_offer_types'].map({
    'Gift Vouchers/Coupons': 0, 
    'Credit/Debit Card Offers': 1, 
    'Without Offers': 2
})

In [137]:
df['complaint_status'] = df['complaint_status'].map({
    'Solved in Follow-up': 0, 
    'Unsolved': 1, 
    'Solved': 2, 
    'Not Applicable': 3, 
    'No Information Available': 4
})

In [138]:
# Olumlu ve olumsuz geri bildirimleri gruplandırıyoruz
positive_feedback = ['Quality Customer Care', 'User Friendly Website', 'Reasonable Price']
negative_feedback = ['Poor Website', 'Poor Product Quality', 'Too many ads', 'Poor Customer Service']

# 'feedback' sütununu sayısal olarak dönüştürme 
df['feedback'] = df['feedback'].apply(
    lambda x: 1 if x in positive_feedback else (0 if x in negative_feedback else -1)
)

In [139]:
df.head(10)

Unnamed: 0,age,gender,region_category,membership_category,joining_date,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,18,0,2,5,2017-08-17,0,0,0,0,17,300.63,53005.25,17.0,781.75,1,1,0,3,-1,0
1,32,0,1,4,2017-08-28,0,0,0,1,16,306.34,12838.38,10.0,697.62,1,0,1,2,1,0
2,44,0,0,0,2016-11-11,1,0,0,0,14,516.16,21027.0,22.0,500.69,0,1,1,0,0,1
3,37,1,1,0,2016-10-29,1,0,0,1,11,53.27,25239.56,6.0,567.66,0,1,1,1,0,1
4,31,0,1,0,2017-09-12,0,1,1,1,20,113.13,24483.66,16.0,663.06,0,1,1,2,0,1
5,13,1,1,1,2016-01-08,0,0,0,0,23,433.62,13884.77,24.0,722.27,1,0,1,1,-1,0
6,21,1,0,1,2015-03-19,1,0,0,1,10,55.38,8982.5,28.0,756.21,1,0,1,0,-1,0
7,42,1,0,0,2016-07-12,0,1,2,2,19,429.11,44554.82,24.0,568.08,0,1,1,1,0,1
8,44,1,2,3,2016-12-14,0,2,1,2,15,191.07,18362.31,20.0,697.62,1,0,1,0,0,0
9,45,0,0,0,2016-11-30,0,0,0,0,10,97.31,19244.16,28.0,706.23,0,1,1,4,0,1


In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   age                           36992 non-null  int64         
 1   gender                        36992 non-null  int64         
 2   region_category               36992 non-null  int64         
 3   membership_category           36992 non-null  int64         
 4   joining_date                  36992 non-null  datetime64[ns]
 5   joined_through_referral       36992 non-null  int64         
 6   preferred_offer_types         36992 non-null  int64         
 7   medium_of_operation           36992 non-null  int64         
 8   internet_option               36992 non-null  int64         
 9   days_since_last_login         36992 non-null  int64         
 10  avg_time_spent                36992 non-null  float64       
 11  avg_transaction_value       

In [141]:
# Veriyi belirtilen dosya yoluna kaydetme
df.to_csv(r'C:\Users\90535\Desktop\Ali Murat\ÇAŞLIŞMA\processed_data.csv', index=False)