In [11]:
import itertools

import openpyxl
import pandas as pd


def read_excel(filename, nrows):
    """Read the first rows of the first worksheet of an excel workbook.

    The workbook is opened in openpyxl's read-only mode, so rows are streamed
    on demand and no more rows than requested are loaded into memory. This
    makes the function well suited for very large excel files.

    Parameters
    ----------
    filename : str or file-like object
        Path to excel file.
    nrows : int
        Number of worksheet rows to parse, starting at the top. The header
        row counts towards this limit, so at most ``nrows - 1`` data rows
        are returned. Values below 1 yield a frame with no data rows.

    Returns
    -------
    pd.DataFrame
        Column labels are constructed from the first row of the excel
        worksheet. An entirely empty worksheet yields an empty DataFrame.

    """
    # Parameter `read_only=True` leads to excel rows only being loaded as-needed
    book = openpyxl.load_workbook(filename=filename, read_only=True, data_only=True)
    try:
        rows_generator = book.worksheets[0].values

        header_row = next(rows_generator, None)
        if header_row is None:
            # The sheet has no rows at all, so there is nothing to label columns with.
            return pd.DataFrame()

        # The header already consumed one of the `nrows` rows; clamp at zero
        # because islice rejects negative stop values.
        data_rows = list(itertools.islice(rows_generator, max(nrows - 1, 0)))
    finally:
        # Read-only workbooks keep the source file open until closed explicitly.
        book.close()

    return pd.DataFrame(data_rows, columns=header_row)


# Load only the top 2000 worksheet rows (header included) to keep the notebook fast,
# then preview the first few records.
data = read_excel('turknetchurnekimanonim.xlsx', nrows=2000)
data.head()

Unnamed: 0,NaN,ILTELKODU,ILADI,POSILCE,ILCE,CINSIYET,DURUM,ABONEBAS,CLOSE_DATE,KALDIGI_AY_SAYISI,...,DENVERGIRISTARIHI_3,DENVERCIKISTARIHI_3,DENVERKALDIGISURE_GUN_3,IKNATICKET_3,PORTERROR_SAYISI_3,MAX_SESSIONTIME_3,MIN_SESSIONTIME_3,TOTALUPLOADGB_3,TOTALDOWNLOADGB_3,CHURNTEST_DATE
0,0,212,İstanbul (Avrupa),939,SULTANGAZİ,E,A,2016-04-20,NaT,54,...,,,0,0,10,87108,0,16.5013,272.1967,2020-10-31
1,1,212,İstanbul (Avrupa),420,SARIYER,K,A,2016-04-20,NaT,54,...,,,0,0,1,90271,0,135.8656,185.5825,2020-10-31
2,2,262,Kocaeli,998,İZMİT,E,A,2016-04-21,NaT,54,...,,,0,0,3,90112,0,1.8255,46.6458,2020-10-31
3,3,212,İstanbul (Avrupa),418,KÜÇÜKÇEKMECE,E,A,2016-04-21,NaT,54,...,,,0,0,15,87588,1664,2.0288,42.0181,2020-10-31
4,4,212,İstanbul (Avrupa),425,BAYRAMPAŞA,E,A,2016-04-21,2020-10-02,54,...,,,0,0,0,0,0,0.0,0.0,2020-10-31


In [12]:
# Number of columns in the loaded frame.
len(data.columns)

125

In [13]:
# Frequency of each distinct value in the DURUM column (the churn label used below).
data['DURUM'].value_counts()

A    1935
K      64
Name: DURUM, dtype: int64

In [14]:
# Assign outcome as 0 if DURUM=A and 1 otherwise (K in this sample).
# An already-encoded 0 is also treated as "A", so re-running this cell leaves
# the labels unchanged instead of turning every row into 1.
data['DURUM'] = (~data['DURUM'].isin(['A', 0])).astype('int64')
y = data['DURUM']
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
x = data.drop(columns='DURUM')

In [15]:
# Preview the first entries of the target series.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: DURUM, dtype: int64

In [16]:
# Preview the feature frame (all columns except DURUM).
x.head()

Unnamed: 0,NaN,ILTELKODU,ILADI,POSILCE,ILCE,CINSIYET,ABONEBAS,CLOSE_DATE,KALDIGI_AY_SAYISI,ODEMESEKLI,...,DENVERGIRISTARIHI_3,DENVERCIKISTARIHI_3,DENVERKALDIGISURE_GUN_3,IKNATICKET_3,PORTERROR_SAYISI_3,MAX_SESSIONTIME_3,MIN_SESSIONTIME_3,TOTALUPLOADGB_3,TOTALDOWNLOADGB_3,CHURNTEST_DATE
0,0,212,İstanbul (Avrupa),939,SULTANGAZİ,E,2016-04-20,NaT,54,K,...,,,0,0,10,87108,0,16.5013,272.1967,2020-10-31
1,1,212,İstanbul (Avrupa),420,SARIYER,K,2016-04-20,NaT,54,K,...,,,0,0,1,90271,0,135.8656,185.5825,2020-10-31
2,2,262,Kocaeli,998,İZMİT,E,2016-04-21,NaT,54,K,...,,,0,0,3,90112,0,1.8255,46.6458,2020-10-31
3,3,212,İstanbul (Avrupa),418,KÜÇÜKÇEKMECE,E,2016-04-21,NaT,54,B,...,,,0,0,15,87588,1664,2.0288,42.0181,2020-10-31
4,4,212,İstanbul (Avrupa),425,BAYRAMPAŞA,E,2016-04-21,2020-10-02,54,K,...,,,0,0,0,0,0,0.0,0.0,2020-10-31


In [17]:
# Report the cardinality of every object-typed column to help decide which
# categorical variables are worth using in the model.
message = "Feature '{col_name}' has {unique_cat} unique categories"
for label, column in x.items():
    if column.dtype != 'object':
        continue
    # dropna=False counts a missing value as its own category, like len(unique()).
    print(message.format(col_name=label, unique_cat=column.nunique(dropna=False)))

Feature 'ILTELKODU' has 79 unique categories
Feature 'ILADI' has 79 unique categories
Feature 'ILCE' has 324 unique categories
Feature 'CINSIYET' has 2 unique categories
Feature 'ODEMESEKLI' has 2 unique categories
Feature 'ACIKLAMA' has 11 unique categories
Feature 'FATURAGONDERIMTIPI' has 3 unique categories
Feature 'RISKLIMUSTERI' has 2 unique categories
Feature 'YAPA_VAE' has 2 unique categories
Feature 'DENVERGIRISTARIHI_1' has 1 unique categories
Feature 'DENVERCIKISTARIHI_1' has 1 unique categories
Feature 'DENVERGIRISTARIHI_2' has 1 unique categories
Feature 'DENVERCIKISTARIHI_2' has 1 unique categories
Feature 'DENVERGIRISTARIHI_3' has 1 unique categories
Feature 'DENVERCIKISTARIHI_3' has 1 unique categories
Feature 'CHURNTEST_DATE' has 1 unique categories


In [18]:
# There are hundreds of distinct districts (ILCE), so one-hot encoding them is
# not sensible; this column is not used.
# ILTELKODU and ILADI describe the same information; the phone-code column is
# removed in the next cell.

# Optional inspection of the district distribution:
# x['ILCE'].value_counts().sort_values(ascending=False).head(60)

# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
x1 = x.drop(columns='ILCE')


In [19]:
# ILTELKODU and ILADI describe the same information, so the phone-code column is removed.
# Then look at the distribution of provinces: Istanbul, Ankara, Izmir and Bursa
# dominate, and the remaining provinces are to be renamed to DIGER.
# NOTE(review): the renaming cell below does not keep Bursa as its own category — confirm intent.
# `errors='ignore'` keeps this self-referential cell re-runnable once the column is gone.
x1 = x1.drop(columns='ILTELKODU', errors='ignore')
x1['ILADI'].value_counts().sort_values(ascending=False).head(60)

İstanbul (Avrupa)     594
İstanbul (Anadolu)    302
Ankara                139
İzmir                 101
Bursa                  86
Antalya                54
Kocaeli                54
Muğla                  43
Konya                  42
Adana                  34
Balıkesir              30
Sakarya                30
Tekirdağ               23
Çanakkale              18
Denizli                18
Manisa                 17
Mersin                 16
Hatay                  14
Aydın                  14
Eskişehir              14
Samsun                 14
Kırklareli             13
Afyon                  13
Ordu                   13
Kahramanmaraş          13
Çorum                  12
Kayseri                12
Uşak                   10
Karabük                10
Zonguldak              10
Aksaray                 9
Trabzon                 9
Rize                    9
Giresun                 8
Edirne                  8
Isparta                 8
Sivas                   8
Tokat                   8
Nevşehir    

In [25]:
# Keep the listed provinces as their own categories and relabel every other
# value (including missing ones) as 'DIGER'.
x1['ILADI'] = x1['ILADI'].where(
    x1['ILADI'].isin(['İstanbul (Avrupa)', 'İstanbul (Anadolu)', 'Ankara', 'İzmir']),
    'DIGER',
)
x1['ILADI'].value_counts().sort_values(ascending=False).head(5)

DIGER                 863
İstanbul (Avrupa)     594
İstanbul (Anadolu)    302
Ankara                139
İzmir                 101
Name: ILADI, dtype: int64

In [26]:
# 'DENVERGIRISTARIHI_1' has 1 unique categories
# 'DENVERCIKISTARIHI_1' has 1 unique categories
# 'DENVERGIRISTARIHI_2' has 1 unique categories
# 'DENVERCIKISTARIHI_2' has 1 unique categories
# 'DENVERGIRISTARIHI_3' has 1 unique categories
# 'DENVERCIKISTARIHI_3' has 1 unique categories
# 'CHURNTEST_DATE' has 1 unique categories
# The features above contain only a single distinct value, so they can be removed.
# One keyword-based drop replaces seven positional-axis calls (the positional
# axis argument is rejected by pandas 2.0); `errors='ignore'` keeps the cell
# re-runnable after the columns have already been removed.
x1 = x1.drop(
    columns=[
        'DENVERGIRISTARIHI_1',
        'DENVERCIKISTARIHI_1',
        'DENVERGIRISTARIHI_2',
        'DENVERCIKISTARIHI_2',
        'DENVERGIRISTARIHI_3',
        'DENVERCIKISTARIHI_3',
        'CHURNTEST_DATE',
    ],
    errors='ignore',
)
x1.head()

Unnamed: 0,NaN,ILADI,POSILCE,CINSIYET,ABONEBAS,CLOSE_DATE,KALDIGI_AY_SAYISI,ODEMESEKLI,ACIKLAMA,FATURAGONDERIMTIPI,...,INTERNETEBAGLANAMIYORUM_TICKETKAPANMASURESI_3,INTERNETEBAGLANAMIYORUM_TICKETSL_3,BTK_SIKAYET_3,DENVERKALDIGISURE_GUN_3,IKNATICKET_3,PORTERROR_SAYISI_3,MAX_SESSIONTIME_3,MIN_SESSIONTIME_3,TOTALUPLOADGB_3,TOTALDOWNLOADGB_3
0,0,İstanbul (Avrupa),939,E,2016-04-20,NaT,54,K,(Bireysel)(Tam İletişim)(Yapa) 100 Mbps'e Kadar,E-ARŞİV,...,0,0,0,0,0,10,87108,0,16.5013,272.1967
1,1,İstanbul (Avrupa),420,K,2016-04-20,NaT,54,K,(Bireysel)(TN Fiber) 100 Mbps,E-ARŞİV,...,0,0,0,0,0,1,90271,0,135.8656,185.5825
2,2,DIGER,998,E,2016-04-21,NaT,54,K,(Bireysel)(Tam İletişim) 16 Mbpse Kadar,E-ARŞİV,...,0,0,0,0,0,3,90112,0,1.8255,46.6458
3,3,İstanbul (Avrupa),418,E,2016-04-21,NaT,54,B,(Bireysel)(Tam İletişim) 35 Mbps'e Kadar,E-ARŞİV,...,0,0,0,0,0,15,87588,1664,2.0288,42.0181
4,4,İstanbul (Avrupa),425,E,2016-04-21,2020-10-02,54,K,(Bireysel)(Tam İletişim)(Yapa) 100 Mbps'e Kadar,E-ARŞİV,...,0,0,0,0,0,0,0,0,0.0,0.0
