## 1. Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Read Dataset using `pd.read_csv`

In [5]:
df = pd.read_csv("rond10000.csv")

## 3. Review the Data

### 3.1 Display first 5 rows of Dataset

In [7]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 253 36 48,20500000,کارکرده,تهران,18 ثانیه
1,0912 639 49 44,16200000,صفر,تهران,18 ثانیه
2,0912 6 719 709,12700000,صفر,تهران,18 ثانیه
3,0912 359 44 96,16300000,صفر,تهران,18 ثانیه
4,0912 216 0 397,27700000,صفر,تهران,18 ثانیه


### 3.2 Check the number of samples and features

In [8]:
df.shape

(285120, 5)

### 3.3 Check Data types and memory usage

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285120 entries, 0 to 285119
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   phone_number  285120 non-null  object
 1   price         285120 non-null  object
 2   status        285120 non-null  object
 3   city          285120 non-null  object
 4   time          284697 non-null  object
dtypes: object(5)
memory usage: 10.9+ MB


### 3.4 Drop rows with Null values

In [10]:
df.dropna(inplace=True)

In [11]:
df.shape

(284697, 5)

### 3.5 Drop duplicated rows

In [12]:
df.duplicated().sum()

30727

In [13]:
df.drop_duplicates(inplace=True)

In [14]:
df.shape

(253970, 5)

## 4. Preprocess the Dataset

### 4.1 `price` column

In [15]:
def process_price(price):
    """Convert a raw ``price`` cell into an integer amount.

    Thousands separators (``,``) and surrounding whitespace are removed
    before the value is parsed.

    Parameters
    ----------
    price : str or int
        Raw cell value, e.g. ``'20,500,000'``.

    Returns
    -------
    int or float
        The parsed amount, or ``np.nan`` when the cell is not numeric text
        (the caller drops those rows afterwards).
    """
    # Already-numeric cells are passed through instead of raising AttributeError.
    if isinstance(price, (int, np.integer)) and not isinstance(price, bool):
        return int(price)
    if not isinstance(price, str):
        return np.nan
    cleaned = price.replace(',', '').strip()
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, which would make int() raise ValueError.
    if cleaned.isdecimal():
        return int(cleaned)
    return np.nan

In [16]:
df.price = df.price.apply(process_price)

In [17]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 253 36 48,20500000.0,کارکرده,تهران,18 ثانیه
1,0912 639 49 44,16200000.0,صفر,تهران,18 ثانیه
2,0912 6 719 709,12700000.0,صفر,تهران,18 ثانیه
3,0912 359 44 96,16300000.0,صفر,تهران,18 ثانیه
4,0912 216 0 397,27700000.0,صفر,تهران,18 ثانیه


In [18]:
df.isnull().sum()

phone_number        0
price           30571
status              0
city                0
time                0
dtype: int64

In [19]:
df.dropna(inplace=True)

In [20]:
df.shape

(223399, 5)

### 4.2 `status` column

In [21]:
status_values = df.status.unique()

In [22]:
status_values

array(['کارکرده', 'صفر', 'در حد صفر'], dtype=object)

In [23]:
status_values[1] , status_values[2] = status_values[2] , status_values[1]

In [24]:
# Ordinal encoding of the listing status, from most used (0) to brand new (2).
# The labels are spelled out explicitly so the codes do not depend on the order
# in which the statuses happen to first appear in the CSV.
STATUS_ORDER = ['کارکرده', 'در حد صفر', 'صفر']
status_map = {status: index for index, status in enumerate(STATUS_ORDER)}

In [25]:
status_map

{'کارکرده': 0, 'در حد صفر': 1, 'صفر': 2}

In [26]:
df.status = df.status.map(status_map)

In [27]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 253 36 48,20500000.0,0,تهران,18 ثانیه
1,0912 639 49 44,16200000.0,2,تهران,18 ثانیه
2,0912 6 719 709,12700000.0,2,تهران,18 ثانیه
3,0912 359 44 96,16300000.0,2,تهران,18 ثانیه
4,0912 216 0 397,27700000.0,2,تهران,18 ثانیه


### 4.3 `city` column

#### 4.3.1 Check city names and their size

In [28]:
cities = df.city.unique()

In [29]:
len(cities)

114

In [30]:
cities

array(['تهران', 'شمیرانات', 'کرج', 'شيراز', 'گرگان', 'ميانه', 'اردبيل',
       'کرمان', 'اهواز', 'مشهد', 'ساری', 'بندر عباس', 'اراک', 'تبريز',
       'رشت', 'اصفهان', 'گراش', 'بابلسر', 'قزوين', 'آمل', 'يزد',
       'کرمانشاه', 'قم', 'اسلام شهر', 'سمنان', 'کاشان', 'شهرضا',
       'لارستان', 'دشتستان', 'زنجان', 'پيرانشهر', 'همدان', 'بابل',
       'رباط کريم', 'دماوند', 'دزفول', 'نيشابور', 'فردیس', 'بوشهر',
       'اليگودرز', 'شهرکرد', 'جهرم', 'ری', 'پارس آباد', 'خنج', 'ابهر',
       'سلماس', 'کهگيلويه', 'قشم', 'بندر ماهشهر', 'مرودشت', 'کازرون',
       'زاهدان', 'اروميه', 'بهارستان', 'آران و بيدگل', 'ساوه', 'گناباد',
       'آباده', 'پیشوا', 'نائين', 'ايلام', 'نجف آباد', 'چالوس',
       'قائم شهر', 'بانه', 'سر دشت', 'سنندج', 'بجنورد', 'شهريار',
       'خرم آباد', 'آبادان', 'قدس', 'بروجن', 'زرنديه', 'ابرکوه', 'ماکو',
       'خوي', 'گنبد کاووس', 'گچساران', 'ايرانشهر', 'بيرجند', 'آشتيان',
       'سيرجان', 'فریدونکنار', 'زابل', 'رفسنجان', 'بروجرد', 'آذر شهر',
       'آبيک', 'بم', 'خمينی شهر',

#### 4.3.2 Encode city names using `category_encoders.BinaryEncoder` 

In [32]:
import category_encoders as ce

In [33]:
# Replace the single `city` column with binary-coded `city_<i>` indicator columns.
# NOTE(review): the city list printed above appears to mix Arabic and Persian
# letter forms (e.g. both "ي" and "ی" occur). If the same city is spelled both
# ways it would be encoded as two different categories — confirm, and consider
# normalising the text before fitting.
binary_encoder = ce.BinaryEncoder(cols=['city'])
df = binary_encoder.fit_transform(df)

In [36]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,time
0,0912 253 36 48,20500000.0,0,0,0,0,0,0,0,1,18 ثانیه
1,0912 639 49 44,16200000.0,2,0,0,0,0,0,0,1,18 ثانیه
2,0912 6 719 709,12700000.0,2,0,0,0,0,0,0,1,18 ثانیه
3,0912 359 44 96,16300000.0,2,0,0,0,0,0,0,1,18 ثانیه
4,0912 216 0 397,27700000.0,2,0,0,0,0,0,0,1,18 ثانیه


### 4.4 `time` column

In [37]:
times = df.time

In [38]:
times

0          18 ثانیه
1          18 ثانیه
2          18 ثانیه
3          18 ثانیه
4          18 ثانیه
            ...    
285115    1 روز پیش
285116    1 روز پیش
285117    1 روز پیش
285118    1 روز پیش
285119    1 روز پیش
Name: time, Length: 223399, dtype: object

#### 4.4.1 Define `regex` patterns for splitting digits and words

In [39]:
import re
pattern = r'(\d+) ([^\d]+)'

In [40]:
res = [re.findall(pattern , text)[0] for text in times]

In [41]:
res[:5]

[('18', 'ثانیه'),
 ('18', 'ثانیه'),
 ('18', 'ثانیه'),
 ('18', 'ثانیه'),
 ('18', 'ثانیه')]

#### 4.4.2 Extract time units

In [42]:
time_units = set()
for units in res:
    time_units.add(units[1])

In [43]:
time_units = list(time_units)
time_units

['ثانیه', 'ساعت', 'روز پیش', 'دقیقه']

In [49]:
time_units[1]

'ساعت'

#### 4.4.3 Convert all `time_units` to seconds

In [50]:
# Number of seconds represented by one of each unit found in the `time` column.
# The unit strings are written out literally instead of being looked up by
# position in `time_units`: that list is built from a set of strings, whose
# iteration order can change between kernel sessions, so positional indexing
# could silently pair a unit with the wrong multiplier.
time_units_map = {
    # seconds
    'ثانیه': 1,
    # minutes
    'دقیقه': 60,
    # hours
    'ساعت': 60 * 60,
    # "days ago"
    'روز پیش': 60 * 60 * 24,
}

In [51]:
preProcessed_time = [int(unit[0]) * time_units_map[unit[1]] for unit in res]

In [52]:
preProcessed_time[:5]

[18, 18, 18, 18, 18]

In [53]:
df.time = preProcessed_time

In [54]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,time
0,0912 253 36 48,20500000.0,0,0,0,0,0,0,0,1,18
1,0912 639 49 44,16200000.0,2,0,0,0,0,0,0,1,18
2,0912 6 719 709,12700000.0,2,0,0,0,0,0,0,1,18
3,0912 359 44 96,16300000.0,2,0,0,0,0,0,0,1,18
4,0912 216 0 397,27700000.0,2,0,0,0,0,0,0,1,18


### 4.5 `phone_number` column

In [55]:
phone_numbers = df.phone_number

In [56]:
def preprocess_phone(num):
    """Normalise a raw phone number to 10 digits without the leading zero.

    Spaces are removed and one leading ``'0'`` is stripped; the result is
    accepted only if exactly ten ASCII digits remain.

    Parameters
    ----------
    num : str
        Raw value of the ``phone_number`` column, e.g. ``'0912 253 36 48'``.

    Returns
    -------
    str or float
        The 10-digit number as a string, or ``np.nan`` when the value is not a
        string, is empty, has the wrong length, or contains non-digit
        characters.
    """
    if not isinstance(num, str):
        return np.nan
    digits = num.replace(" ", "")
    # startswith() is safe on an empty string, unlike indexing digits[0].
    if digits.startswith('0'):
        digits = digits[1:]
    # Non-digit values must be rejected here: the later numeric conversion of
    # number slices would coerce them to NaN only after the NaN rows have
    # already been dropped.
    if len(digits) == 10 and digits.isascii() and digits.isdigit():
        return digits
    return np.nan

In [57]:
phone_numbers = phone_numbers.apply(preprocess_phone)

In [58]:
df.phone_number = phone_numbers

## 5. Drop rows with `NaN` values

In [59]:
df.isnull().sum()

phone_number    0
price           0
status          0
city_0          0
city_1          0
city_2          0
city_3          0
city_4          0
city_5          0
city_6          0
time            0
dtype: int64

In [60]:
df.dropna(inplace=True)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223399 entries, 0 to 285119
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   phone_number  223399 non-null  object 
 1   price         223399 non-null  float64
 2   status        223399 non-null  int64  
 3   city_0        223399 non-null  int64  
 4   city_1        223399 non-null  int64  
 5   city_2        223399 non-null  int64  
 6   city_3        223399 non-null  int64  
 7   city_4        223399 non-null  int64  
 8   city_5        223399 non-null  int64  
 9   city_6        223399 non-null  int64  
 10  time          223399 non-null  int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 20.5+ MB


## 6. Feature Extraction

In [62]:
# Gather every distinct 3-character prefix of the cleaned phone numbers,
# both as a set (`area_code`) and as a list (`area_code_lst`).
area_code = {number[:3] for number in df.phone_number}
area_code_lst = list(area_code)

In [63]:
len(area_code_lst)

32

In [65]:
def number_features(df, area_codes=None):
    """Add hand-crafted "number pattern" features to ``df`` in place.

    Each value of ``df['phone_number']`` is expected to be a 10-character
    string. Characters 0-2 are treated as the area code and characters 3-9 as
    the 7-character subscriber part inspected by the pattern checks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``phone_number`` column. It is modified in place: thirty
        0/1 indicator columns are appended, followed by ``area_code``
        (first three characters), ``first_three`` and ``last_four`` (numeric
        value of characters 3-5 and 6-9; unparsable text becomes NaN).
    area_codes : iterable of str, optional
        Prefixes searched for inside the subscriber part to set
        ``area_code_repeat``. When omitted, the distinct 3-character prefixes
        of ``df['phone_number']`` are used, so the function does not rely on a
        module-level ``area_code_lst`` having been built by an earlier cell.

    Returns
    -------
    None
    """
    phones = df['phone_number'].tolist()

    if area_codes is None:
        area_codes = {num[:3] for num in phones}
    area_codes = tuple(area_codes)

    # Compiled once instead of on every row.
    double_100_re = re.compile(r'([1-9]00){2}')
    double_10_re = re.compile(r'([1-9]0){2,3}')
    repeated_digit_re = re.compile(r'([0-9])\1{1}')

    def all_same(s):
        # True when every character equals the first one (also for '' and 1 char).
        return len(set(s)) <= 1

    def isolated_triple(n):
        # Three equal characters starting at index 6, 5 or 4, with a different
        # character directly before and directly after the run.
        return any(
            all_same(n[i:i + 3]) and n[i - 1] != n[i] and n[i + 2] != n[i + 3]
            for i in (6, 5, 4)
        )

    # Column name -> predicate on the 10-character number. Insertion order is
    # the order in which the columns are appended to the frame.
    rules = {
        # all 7 subscriber characters identical
        'last_7_sim': lambda n: all_same(n[3:]),
        # exactly the last 6 / first 6 identical
        'last_6_sim': lambda n: all_same(n[4:]) and n[3] != n[4],
        'first_6_sim': lambda n: all_same(n[3:9]) and n[8] != n[9],
        # the same 2-character pair three times in a row
        'three_pair_last': lambda n: n[4:6] == n[6:8] and n[4:6] == n[8:10],
        'three_pair_first': lambda n: n[3:5] == n[5:7] and n[3:5] == n[7:9],
        # subscriber part uses exactly two distinct characters
        'just_two_num': lambda n: len(set(n[3:])) == 2,
        'last_5_sim': lambda n: all_same(n[5:]) and n[4] != n[5],
        'first_5_sim': lambda n: all_same(n[3:8]) and n[7] != n[8],
        # two consecutive "x00" groups
        'double_100': lambda n: (len(double_100_re.findall(n[4:10])) == 1
                                 or len(double_100_re.findall(n[3:9])) == 1),
        # a repeated 3-character group like '532 532'
        'two_triple_pair': lambda n: n[6:9] == n[3:6] or n[4:7] == n[7:10],
        'first_1000': lambda n: ((n[4:6] == '00' and n[3] != '0')
                                 or (n[5:7] == '00' and n[4] != '0')),
        'last_1000': lambda n: (n[7:9] == '00' and n[6] != '0') or n[8:10] == '00',
        'last_4_sim': lambda n: all_same(n[6:]) and n[5] != n[6],
        'first_4_sim': lambda n: all_same(n[3:7]) and n[6] != n[7],
        'middle_5_sim': lambda n: all_same(n[4:9]) and n[3] != n[4] and n[8] != n[9],
        # "x0y0" style groups such as '4090', at the end / at the start
        'last_double_10': lambda n: (len(double_10_re.findall(n[4:])) == 1
                                     and n[-1] == '0'),
        'first_double_10': lambda n: (len(double_10_re.findall(n[3:9])) == 1
                                      and n[4] == '0'),
        # a 2-character pair of different characters repeated twice
        'last_two_pair': lambda n: n[6:8] == n[8:10] and n[6] != n[7],
        'first_two_pair': lambda n: n[3:5] == n[5:7] and n[3] != n[4],
        'last_3_sim': lambda n: all_same(n[7:]) and n[6] != n[7],
        'first_3_sim': lambda n: all_same(n[3:6]) and n[5] != n[6],
        # any known area code occurs inside the subscriber part
        'area_code_repeat': lambda n: any(code in n[3:] for code in area_codes),
        # column name keeps its original spelling so the output schema is unchanged
        'wheighted_num': lambda n: n[3:6] == n[7:10],
        'middle_4_sim': lambda n: all_same(n[4:8]) and n[3] != n[4] and n[7] != n[8],
        # '13' at index 3 or 6, like '1345'
        'birthday_num': lambda n: n[3:5] == '13' or n[6:8] == '13',
        'middle_3_sim': isolated_triple,
        # at least two non-overlapping doubled characters in the subscriber part
        'repeat_2_digits': lambda n: len(repeated_digit_re.findall(n[3:])) >= 2,
        'first_10000': lambda n: ((n[3] != '0' and n[8] != '0')
                                  and (n[4:7] == '000' or n[5:8] == '000')),
        'last_10000': lambda n: (n[6:9] == '000' and n[5] != '0') or n[7:10] == '000',
        'million': lambda n: n[3] != '0' and (n[4:9] == '00000' or n[5:10] == '00000'),
    }
    for column, rule in rules.items():
        df[column] = [1 if rule(num) else 0 for num in phones]

    # Raw number slices: area code as text, the remaining parts as numbers.
    df['area_code'] = df['phone_number'].str[:3]
    df['first_three'] = pd.to_numeric(df['phone_number'].str[3:6], errors='coerce')
    df['last_four'] = pd.to_numeric(df['phone_number'].str[6:], errors='coerce')


In [66]:
number_features(df)

In [67]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,...,middle_4_sim,birthday_num,middle_3_sim,repeat_2_digits,first_10000,last_10000,million,area_code,first_three,last_four
0,9122533648,20500000.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,912,253.0,3648.0
1,9126394944,16200000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,912,639.0,4944.0
2,9126719709,12700000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,912,671.0,9709.0
3,9123594496,16300000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,912,359.0,4496.0
4,9122160397,27700000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,912,216.0,397.0


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223399 entries, 0 to 285119
Data columns (total 44 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   phone_number      223399 non-null  object 
 1   price             223399 non-null  float64
 2   status            223399 non-null  int64  
 3   city_0            223399 non-null  int64  
 4   city_1            223399 non-null  int64  
 5   city_2            223399 non-null  int64  
 6   city_3            223399 non-null  int64  
 7   city_4            223399 non-null  int64  
 8   city_5            223399 non-null  int64  
 9   city_6            223399 non-null  int64  
 10  time              223399 non-null  int64  
 11  last_7_sim        223399 non-null  int64  
 12  last_6_sim        223399 non-null  int64  
 13  first_6_sim       223399 non-null  int64  
 14  three_pair_last   223399 non-null  int64  
 15  three_pair_first  223399 non-null  int64  
 16  just_two_num      22

In [70]:
area_code_bin_enc = ce.BinaryEncoder(cols=['area_code'])
df = area_code_bin_enc.fit_transform(df)

In [71]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,...,last_10000,million,area_code_0,area_code_1,area_code_2,area_code_3,area_code_4,area_code_5,first_three,last_four
0,9122533648,20500000.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,253.0,3648.0
1,9126394944,16200000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,639.0,4944.0
2,9126719709,12700000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,671.0,9709.0
3,9123594496,16300000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,359.0,4496.0
4,9122160397,27700000.0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,216.0,397.0


## 7. Save the Preprocessed Dataset

In [409]:
# Write the fully pre-processed frame (with the engineered features) to disk.
# NOTE(review): `index` is left at its default, so the row index — which is no
# longer contiguous after the earlier row drops — is written as the first CSV
# column; confirm downstream readers expect it, otherwise pass index=False.
# NOTE(review): the output name says "rond1000" while the input file is
# "rond10000.csv" — confirm this is intentional.
df.to_csv('preprocessed_with_features_rond1000.csv')