## 1. Import necessary libraries

In [351]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Read Dataset using `pd.read_csv`

In [352]:
df = pd.read_csv("rond1000.csv")

## 3. Review the Data

### 3.1 Display first 5 rows of Dataset

In [353]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 69 88 709,10250000,صفر,تهران,6 ثانیه
1,0912 672 0 962,11500000,صفر,تهران,6 ثانیه
2,0912 538 67 32,11150000,صفر,تهران,6 ثانیه
3,0912 794 7800,17600000,صفر,تهران,6 ثانیه
4,0912 216 91 46,27700000,صفر,تهران,6 ثانیه


### 3.2 Check the number of samples and features

In [354]:
df.shape

(40000, 5)

### 3.3 Check Data types and memory usage

In [355]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   phone_number  40000 non-null  object
 1   price         40000 non-null  object
 2   status        40000 non-null  object
 3   city          40000 non-null  object
 4   time          39687 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


### 3.4 Drop rows with Null values

In [356]:
df.dropna(inplace=True)

In [357]:
df.shape

(39687, 5)

### 3.5 Drop duplicated rows

In [358]:
df.duplicated().sum()

1454

In [359]:
df.drop_duplicates(inplace=True)

In [360]:
df.shape

(38233, 5)

## 4. Preprocess the Dataset

### 4.1 `price` column

In [361]:
def process_price(price):
    """Convert a raw price value to an ``int``, or ``np.nan`` if it is not numeric.

    Parameters
    ----------
    price : str or int or other
        Raw price. Strings may contain ``','`` thousands separators and
        surrounding whitespace. Integers are passed through. Any other type
        (``None``, float ``NaN``, ...) is treated as missing.

    Returns
    -------
    int or float
        The price as an ``int`` when the cleaned text consists only of digits,
        otherwise ``np.nan``.
    """
    # Already-numeric integers need no parsing (bool is excluded on purpose).
    if isinstance(price, (int, np.integer)) and not isinstance(price, bool):
        return int(price)
    # Anything that is not text (None, NaN, ...) cannot be parsed; the original
    # raised AttributeError here because it called .replace unconditionally.
    if not isinstance(price, str):
        return np.nan
    digits = price.replace(',', '').strip()
    if digits.isdigit():
        return int(digits)
    return np.nan

In [362]:
df.price = df.price.apply(process_price)

In [363]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 69 88 709,10250000.0,صفر,تهران,6 ثانیه
1,0912 672 0 962,11500000.0,صفر,تهران,6 ثانیه
2,0912 538 67 32,11150000.0,صفر,تهران,6 ثانیه
3,0912 794 7800,17600000.0,صفر,تهران,6 ثانیه
4,0912 216 91 46,27700000.0,صفر,تهران,6 ثانیه


In [364]:
#df.isnull().sum()

In [365]:
#df.dropna(inplace=True)

In [366]:
#df.shape

### 4.2 `status` column

In [367]:
status_values = df.status.unique()

In [368]:
status_values

array(['صفر', 'کارکرده', 'در حد صفر'], dtype=object)

In [369]:
status_values[1] , status_values[2] = status_values[2] , status_values[1]

In [370]:
# Ordinal codes are pinned to the status text itself (0, 1, 2 as in the mapping
# shown below) instead of relying on the order in which `unique()` happened to
# return the values. Any status not listed here keeps the old behaviour and is
# numbered after the known ones, in order of appearance.
known_status_order = ['صفر', 'در حد صفر', 'کارکرده']
status_order = known_status_order + [
    status for status in status_values if status not in known_status_order
]
status_map = {status: index for index, status in enumerate(status_order)}

In [371]:
status_map

{'صفر': 0, 'در حد صفر': 1, 'کارکرده': 2}

In [372]:
df.status = df.status.map(status_map)

In [373]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 69 88 709,10250000.0,0,تهران,6 ثانیه
1,0912 672 0 962,11500000.0,0,تهران,6 ثانیه
2,0912 538 67 32,11150000.0,0,تهران,6 ثانیه
3,0912 794 7800,17600000.0,0,تهران,6 ثانیه
4,0912 216 91 46,27700000.0,0,تهران,6 ثانیه


### 4.3 `city` column

#### 4.3.1 Check city names and their size

In [374]:
cities = df.city.unique()

In [375]:
len(cities)

55

In [376]:
cities

array(['تهران', 'گرگان', 'آمل', 'شمیرانات', 'اصفهان', 'شيراز', 'گناوه',
       'جهرم', 'رشت', 'کرج', 'قم', 'يزد', 'کرمان', 'اهواز', 'کرمانشاه',
       'دماوند', 'همدان', 'اردبيل', 'مشهد', 'شهرضا', 'بابلسر', 'قزوين',
       'کاشان', 'سمنان', 'ميانه', 'بندر عباس', 'ساری', 'اراک', 'تبريز',
       'گراش', 'خنج', 'لارستان', 'بهارستان', 'ورامين', 'دشتستان',
       'پيرانشهر', 'بابل', 'اسلام شهر', 'زنجان', 'نيشابور', 'دزفول',
       'فردیس', 'رفسنجان', 'بوشهر', 'شهرکرد', 'اليگودرز', 'پارس آباد',
       'ری', 'آبادان', 'شهريار', 'اروميه', 'گنبد کاووس', 'ابهر', 'گناباد',
       'بندر ماهشهر'], dtype=object)

#### 4.3.2 Encode city names using `category_encoders.BinaryEncoder` 

In [377]:
import category_encoders as ce

In [378]:
binary_encoder = ce.BinaryEncoder(cols=['city'])
df = binary_encoder.fit_transform(df)

In [379]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,time
0,0912 69 88 709,10250000.0,0,0,0,0,0,0,1,6 ثانیه
1,0912 672 0 962,11500000.0,0,0,0,0,0,0,1,6 ثانیه
2,0912 538 67 32,11150000.0,0,0,0,0,0,0,1,6 ثانیه
3,0912 794 7800,17600000.0,0,0,0,0,0,0,1,6 ثانیه
4,0912 216 91 46,27700000.0,0,0,0,0,0,0,1,6 ثانیه


### 4.4 `time` column

In [380]:
times = df.time

In [381]:
times

0        6 ثانیه
1        6 ثانیه
2        6 ثانیه
3        6 ثانیه
4        6 ثانیه
          ...   
39995    5 دقیقه
39996    5 دقیقه
39997    5 دقیقه
39998    5 دقیقه
39999    5 دقیقه
Name: time, Length: 38233, dtype: object

#### 4.4.1 Define a `regex` pattern that splits each value into its number and its unit

In [382]:
import re
pattern = r'(\d+) ([^\d]+)'

In [383]:
res = [re.findall(pattern , text)[0] for text in times]

In [384]:
res[:5]

[('6', 'ثانیه'),
 ('6', 'ثانیه'),
 ('6', 'ثانیه'),
 ('6', 'ثانیه'),
 ('6', 'ثانیه')]

#### 4.4.2 Extract time units

In [385]:
# Distinct unit strings: the second element of every (number, unit) tuple in `res`.
time_units = {parsed[1] for parsed in res}

In [386]:
time_units = list(time_units)
time_units

['ساعت', 'ثانیه', 'روز پیش', 'دقیقه']

In [387]:
time_units[0]

'ساعت'

#### 4.4.3 Convert all `time_units` to seconds

In [388]:
# Seconds per unit, keyed by the unit text itself.
# `time_units` is built from a set, and the iteration order of a set of strings
# is not stable between kernel sessions, so positional lookups such as
# `time_units[3]` could silently pair a unit with the wrong multiplier after a
# restart. The values are the same ones the positional version produced for the
# order shown above.
time_units_map = {
    'ثانیه': 1,
    'دقیقه': 60,
    'ساعت': 60 * 60,
    'روز پیش': 60 * 60 * 24,
}

In [389]:
preProcessed_time = [int(unit[0]) * time_units_map[unit[1]] for unit in res]

In [390]:
preProcessed_time[:5]

[6, 6, 6, 6, 6]

In [391]:
df.time = preProcessed_time

In [392]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,time
0,0912 69 88 709,10250000.0,0,0,0,0,0,0,1,6
1,0912 672 0 962,11500000.0,0,0,0,0,0,0,1,6
2,0912 538 67 32,11150000.0,0,0,0,0,0,0,1,6
3,0912 794 7800,17600000.0,0,0,0,0,0,0,1,6
4,0912 216 91 46,27700000.0,0,0,0,0,0,0,1,6


### 4.5 `phone_number` column

In [393]:
phone_numbers = df.phone_number

In [394]:
def preprocess_phone(num):
    """Normalise a raw phone number to its 10-character form.

    Spaces are removed and a single leading ``'0'`` is dropped. The result is
    returned only when exactly 10 characters remain.

    Parameters
    ----------
    num : str or other
        Raw phone number text. Non-string values are treated as missing.

    Returns
    -------
    str or float
        The 10-character number, or ``np.nan`` when the input is not a string
        or does not have 10 characters after cleaning.
    """
    if not isinstance(num, str):
        return np.nan
    num = num.replace(" ", "")
    # startswith is safe on an empty string, whereas num[0] raised IndexError
    # for empty / whitespace-only input.
    if num.startswith('0'):
        num = num[1:]
    if len(num) == 10:
        return num
    return np.nan

In [395]:
phone_numbers = phone_numbers.apply(preprocess_phone)

In [396]:
df.phone_number = phone_numbers

## 5. Drop rows with `NaN` values

In [397]:
df.isnull().sum()

phone_number      0
price           431
status            0
city_0            0
city_1            0
city_2            0
city_3            0
city_4            0
city_5            0
time              0
dtype: int64

In [398]:
df.dropna(inplace=True)

In [399]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37802 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   phone_number  37802 non-null  object 
 1   price         37802 non-null  float64
 2   status        37802 non-null  int64  
 3   city_0        37802 non-null  int64  
 4   city_1        37802 non-null  int64  
 5   city_2        37802 non-null  int64  
 6   city_3        37802 non-null  int64  
 7   city_4        37802 non-null  int64  
 8   city_5        37802 non-null  int64  
 9   time          37802 non-null  int64  
dtypes: float64(1), int64(8), object(1)
memory usage: 3.2+ MB


## 6. Feature Extraction

In [400]:
# Distinct 3-character prefixes of the cleaned phone numbers.
area_code = {number[:3] for number in df.phone_number}
area_code_lst = list(area_code)

In [401]:
len(area_code_lst)

32

In [402]:
def number_features(df, area_codes=None):
    """Add pattern-based indicator columns for every phone number to ``df`` in place.

    Each value of ``df['phone_number']`` is indexed up to position 9, so it must
    be a string of at least 10 characters. Characters 0-2 are treated as the
    area code; the pattern checks look at characters 3-9.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``phone_number`` column. It is modified in place: thirty
        0/1 indicator columns are appended, followed by ``area_code``,
        ``first_three`` and ``last_four``.
    area_codes : iterable of str, optional
        Prefixes searched for inside characters 3-9 when computing
        ``area_code_repeat``. When omitted, the distinct 3-character prefixes
        of ``df['phone_number']`` are used, so the function does not depend on
        a module-level list having been built by an earlier cell.

    Returns
    -------
    None
    """
    numbers = df['phone_number']

    if area_codes is None:
        area_codes = {num[:3] for num in numbers}
    area_codes = list(area_codes)

    # Compiled once; each pattern is applied to a slice of every number.
    hundred_pair = re.compile(r'([1-9]00){2}')
    double_ten = re.compile(r'([1-9]0){2,3}')
    repeated_digit = re.compile(r'([0-9])\1{1}')

    def allCharactersSame(s):
        """True when ``s`` is empty or consists of one repeated character."""
        return len(set(s)) <= 1

    def check_two_num(s):
        """True when ``s`` is made of exactly two distinct characters."""
        return len(set(s)) == 2

    def check_area_code(s):
        """True when any of the area codes occurs inside ``s``."""
        return any(code in s for code in area_codes)

    def check_middle_3_condition(num):
        """True when a run of exactly three equal characters sits at 4-6, 5-7 or 6-8."""
        return ((allCharactersSame(num[6:9]) and num[5] != num[6] and num[8] != num[9]) or
                (allCharactersSame(num[5:8]) and num[4] != num[5] and num[7] != num[8]) or
                (allCharactersSame(num[4:7]) and num[3] != num[4] and num[6] != num[7]))

    def flags(predicate):
        """Evaluate ``predicate`` on every phone number and return a list of 0/1."""
        return [1 if predicate(num) else 0 for num in numbers]

    # last 7 characters all the same
    df['last_7_sim'] = flags(lambda num: allCharactersSame(num[3:]))

    # last 6 characters all the same (and different from the one before them)
    df['last_6_sim'] = flags(lambda num: allCharactersSame(num[4:]) and num[3] != num[4])

    # characters 3-8 all the same (and different from the last one)
    df['first_6_sim'] = flags(lambda num: allCharactersSame(num[3:9]) and num[8] != num[9])

    # the same pair three times at the end (positions 4-9)
    df['three_pair_last'] = flags(
        lambda num: (num[4:6] == num[6:8]) and (num[4:6] == num[8:10]))

    # the same pair three times at the start (positions 3-8)
    df['three_pair_first'] = flags(
        lambda num: (num[3:5] == num[5:7]) and (num[3:5] == num[7:9]))

    # only two distinct characters after the area code
    df['just_two_num'] = flags(lambda num: check_two_num(num[3:]))

    # last 5 characters all the same
    df['last_5_sim'] = flags(lambda num: allCharactersSame(num[5:]) and num[4] != num[5])

    # characters 3-7 all the same
    df['first_5_sim'] = flags(lambda num: allCharactersSame(num[3:8]) and num[7] != num[8])

    # double hundreds: exactly one match of the pattern in positions 4-9 or 3-8
    df['double_100'] = flags(
        lambda num: len(hundred_pair.findall(num[4:10])) == 1 or
        len(hundred_pair.findall(num[3:9])) == 1)

    # two identical triples like '532 532'
    df['two_triple_pair'] = flags(
        lambda num: (num[6:9] == num[3:6]) or (num[4:7] == num[7:10]))

    # first 1000 ('and' binds tighter than 'or'; the grouping is written out explicitly)
    df['first_1000'] = flags(
        lambda num: (num[4:6] == '00' and num[3] != '0') or
        (num[5:7] == '00' and num[4] != '0'))

    # last 1000
    df['last_1000'] = flags(
        lambda num: (num[7:9] == '00' and num[6] != '0') or (num[8:10] == '00'))

    # last 4 characters all the same
    df['last_4_sim'] = flags(lambda num: allCharactersSame(num[6:]) and num[5] != num[6])

    # characters 3-6 all the same
    df['first_4_sim'] = flags(lambda num: allCharactersSame(num[3:7]) and num[6] != num[7])

    # middle 5 characters (positions 4-8) all the same
    df['middle_5_sim'] = flags(
        lambda num: allCharactersSame(num[4:9]) and num[3] != num[4] and num[8] != num[9])

    # last double ten like '4090'
    df['last_double_10'] = flags(
        lambda num: len(double_ten.findall(num[4:])) == 1 and num[-1] == '0')

    # first double ten (same pattern, applied to positions 3-8)
    df['first_double_10'] = flags(
        lambda num: len(double_ten.findall(num[3:9])) == 1 and num[4] == '0')

    # last two pairs equal, made of two different characters
    df['last_two_pair'] = flags(lambda num: num[6:8] == num[8:10] and num[6] != num[7])

    # first two pairs equal, made of two different characters
    df['first_two_pair'] = flags(lambda num: num[3:5] == num[5:7] and num[3] != num[4])

    # last 3 characters all the same
    df['last_3_sim'] = flags(lambda num: allCharactersSame(num[7:]) and num[6] != num[7])

    # characters 3-5 all the same
    df['first_3_sim'] = flags(lambda num: allCharactersSame(num[3:6]) and num[5] != num[6])

    # an area code appears again after the area code
    df['area_code_repeat'] = flags(lambda num: check_area_code(num[3:]))

    # weighted phone number (the column name's spelling is kept because it is part of the output schema)
    df['wheighted_num'] = flags(lambda num: num[3:6] == num[7:10])

    # middle 4 characters (positions 4-7) all the same
    df['middle_4_sim'] = flags(
        lambda num: allCharactersSame(num[4:8]) and num[3] != num[4] and num[7] != num[8])

    # birthday number like '1345'
    df['birthday_num'] = flags(lambda num: (num[3:5] == '13') or (num[6:8] == '13'))

    # middle 3 characters all the same
    df['middle_3_sim'] = flags(check_middle_3_condition)

    # at least two doubled digits after the area code
    df['repeat_2_digits'] = flags(lambda num: len(repeated_digit.findall(num[3:])) >= 2)

    # first 10000
    df['first_10000'] = flags(
        lambda num: (num[3] != '0' and num[8] != '0') and
        ((num[4:7] == '000') or (num[5:8] == '000')))

    # last 10000
    df['last_10000'] = flags(
        lambda num: (num[6:9] == '000' and num[5] != '0') or (num[7:10] == '000'))

    # million
    df['million'] = flags(
        lambda num: num[3] != '0' and ((num[4:9] == '00000') or (num[5:10] == '00000')))

    # area code and the numeric value of the two remaining parts
    df['area_code'] = df.phone_number.str[:3]
    df['first_three'] = pd.to_numeric(df.phone_number.str[3:6], errors='coerce')
    df['last_four'] = pd.to_numeric(df.phone_number.str[6:], errors='coerce')


In [403]:
number_features(df)

In [404]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,time,...,middle_4_sim,birthday_num,middle_3_sim,repeat_2_digits,first_10000,last_10000,million,area_code,first_three,last_four
0,9126988709,10250000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,912,698.0,8709.0
1,9126720962,11500000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,912,672.0,962.0
2,9125386732,11150000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,912,538.0,6732.0
3,9127947800,17600000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,912,794.0,7800.0
4,9122169146,27700000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,912,216.0,9146.0


In [405]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37802 entries, 0 to 39999
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   phone_number      37802 non-null  object 
 1   price             37802 non-null  float64
 2   status            37802 non-null  int64  
 3   city_0            37802 non-null  int64  
 4   city_1            37802 non-null  int64  
 5   city_2            37802 non-null  int64  
 6   city_3            37802 non-null  int64  
 7   city_4            37802 non-null  int64  
 8   city_5            37802 non-null  int64  
 9   time              37802 non-null  int64  
 10  last_7_sim        37802 non-null  int64  
 11  last_6_sim        37802 non-null  int64  
 12  first_6_sim       37802 non-null  int64  
 13  three_pair_last   37802 non-null  int64  
 14  three_pair_first  37802 non-null  int64  
 15  just_two_num      37802 non-null  int64  
 16  last_5_sim        37802 non-null  int64 

In [406]:
df[df['phone_number']=='9197000005']

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,time,...,middle_4_sim,birthday_num,middle_3_sim,repeat_2_digits,first_10000,last_10000,million,area_code,first_three,last_four
7417,9197000005,380000000.0,0,0,0,0,0,0,1,660,...,0,0,0,1,0,0,1,919,700.0,5.0


In [407]:
area_code_bin_enc = ce.BinaryEncoder(cols=['area_code'])
df = area_code_bin_enc.fit_transform(df)

In [408]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,time,...,last_10000,million,area_code_0,area_code_1,area_code_2,area_code_3,area_code_4,area_code_5,first_three,last_four
0,9126988709,10250000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,1,698.0,8709.0
1,9126720962,11500000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,1,672.0,962.0
2,9125386732,11150000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,1,538.0,6732.0
3,9127947800,17600000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,1,794.0,7800.0
4,9122169146,27700000.0,0,0,0,0,0,0,1,6,...,0,0,0,0,0,0,0,1,216.0,9146.0


## 7. Save the Preprocessed Dataset

In [409]:
df.to_csv('preprocessed_with_features_rond1000.csv')